189 lines
4.0 KiB
Go
189 lines
4.0 KiB
Go
|
package imdbs
|
|||
|
|
|||
|
import (
|
|||
|
"encoding/json"
|
|||
|
"errors"
|
|||
|
"fmt"
|
|||
|
"net/url"
|
|||
|
"strconv"
|
|||
|
"strings"
|
|||
|
"unicode"
|
|||
|
|
|||
|
"github.com/gocolly/colly"
|
|||
|
)
|
|||
|
|
|||
|
// GetIMDbIDFromQuery tries to extract an IMDb ID from query
|
|||
|
func GetIMDbIDFromQuery(query string) (string, error) {
|
|||
|
if IsValidIMDbID(query) {
|
|||
|
return query, nil
|
|||
|
}
|
|||
|
|
|||
|
u, err := url.Parse(query)
|
|||
|
if err != nil || u.Scheme == "" || u.Host == "" {
|
|||
|
return "", errors.New("no IMDb ID in query")
|
|||
|
}
|
|||
|
|
|||
|
if u.Host != "www.imdb.com" && u.Host != "m.imdb.com" {
|
|||
|
return "", errors.New("no IMDb ID in query")
|
|||
|
}
|
|||
|
|
|||
|
for _, s := range strings.Split(u.Path, "/") {
|
|||
|
if IsValidIMDbID(s) {
|
|||
|
return s, nil
|
|||
|
}
|
|||
|
}
|
|||
|
|
|||
|
return "", errors.New("no IMDb ID in query")
|
|||
|
}
|
|||
|
|
|||
|
// IsValidIMDbID reports whether s is a valid IMDb id.
//
// A valid id is the prefix "tt" followed only by ASCII digits, with a total
// length of 9 or 10 characters (that is, 7 or 8 digits).
func IsValidIMDbID(s string) bool {
	if len(s) < 9 || len(s) > 10 {
		return false
	}

	if s[:2] != "tt" {
		return false
	}

	for _, r := range s[2:] {
		// Only ASCII digits are allowed. unicode.IsDigit alone would also
		// accept multi-byte digits (e.g. full-width or Arabic-Indic), which
		// would make the byte-length check above count fewer real digits.
		if r > unicode.MaxASCII || !unicode.IsDigit(r) {
			return false
		}
	}

	return true
}
|
|||
|
|
|||
|
// Scrap tries to scrap an entry from the IMDb
|
|||
|
func Scrap(id string) (IMDbEntry, error) {
|
|||
|
if !IsValidIMDbID(id) {
|
|||
|
return IMDbEntry{}, errors.New("not a valid IMDb ID")
|
|||
|
}
|
|||
|
|
|||
|
var year int64 = 0
|
|||
|
foundJSON := false
|
|||
|
j := IMDbJSON{}
|
|||
|
c := colly.NewCollector(colly.MaxDepth(1), colly.AllowedDomains("www.imdb.com"))
|
|||
|
|
|||
|
c.OnHTML("head", func(e *colly.HTMLElement) {
|
|||
|
e.ForEach("script", func(_ int, script *colly.HTMLElement) {
|
|||
|
if script.Attr("type") == "application/ld+json" {
|
|||
|
err := json.Unmarshal([]byte(script.Text), &j)
|
|||
|
foundJSON = err == nil
|
|||
|
}
|
|||
|
})
|
|||
|
e.ForEach("title", func(_ int, title *colly.HTMLElement) {
|
|||
|
if j.AlternateName != "" {
|
|||
|
year = extractYearFromIMDbTitle(title.Text, j.AlternateName)
|
|||
|
return
|
|||
|
}
|
|||
|
year = extractYearFromIMDbTitle(title.Text, j.Name)
|
|||
|
})
|
|||
|
})
|
|||
|
|
|||
|
c.Visit(buildScrapingURL(id))
|
|||
|
|
|||
|
if !foundJSON {
|
|||
|
return IMDbEntry{}, errors.New("could not scrape IMDb")
|
|||
|
}
|
|||
|
|
|||
|
entry := j.TransformIntoIMDbEntry(id, year)
|
|||
|
|
|||
|
return entry, nil
|
|||
|
}
|
|||
|
|
|||
|
// buildScrapingURL returns the URL of the IMDb title page for id.
func buildScrapingURL(id string) string {
	const titlePageFormat = "https://www.imdb.com/title/%s/"

	return fmt.Sprintf(titlePageFormat, id)
}
|
|||
|
|
|||
|
// convertIMDbRuntimeIntoMinutes converts a runtime written as an ISO 8601
// style time duration (for example "PT2H22M") into whole minutes.
//
// After the "PT" prefix the string is read as a sequence of
// <number><designator> pairs: H adds number*60, M adds number, and S is
// accepted but ignored so that a seconds component does not discard the rest.
// It returns 0 when s does not start with "PT" or is otherwise malformed.
func convertIMDbRuntimeIntoMinutes(s string) int64 {
	if !strings.HasPrefix(s, "PT") {
		return 0
	}
	rest := s[len("PT"):]

	var minutes int64
	numStart := 0
	for i := 0; i < len(rest); i++ {
		c := rest[i]
		if c >= '0' && c <= '9' {
			continue
		}

		// c ends the current number; ParseInt also rejects an empty number
		// and values that do not fit into an int64.
		n, err := strconv.ParseInt(rest[numStart:i], 10, 64)
		if err != nil {
			return 0
		}

		switch c {
		case 'H':
			minutes += n * 60
		case 'M':
			minutes += n
		case 'S':
			// Seconds do not contribute to whole minutes.
		default:
			return 0
		}
		numStart = i + 1
	}

	// Trailing digits without a designator make the value malformed.
	if numStart != len(rest) {
		return 0
	}

	return minutes
}
|
|||
|
|
|||
|
// convertStringToIntOrZeroOnError parses s as a base-10 int64 and returns the
// parsed value, or 0 when s cannot be parsed.
func convertStringToIntOrZeroOnError(s string) int64 {
	if parsed, err := strconv.ParseInt(s, 10, 64); err == nil {
		return parsed
	}

	return 0
}
|
|||
|
|
|||
|
func extractYearFromIMDbTitle(s string, title string) int64 {
|
|||
|
s = sanitizeIMDbTitleForYearExtraction(s, title)
|
|||
|
|
|||
|
if isIMDbTitleOfSeries(s) {
|
|||
|
s = prepareChunckOfIMDbTitleOfSeriesForYearExtraction(s)
|
|||
|
}
|
|||
|
|
|||
|
for _, chunck := range strings.Split(s, " ") {
|
|||
|
if !strings.Contains(chunck, "(") || !strings.Contains(chunck, ")") {
|
|||
|
continue
|
|||
|
}
|
|||
|
|
|||
|
year := convertStringToIntOrZeroOnError(
|
|||
|
strings.Replace(
|
|||
|
strings.Replace(chunck, "(", "", -1),
|
|||
|
")",
|
|||
|
"",
|
|||
|
-1,
|
|||
|
),
|
|||
|
)
|
|||
|
if year != 0 {
|
|||
|
return year
|
|||
|
}
|
|||
|
}
|
|||
|
|
|||
|
return 0
|
|||
|
}
|
|||
|
|
|||
|
// isIMDbTitleOfSeries reports whether title contains one of the series
// markers "(Fernsehserie " or "(TV Series ".
func isIMDbTitleOfSeries(title string) bool {
	for _, marker := range []string{"(Fernsehserie ", "(TV Series "} {
		if strings.Contains(title, marker) {
			return true
		}
	}

	return false
}
|
|||
|
|
|||
|
// prepareChunckOfIMDbTitleOfSeriesForYearExtraction rewrites a series title
// so that its first year stands alone in parentheses.
//
// The markers "(Fernsehserie " and "(TV Series " are reduced to "(". If the
// result contains exactly one "–", everything from that dash onwards is
// replaced by ")"; otherwise the result is returned as is.
func prepareChunckOfIMDbTitleOfSeriesForYearExtraction(s string) string {
	// The markers are replaced one after the other, in this order.
	for _, marker := range []string{"(Fernsehserie ", "(TV Series "} {
		s = strings.Replace(s, marker, "(", -1)
	}

	const rangeDash = "–"
	if strings.Count(s, rangeDash) != 1 {
		return s
	}

	head := s[:strings.Index(s, rangeDash)]

	return fmt.Sprintf("%s)", head)
}
|
|||
|
|
|||
|
// sanitizeIMDbTitleForYearExtraction strips parts of the page title text s
// that get in the way of finding the year.
//
// Every occurrence of title followed by a space is removed, the markers
// "(Fernsehfilm " and "(Video " are reduced to "(", and " )" is collapsed to
// ")". An empty title is skipped: removing "" plus a space would delete every
// space in s and glue the year to its neighbouring words.
func sanitizeIMDbTitleForYearExtraction(s string, title string) string {
	if title != "" {
		s = strings.Replace(s, title+" ", "", -1)
	}

	s = strings.Replace(s, "(Fernsehfilm ", "(", -1)
	s = strings.Replace(s, "(Video ", "(", -1)
	s = strings.Replace(s, " )", ")", -1)

	return s
}
|