// Package imdbs extracts IMDb title IDs from user queries and scrapes basic
// title metadata from the IMDb website.
package imdbs

import (
	"encoding/json"
	"errors"
	"fmt"
	"net/url"
	"strings"
	"unicode"

	"git.0x0001f346.de/andreas/utils"
	"github.com/gocolly/colly"
)

// GetIMDbIDFromQuery tries to extract an IMDb ID from query.
//
// query may either be a bare IMDb ID or a URL pointing at www.imdb.com or
// m.imdb.com whose path contains an IMDb ID as one of its segments.
// Surrounding whitespace is ignored, and the host is compared
// case-insensitively and without an optional port. An error is returned if no
// ID can be found.
func GetIMDbIDFromQuery(query string) (string, error) {
	errNoID := errors.New("no IMDb ID in query")

	query = strings.TrimSpace(query)
	if IsValidIMDbID(query) {
		return query, nil
	}

	u, err := url.Parse(query)
	if err != nil || u.Scheme == "" || u.Host == "" {
		return "", errNoID
	}

	// Hostname() strips a port; host names are case-insensitive.
	switch strings.ToLower(u.Hostname()) {
	case "www.imdb.com", "m.imdb.com":
	default:
		return "", errNoID
	}

	for _, segment := range strings.Split(u.Path, "/") {
		if IsValidIMDbID(segment) {
			return segment, nil
		}
	}

	return "", errNoID
}

// IsValidIMDbID returns true, if s is a valid IMDb id.
//
// A valid ID is the prefix "tt" followed by seven or eight ASCII digits.
func IsValidIMDbID(s string) bool {
	if len(s) < 9 || len(s) > 10 {
		return false
	}
	if !strings.HasPrefix(s, "tt") {
		return false
	}

	for _, r := range s[2:] {
		// unicode.IsDigit alone also accepts non-ASCII digits (for example
		// Arabic-Indic ones), which never occur in IMDb IDs.
		if r > unicode.MaxASCII || !unicode.IsDigit(r) {
			return false
		}
	}

	return true
}

// Scrap tries to scrap an entry from the IMDb.
//
// It fetches the title page for id, reads the JSON-LD metadata embedded in the
// page head and the release year from the page title, and converts both into
// an IMDbEntry. An error is returned if id is not a valid IMDb ID, if the page
// cannot be fetched, or if no JSON-LD block could be decoded.
func Scrap(id string) (IMDbEntry, error) {
	if !IsValidIMDbID(id) {
		return IMDbEntry{}, errors.New("not a valid IMDb ID")
	}

	var (
		year      int64
		foundJSON bool
		j         IMDbJSON
	)

	c := colly.NewCollector(colly.MaxDepth(1), colly.AllowedDomains("www.imdb.com"))

	c.OnHTML("head", func(e *colly.HTMLElement) {
		// Scripts are handled before the title, because the year extraction
		// needs the title name taken from the decoded JSON.
		e.ForEach("script", func(_ int, script *colly.HTMLElement) {
			if script.Attr("type") != "application/ld+json" {
				return
			}

			// Decode into a fresh value so that a script which fails to parse
			// can neither leave j half-populated nor discard an earlier
			// successful result.
			var candidate IMDbJSON
			if err := json.Unmarshal([]byte(script.Text), &candidate); err != nil {
				return
			}
			j = candidate
			foundJSON = true
		})

		e.ForEach("title", func(_ int, title *colly.HTMLElement) {
			name := j.Name
			if j.AlternateName != "" {
				name = j.AlternateName
			}
			year = extractYearFromIMDbTitle(title.Text, name)
		})
	})

	// NOTE(review): relies on the collector running synchronously (no async
	// option is set above), so the callbacks have finished once Visit returns.
	if err := c.Visit(buildScrapingURL(id)); err != nil {
		return IMDbEntry{}, fmt.Errorf("could not scrape IMDb: %w", err)
	}

	if !foundJSON {
		return IMDbEntry{}, errors.New("could not scrape IMDb")
	}

	return j.TransformIntoIMDbEntry(id, year), nil
}

// buildScrapingURL returns the URL of the IMDb title page for id.
func buildScrapingURL(id string) string {
	return fmt.Sprintf("https://www.imdb.com/title/%s/", id)
}

// convertIMDbRuntimeIntoMinutes converts a runtime string of the form
// "PT<hours>H<minutes>M" (either part may be missing) into minutes.
//
// It returns 0 if s does not contain "PT". Number parsing is delegated to
// utils.ConvertStringToIntOrZeroOnError, which presumably yields 0 for
// non-numeric input.
func convertIMDbRuntimeIntoMinutes(s string) int64 {
	if !strings.Contains(s, "PT") {
		return 0
	}
	s = strings.Replace(s, "PT", "", -1)

	hasHours := strings.Contains(s, "H")
	hasMinutes := strings.Contains(s, "M")

	switch {
	case hasHours && hasMinutes:
		// "2H22M" -> ["2", "22"]
		parts := strings.Split(strings.Replace(s, "M", "", -1), "H")
		hours := utils.ConvertStringToIntOrZeroOnError(parts[0])
		minutes := utils.ConvertStringToIntOrZeroOnError(parts[1])
		return hours*60 + minutes
	case hasHours:
		return utils.ConvertStringToIntOrZeroOnError(strings.Replace(s, "H", "", -1)) * 60
	default:
		return utils.ConvertStringToIntOrZeroOnError(strings.Replace(s, "M", "", -1))
	}
}

// extractYearFromIMDbTitle extracts the release year from the text s of an
// IMDb page title. title is the name of the movie or series; it is removed
// from s first so that parentheses inside the name are not mistaken for the
// year. For series only the first year of the run is used.
//
// It returns 0 if no year can be found.
func extractYearFromIMDbTitle(s string, title string) int64 {
	s = sanitizeIMDbTitleForYearExtraction(s, title)
	if isIMDbTitleOfSeries(s) {
		s = prepareChunckOfIMDbTitleOfSeriesForYearExtraction(s)
	}

	for _, chunk := range strings.Split(s, " ") {
		// Only a chunk like "(1994)" can carry the year.
		if !strings.Contains(chunk, "(") || !strings.Contains(chunk, ")") {
			continue
		}

		digits := strings.Replace(chunk, "(", "", -1)
		digits = strings.Replace(digits, ")", "", -1)
		if year := utils.ConvertStringToIntOrZeroOnError(digits); year != 0 {
			return year
		}
	}

	return 0
}

// isIMDbTitleOfSeries reports whether title carries the German or English
// series marker that precedes the year in an IMDb page title.
func isIMDbTitleOfSeries(title string) bool {
	return strings.Contains(title, "(Fernsehserie ") ||
		strings.Contains(title, "(TV Series ")
}

// prepareChunckOfIMDbTitleOfSeriesForYearExtraction strips the series marker
// from s and, if s contains exactly one en dash (as in "(2008–2013)"), returns
// everything before the dash followed by a closing parenthesis, for example
// "(2008)". Otherwise s is returned with only the marker removed.
func prepareChunckOfIMDbTitleOfSeriesForYearExtraction(s string) string {
	s = strings.Replace(s, "(Fernsehserie ", "(", -1)
	s = strings.Replace(s, "(TV Series ", "(", -1)

	parts := strings.Split(s, "–")
	if len(parts) != 2 {
		return s
	}

	return parts[0] + ")"
}

// sanitizeIMDbTitleForYearExtraction removes the title name and the
// "Fernsehfilm"/"Video" markers from s and closes up " )" so that the year
// remains as a plain "(YYYY)" chunk.
func sanitizeIMDbTitleForYearExtraction(s string, title string) string {
	// With an empty title the pattern would be a single space, which would
	// strip every space from s and make splitting into chunks impossible.
	if title != "" {
		s = strings.Replace(s, title+" ", "", -1)
	}
	s = strings.Replace(s, "(Fernsehfilm ", "(", -1)
	s = strings.Replace(s, "(Video ", "(", -1)
	s = strings.Replace(s, " )", ")", -1)

	return s
}