package imdbs

import (
	"encoding/json"
	"errors"
	"fmt"
	"regexp"
	"strconv"
	"strings"

	"git.0x0001f346.de/andreas/useragents"
	"github.com/gocolly/colly"
)

// Regular expressions used by this file. They are compiled once at package
// initialisation instead of on every call.
var (
	// imdbIDInTextPattern finds an IMDb ID ("tt" followed by 7 or 8 digits)
	// anywhere inside a longer string.
	imdbIDInTextPattern = regexp.MustCompile(`tt\d{7,8}`)

	// imdbIDExactPattern matches a string that consists of nothing but an
	// IMDb ID.
	imdbIDExactPattern = regexp.MustCompile(`^tt\d{7,8}$`)

	// imdbRuntimePattern matches durations of the form "PT<h>H<m>M" where
	// both the hour and the minute component are optional. Each component
	// is limited to four digits so that hours*60+minutes cannot overflow.
	imdbRuntimePattern = regexp.MustCompile(`^PT(\d{1,4}H)?(\d{1,4}M)?$`)

	// imdbTitleYearPattern captures the first four-digit number that
	// appears inside a parenthesised part of a page title.
	imdbTitleYearPattern = regexp.MustCompile(`\(.*?(\d\d\d\d).*\)`)
)

// GetIMDbIDFromQuery tries to extract an IMDb ID from query.
//
// It returns the first substring of query that looks like an IMDb ID, or an
// error if query contains none.
func GetIMDbIDFromQuery(query string) (string, error) {
	// The pattern cannot match the empty string, so "" means "no match".
	if id := imdbIDInTextPattern.FindString(query); id != "" {
		return id, nil
	}

	return "", errors.New("no IMDb ID in query")
}

// IsValidIMDbID returns true if s is a valid IMDb ID, i.e. "tt" followed by
// exactly 7 or 8 digits and nothing else.
func IsValidIMDbID(s string) bool {
	return imdbIDExactPattern.MatchString(s)
}

// Scrap tries to scrape an entry from the IMDb.
//
// id must satisfy IsValidIMDbID. The title page for id is fetched, the first
// parsable "application/ld+json" script in the document head is decoded into
// an IMDbJSON, and the year is taken from the page title (0 if the title
// contains none). The result is converted with TransformIntoIMDbEntry.
//
// An error is returned if id is invalid, if the page cannot be fetched, or if
// the page contains no parsable JSON-LD block.
func Scrap(id string) (IMDbEntry, error) {
	if !IsValidIMDbID(id) {
		return IMDbEntry{}, errors.New("not a valid IMDb ID")
	}

	var year int64
	foundJSON := false
	j := IMDbJSON{}

	c := colly.NewCollector(
		colly.MaxDepth(1),
		colly.AllowedDomains("www.imdb.com"),
		colly.UserAgent(useragents.GetRandomUseragent()),
	)

	c.OnRequest(func(r *colly.Request) {
		// Ask for German content first, with English as fallback.
		r.Headers.Set("Accept-Language", "de,en-US;q=0.7,en;q=0.3")
	})

	c.OnHTML("head", func(e *colly.HTMLElement) {
		e.ForEach("script", func(_ int, script *colly.HTMLElement) {
			// Keep the first block that parses. Later blocks must neither
			// overwrite the decoded data nor reset the success flag.
			if foundJSON || script.Attr("type") != "application/ld+json" {
				return
			}

			// Decode into a fresh value so a failed attempt cannot leave
			// partially filled fields behind in j.
			candidate := IMDbJSON{}
			if err := json.Unmarshal([]byte(script.Text), &candidate); err == nil {
				j = candidate
				foundJSON = true
			}
		})

		e.ForEach("title", func(_ int, title *colly.HTMLElement) {
			year = extractYearFromIMDbTitle(title.Text)
		})
	})

	// Surface fetch failures instead of hiding them behind the generic
	// "no JSON found" error below.
	if err := c.Visit(buildScrapingURL(id)); err != nil {
		return IMDbEntry{}, fmt.Errorf("could not scrape IMDb: %w", err)
	}

	if !foundJSON {
		return IMDbEntry{}, errors.New("could not scrape IMDb")
	}

	return j.TransformIntoIMDbEntry(id, year), nil
}

// buildScrapingURL returns the URL of the IMDb title page for id.
func buildScrapingURL(id string) string {
	return fmt.Sprintf("https://www.imdb.com/title/%s/", id)
}

// convertIMDbRuntimeIntoMinutes converts a duration such as "PT2H22M",
// "PT2H" or "PT142M" into a number of minutes. Strings that do not have this
// form yield 0.
func convertIMDbRuntimeIntoMinutes(s string) int64 {
	match := imdbRuntimePattern.FindStringSubmatch(s)
	if match == nil {
		return 0
	}

	hours := parseIMDbRuntimePart(match[1], "H")
	minutes := parseIMDbRuntimePart(match[2], "M")

	return hours*60 + minutes
}

// parseIMDbRuntimePart parses one duration component such as "2H" or "22M"
// by stripping the trailing unit and reading the rest as a decimal number.
// An empty or unparsable component counts as 0.
func parseIMDbRuntimePart(part, unit string) int64 {
	if part == "" {
		return 0
	}

	n, err := strconv.ParseInt(strings.TrimSuffix(part, unit), 10, 64)
	if err != nil {
		return 0
	}

	return n
}

// extractYearFromIMDbTitle returns the first four-digit number found inside
// parentheses in s, e.g. 2010 for "Inception (2010) - IMDb". It returns 0 if
// s contains no such number.
func extractYearFromIMDbTitle(s string) int64 {
	match := imdbTitleYearPattern.FindStringSubmatch(s)
	if match == nil {
		return 0
	}

	year, err := strconv.ParseInt(match[1], 10, 64)
	if err != nil {
		return 0
	}

	return year
}