imdbs/imdb-scraper.go

122 lines
2.5 KiB
Go

package imdbs
import (
"encoding/json"
"errors"
"fmt"
"regexp"
"strconv"
"strings"
"git.0x0001f346.de/andreas/useragents"
"github.com/gocolly/colly"
)
// imdbIDInQueryPattern matches an IMDb title ID ("tt" followed by seven or
// eight digits) anywhere inside a larger string. It is compiled once at
// package scope instead of on every call.
var imdbIDInQueryPattern = regexp.MustCompile(`tt\d{7,8}`)

// GetIMDbIDFromQuery tries to extract an IMDb ID from query.
//
// query may be any text, for example a full IMDb URL. The leftmost occurrence
// of "tt" followed by seven or eight digits is returned. If query contains no
// such sequence, the returned string is empty and the error is non-nil.
func GetIMDbIDFromQuery(query string) (string, error) {
	// The pattern cannot match the empty string, so "" means "no match".
	if id := imdbIDInQueryPattern.FindString(query); id != "" {
		return id, nil
	}
	return "", errors.New("no IMDb ID in query")
}
// validIMDbIDPattern matches a string that consists of nothing but an IMDb
// title ID: "tt" followed by seven or eight digits. It is compiled once at
// package scope instead of on every call.
var validIMDbIDPattern = regexp.MustCompile(`^tt\d{7,8}$`)

// IsValidIMDbID returns true if s is a valid IMDb ID, i.e. the whole string
// is "tt" followed by seven or eight digits with nothing before or after.
func IsValidIMDbID(s string) bool {
	return validIMDbIDPattern.MatchString(s)
}
// Scrap tries to scrape the IMDb title page for id and turn the JSON-LD
// metadata embedded in the page head into an IMDbEntry.
//
// id must satisfy IsValidIMDbID. An error is returned when id is malformed,
// when the page request fails, or when no "application/ld+json" script in the
// page head can be decoded. On error the returned IMDbEntry is the zero value.
func Scrap(id string) (IMDbEntry, error) {
	if !IsValidIMDbID(id) {
		return IMDbEntry{}, errors.New("not a valid IMDb ID")
	}

	var (
		year      int64
		foundJSON bool
		j         IMDbJSON
	)

	c := colly.NewCollector(
		colly.MaxDepth(1),
		colly.AllowedDomains("www.imdb.com"),
		colly.UserAgent(useragents.GetRandomUseragent()),
	)

	c.OnRequest(func(r *colly.Request) {
		// Ask for German content first, falling back to English.
		r.Headers.Set("Accept-Language", "de,en-US;q=0.7,en;q=0.3")
	})

	c.OnHTML("head", func(e *colly.HTMLElement) {
		e.ForEach("script", func(_ int, script *colly.HTMLElement) {
			// Keep the first block that decodes; a later, undecodable block
			// must not invalidate an earlier success.
			if foundJSON || script.Attr("type") != "application/ld+json" {
				return
			}
			// Decode into a fresh value so a failed attempt cannot leave
			// partially filled fields behind in j.
			var parsed IMDbJSON
			if err := json.Unmarshal([]byte(script.Text), &parsed); err == nil {
				j = parsed
				foundJSON = true
			}
		})
		e.ForEach("title", func(_ int, title *colly.HTMLElement) {
			year = extractYearFromIMDbTitle(title.Text)
		})
	})

	// Surface request failures instead of hiding them behind the generic
	// "could not scrape" error below.
	if err := c.Visit(buildScrapingURL(id)); err != nil {
		return IMDbEntry{}, fmt.Errorf("could not scrape IMDb: %w", err)
	}
	if !foundJSON {
		return IMDbEntry{}, errors.New("could not scrape IMDb")
	}

	return j.TransformIntoIMDbEntry(id, year), nil
}
// buildScrapingURL returns the URL of the IMDb title page for id.
// The id is inserted verbatim; no validation or escaping happens here.
func buildScrapingURL(id string) string {
	const titlePageFormat = "https://www.imdb.com/title/%s/"
	pageURL := fmt.Sprintf(titlePageFormat, id)
	return pageURL
}
// imdbRuntimePattern matches a duration of the form "PT<h>H<m>M" in which both
// the hour and the minute component are optional. Each component may have up
// to four digits, so minute-only values such as "PT142M" and runtimes of ten
// hours or more are accepted. The digit count is bounded to keep the
// hours*60+minutes arithmetic far away from int64 overflow.
var imdbRuntimePattern = regexp.MustCompile(`^PT(\d{1,4}H)?(\d{1,4}M)?$`)

// convertIMDbRuntimeIntoMinutes converts a runtime string such as "PT2H22M",
// "PT2H", "PT45M" or "PT142M" into a total number of minutes.
//
// It returns 0 when s does not have the expected form, and also for the bare
// prefix "PT", which carries no components.
func convertIMDbRuntimeIntoMinutes(s string) int64 {
	match := imdbRuntimePattern.FindStringSubmatch(s)
	if match == nil {
		return 0
	}
	hours := parseIMDbRuntimeComponent(match[1], "H")
	minutes := parseIMDbRuntimeComponent(match[2], "M")
	return hours*60 + minutes
}

// parseIMDbRuntimeComponent strips the trailing unit letter from component
// and parses what is left as a base-10 integer. An absent (empty) or
// unparseable component counts as 0.
func parseIMDbRuntimeComponent(component, unit string) int64 {
	if component == "" {
		return 0
	}
	n, err := strconv.ParseInt(strings.TrimSuffix(component, unit), 10, 64)
	if err != nil {
		return 0
	}
	return n
}
// imdbTitleYearPattern captures the first run of four digits that follows an
// opening parenthesis and is eventually followed by a closing parenthesis,
// e.g. the "1994" in "The Shawshank Redemption (1994) - IMDb". It is compiled
// once at package scope instead of on every call.
var imdbTitleYearPattern = regexp.MustCompile(`\(.*?(\d\d\d\d).*\)`)

// extractYearFromIMDbTitle returns the year found inside parentheses in the
// page title s. For a parenthesised range such as "(TV Series 2011–2019)" the
// first four-digit number is used. It returns 0 when s contains no such year.
func extractYearFromIMDbTitle(s string) int64 {
	match := imdbTitleYearPattern.FindStringSubmatch(s)
	if match == nil {
		return 0
	}
	year, err := strconv.ParseInt(match[1], 10, 64)
	if err != nil {
		return 0
	}
	return year
}