2023-10-01 09:00:33 +02:00
|
|
|
package imdbs
|
|
|
|
|
|
|
|
import (
|
|
|
|
"encoding/json"
|
|
|
|
"errors"
|
|
|
|
"fmt"
|
2024-01-23 13:08:55 +01:00
|
|
|
"regexp"
|
|
|
|
"strconv"
|
2023-10-01 09:00:33 +02:00
|
|
|
"strings"
|
|
|
|
|
2023-10-27 16:05:27 +02:00
|
|
|
"git.0x0001f346.de/andreas/useragents"
|
2023-10-01 09:00:33 +02:00
|
|
|
"github.com/gocolly/colly"
|
|
|
|
)
|
|
|
|
|
|
|
|
// imdbIDInQueryRe matches the first "tt" followed by seven or eight digits
// anywhere in a string. It is compiled once at package initialization instead
// of on every call.
var imdbIDInQueryRe = regexp.MustCompile(`tt\d{7,8}`)

// GetIMDbIDFromQuery tries to extract an IMDb ID from query.
//
// The search is unanchored: the leftmost occurrence of "tt" followed by seven
// or eight digits is returned, wherever it appears in query (for example in
// the middle of a URL). If query contains no such sequence, an empty string
// and a non-nil error are returned.
func GetIMDbIDFromQuery(query string) (string, error) {
	// The pattern can never match the empty string, so an empty result from
	// FindString unambiguously means "no match".
	if id := imdbIDInQueryRe.FindString(query); id != "" {
		return id, nil
	}

	return "", errors.New("no IMDb ID in query")
}
|
|
|
|
|
|
|
|
// validIMDbIDRe matches a string that consists solely of "tt" followed by
// seven or eight digits. It is compiled once at package initialization instead
// of on every call.
var validIMDbIDRe = regexp.MustCompile(`^tt\d{7,8}$`)

// IsValidIMDbID returns true if s is a valid IMDb ID, i.e. the whole string is
// "tt" followed by seven or eight digits. Surrounding whitespace or any other
// extra characters make s invalid.
func IsValidIMDbID(s string) bool {
	return validIMDbIDRe.MatchString(s)
}
|
|
|
|
|
|
|
|
// Scrap tries to scrap an entry from the IMDb
|
|
|
|
func Scrap(id string) (IMDbEntry, error) {
|
|
|
|
if !IsValidIMDbID(id) {
|
|
|
|
return IMDbEntry{}, errors.New("not a valid IMDb ID")
|
|
|
|
}
|
|
|
|
|
|
|
|
var year int64 = 0
|
|
|
|
foundJSON := false
|
|
|
|
j := IMDbJSON{}
|
2023-10-24 00:39:04 +02:00
|
|
|
c := colly.NewCollector(
|
|
|
|
colly.MaxDepth(1),
|
|
|
|
colly.AllowedDomains("www.imdb.com"),
|
2023-10-29 19:23:11 +01:00
|
|
|
colly.UserAgent(useragents.GetRandomUseragent()),
|
2023-10-24 00:39:04 +02:00
|
|
|
)
|
|
|
|
|
|
|
|
c.OnRequest(func(r *colly.Request) {
|
|
|
|
r.Headers.Set("Accept-Language", "de,en-US;q=0.7,en;q=0.3")
|
|
|
|
})
|
2023-10-01 09:00:33 +02:00
|
|
|
|
|
|
|
c.OnHTML("head", func(e *colly.HTMLElement) {
|
|
|
|
e.ForEach("script", func(_ int, script *colly.HTMLElement) {
|
|
|
|
if script.Attr("type") == "application/ld+json" {
|
|
|
|
err := json.Unmarshal([]byte(script.Text), &j)
|
|
|
|
foundJSON = err == nil
|
|
|
|
}
|
|
|
|
})
|
|
|
|
e.ForEach("title", func(_ int, title *colly.HTMLElement) {
|
2024-01-23 13:08:55 +01:00
|
|
|
year = extractYearFromIMDbTitle(title.Text)
|
2023-10-01 09:00:33 +02:00
|
|
|
})
|
|
|
|
})
|
|
|
|
|
|
|
|
c.Visit(buildScrapingURL(id))
|
|
|
|
|
|
|
|
if !foundJSON {
|
|
|
|
return IMDbEntry{}, errors.New("could not scrape IMDb")
|
|
|
|
}
|
|
|
|
|
|
|
|
entry := j.TransformIntoIMDbEntry(id, year)
|
|
|
|
|
|
|
|
return entry, nil
|
|
|
|
}
|
|
|
|
|
|
|
|
// buildScrapingURL returns the URL of the IMDb title page for id. The id is
// inserted verbatim; no validation or escaping is performed here.
func buildScrapingURL(id string) string {
	const titlePageFormat = "https://www.imdb.com/title/%s/"

	return fmt.Sprintf(titlePageFormat, id)
}
|
|
|
|
|
|
|
|
// imdbRuntimeRe matches durations of the form "PT<hours>H<minutes>M" where
// both components are optional. Group 1 is the hour component including its
// "H" suffix, group 2 the minute component including its "M" suffix. It is
// compiled once at package initialization instead of on every call.
var imdbRuntimeRe = regexp.MustCompile(`^PT(\d+H)?(\d+M)?$`)

// convertIMDbRuntimeIntoMinutes converts a runtime string such as "PT2H22M",
// "PT3H" or "PT142M" into a number of minutes.
//
// Hour and minute components may have any number of digits, so runtimes of
// ten hours or more and minute-only values above 99 are handled. Strings that
// do not match the expected form, and components too large to fit into 32
// bits, yield 0.
func convertIMDbRuntimeIntoMinutes(s string) int64 {
	match := imdbRuntimeRe.FindStringSubmatch(s)
	if match == nil {
		return 0
	}

	components := [...]struct {
		raw        string // matched text including the unit suffix, or ""
		suffix     string
		minutesPer int64
	}{
		{match[1], "H", 60},
		{match[2], "M", 1},
	}

	var total int64
	for _, c := range components {
		if c.raw == "" {
			continue
		}
		// Parsing with a 32-bit limit keeps n*minutesPer and the running sum
		// far away from int64 overflow.
		n, err := strconv.ParseInt(strings.TrimSuffix(c.raw, c.suffix), 10, 32)
		if err != nil {
			return 0
		}
		total += n * c.minutesPer
	}

	return total
}
|
|
|
|
|
2024-01-23 13:08:55 +01:00
|
|
|
// imdbTitleYearRe finds a parenthesized section and captures the first run of
// four digits inside it. It is compiled once at package initialization instead
// of on every call.
var imdbTitleYearRe = regexp.MustCompile(`\(.*?(\d\d\d\d).*\)`)

// extractYearFromIMDbTitle extracts a year from a page title such as
// "The Matrix (1999) - IMDb". The year is the first four-digit sequence that
// follows an opening parenthesis and is itself followed, somewhere later in
// the string, by a closing parenthesis. It returns 0 if s contains no such
// sequence.
func extractYearFromIMDbTitle(s string) int64 {
	match := imdbTitleYearRe.FindStringSubmatch(s)
	if match == nil {
		return 0
	}

	year, err := strconv.ParseInt(match[1], 10, 64)
	if err != nil {
		return 0
	}

	return year
}
|