// imdbs/imdb-scraper.go (Go, 180 lines, 3.9 KiB)
// History: 2023-10-01 09:00:33 +02:00

package imdbs
import (
"encoding/json"
"errors"
"fmt"
"net/url"
"strings"
"unicode"
"git.0x0001f346.de/andreas/utils"
2023-10-01 09:00:33 +02:00
"github.com/gocolly/colly"
)
// GetIMDbIDFromQuery tries to extract an IMDb ID from query.
//
// query may be either a bare IMDb ID or an absolute URL on www.imdb.com or
// m.imdb.com whose path contains an ID as one of its segments. The host is
// compared case-insensitively and without an explicit port, so a URL such as
// "https://WWW.IMDB.COM:443/title/<id>/" is accepted as well.
//
// It returns the first valid ID found, or an error if query contains none.
func GetIMDbIDFromQuery(query string) (string, error) {
	errNoID := errors.New("no IMDb ID in query")

	if IsValidIMDbID(query) {
		return query, nil
	}

	u, err := url.Parse(query)
	if err != nil || u.Scheme == "" || u.Host == "" {
		return "", errNoID
	}

	// Hostname drops an optional ":port"; host names are case-insensitive.
	switch strings.ToLower(u.Hostname()) {
	case "www.imdb.com", "m.imdb.com":
		// Known IMDb host, keep going.
	default:
		return "", errNoID
	}

	for _, segment := range strings.Split(u.Path, "/") {
		if IsValidIMDbID(segment) {
			return segment, nil
		}
	}
	return "", errNoID
}
// IsValidIMDbID returns true, if s is a valid IMDb id.
//
// A valid ID is 9 or 10 bytes long and consists of the prefix "tt" followed
// only by ASCII digits, i.e. "tt" plus seven or eight digits.
func IsValidIMDbID(s string) bool {
	if len(s) < 9 || len(s) > 10 {
		return false
	}
	if !strings.HasPrefix(s, "tt") {
		return false
	}
	for _, r := range s[2:] {
		// unicode.IsDigit alone also accepts non-ASCII decimal digits (for
		// example Arabic-Indic ones), which would pass the byte-length check
		// above with fewer than seven digits.
		if r > unicode.MaxASCII || !unicode.IsDigit(r) {
			return false
		}
	}
	return true
}
// Scrap tries to scrap an entry from the IMDb.
//
// It validates id, visits the URL built by buildScrapingURL with a collector
// restricted to www.imdb.com, and looks inside <head> for a
// <script type="application/ld+json"> element whose text decodes into
// IMDbJSON. The first script that decodes successfully is used. The year is
// derived from the page <title> via extractYearFromIMDbTitle, using the JSON's
// AlternateName when it is set and its Name otherwise.
//
// An error is returned if id is not a valid IMDb ID or if no JSON document
// could be decoded; in the latter case the error wraps the one reported by
// the collector's Visit call, if there was one.
func Scrap(id string) (IMDbEntry, error) {
	if !IsValidIMDbID(id) {
		return IMDbEntry{}, errors.New("not a valid IMDb ID")
	}

	var (
		year      int64
		foundJSON bool
		j         IMDbJSON
	)

	c := colly.NewCollector(colly.MaxDepth(1), colly.AllowedDomains("www.imdb.com"))
	c.OnHTML("head", func(e *colly.HTMLElement) {
		e.ForEach("script", func(_ int, script *colly.HTMLElement) {
			if foundJSON || script.Attr("type") != "application/ld+json" {
				return
			}
			// Decode into a scratch value so that a malformed script can
			// neither overwrite parts of j nor reset foundJSON.
			var parsed IMDbJSON
			if err := json.Unmarshal([]byte(script.Text), &parsed); err != nil {
				return
			}
			j = parsed
			foundJSON = true
		})

		// The scripts are visited first, so j is already populated here when
		// the page carried a decodable JSON document.
		e.ForEach("title", func(_ int, title *colly.HTMLElement) {
			name := j.Name
			if j.AlternateName != "" {
				name = j.AlternateName
			}
			year = extractYearFromIMDbTitle(title.Text, name)
		})
	})

	visitErr := c.Visit(buildScrapingURL(id))
	if !foundJSON {
		if visitErr != nil {
			// Surface the underlying cause instead of dropping it.
			return IMDbEntry{}, fmt.Errorf("could not scrape IMDb: %w", visitErr)
		}
		return IMDbEntry{}, errors.New("could not scrape IMDb")
	}

	return j.TransformIntoIMDbEntry(id, year), nil
}
// buildScrapingURL returns the URL of the IMDb title page for id.
func buildScrapingURL(id string) string {
	const titlePageFormat = "https://www.imdb.com/title/%s/"
	return fmt.Sprintf(titlePageFormat, id)
}
func convertIMDbRuntimeIntoMinutes(s string) int64 {
if !strings.Contains(s, "PT") {
return 0
}
s = strings.Replace(s, "PT", "", -1)
if strings.Contains(s, "H") {
if strings.Contains(s, "M") {
ss := strings.Split(strings.Replace(s, "M", "", -1), "H")
return (utils.ConvertStringToIntOrZeroOnError(ss[0]) * 60) + utils.ConvertStringToIntOrZeroOnError(ss[1])
2023-10-01 09:00:33 +02:00
}
return utils.ConvertStringToIntOrZeroOnError(strings.Replace(s, "H", "", -1)) * 60
2023-10-01 09:00:33 +02:00
}
return utils.ConvertStringToIntOrZeroOnError(strings.Replace(s, "M", "", -1))
2023-10-01 09:00:33 +02:00
}
func extractYearFromIMDbTitle(s string, title string) int64 {
s = sanitizeIMDbTitleForYearExtraction(s, title)
if isIMDbTitleOfSeries(s) {
s = prepareChunckOfIMDbTitleOfSeriesForYearExtraction(s)
}
for _, chunck := range strings.Split(s, " ") {
if !strings.Contains(chunck, "(") || !strings.Contains(chunck, ")") {
continue
}
year := utils.ConvertStringToIntOrZeroOnError(
2023-10-01 09:00:33 +02:00
strings.Replace(
strings.Replace(chunck, "(", "", -1),
")",
"",
-1,
),
)
if year != 0 {
return year
}
}
return 0
}
// isIMDbTitleOfSeries reports whether title carries one of the series markers
// "(Fernsehserie " or "(TV Series ".
func isIMDbTitleOfSeries(title string) bool {
	return strings.Contains(title, "(Fernsehserie ") ||
		strings.Contains(title, "(TV Series ")
}
// prepareChunckOfIMDbTitleOfSeriesForYearExtraction reduces the year range of
// a series title to its first year.
//
// The "(Fernsehserie " and "(TV Series " markers are shortened to "(". If the
// result contains an en dash (U+2013), everything from the first one on is
// replaced by ")", so "(TV Series 2008–2013) - IMDb" becomes "(2008)".
// Input without an en dash is returned with only the markers shortened.
//
// NOTE(review): the en dash is assumed to be the separator IMDb puts between
// the first and last year of a series — confirm against a live page title.
func prepareChunckOfIMDbTitleOfSeriesForYearExtraction(s string) string {
	// Written as an escape so the separator cannot get lost in transit;
	// splitting on "" would cut s into single characters instead.
	const yearRangeSeparator = "\u2013"

	s = strings.Replace(s, "(Fernsehserie ", "(", -1)
	s = strings.Replace(s, "(TV Series ", "(", -1)

	cut := strings.Index(s, yearRangeSeparator)
	if cut < 0 {
		return s
	}
	return s[:cut] + ")"
}
// sanitizeIMDbTitleForYearExtraction strips from s everything that would get
// in the way of finding a "(year)" chunk.
//
// It removes every occurrence of title followed by a space, shortens the
// "(Fernsehfilm " and "(Video " markers to "(", and turns " )" into ")".
// An empty title is skipped: removing "" + " " would delete every space in s
// and glue all chunks together.
func sanitizeIMDbTitleForYearExtraction(s string, title string) string {
	if title != "" {
		s = strings.Replace(s, title+" ", "", -1)
	}
	s = strings.Replace(s, "(Fernsehfilm ", "(", -1)
	s = strings.Replace(s, "(Video ", "(", -1)
	s = strings.Replace(s, " )", ")", -1)
	return s
}