imdbs/imdb-scraper.go

180 lines
3.9 KiB
Go
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

package imdbs
import (
"encoding/json"
"errors"
"fmt"
"net/url"
"strings"
"unicode"
"git.0x0001f346.de/andreas/utils"
"github.com/gocolly/colly"
)
// GetIMDbIDFromQuery returns the IMDb ID contained in query.
//
// query may be a bare IMDb ID, which is returned unchanged, or an absolute
// URL whose host is www.imdb.com or m.imdb.com. For a URL, the first path
// segment that is a valid IMDb ID is returned. Every other input yields an
// error.
func GetIMDbIDFromQuery(query string) (string, error) {
	if IsValidIMDbID(query) {
		return query, nil
	}

	// All failure paths report the same error.
	notFound := func() (string, error) {
		return "", errors.New("no IMDb ID in query")
	}

	parsed, err := url.Parse(query)
	if err != nil {
		return notFound()
	}
	if parsed.Scheme == "" || parsed.Host == "" {
		// Not an absolute URL.
		return notFound()
	}

	switch parsed.Host {
	case "www.imdb.com", "m.imdb.com":
		// Known IMDb host: search the path for an ID below.
	default:
		return notFound()
	}

	for _, segment := range strings.Split(parsed.Path, "/") {
		if IsValidIMDbID(segment) {
			return segment, nil
		}
	}
	return notFound()
}
// IsValidIMDbID reports whether s is a valid IMDb ID: the prefix "tt"
// followed by seven or eight ASCII decimal digits.
func IsValidIMDbID(s string) bool {
	// "tt" plus 7 or 8 one-byte digits is 9 or 10 bytes.
	if len(s) < 9 || len(s) > 10 {
		return false
	}
	if s[:2] != "tt" {
		return false
	}
	for _, r := range s[2:] {
		// unicode.IsDigit alone also accepts non-ASCII digits (e.g.
		// Arabic-Indic or fullwidth), which are multi-byte and would let
		// strings with fewer than seven digits pass the byte-length check.
		if r > unicode.MaxASCII || !unicode.IsDigit(r) {
			return false
		}
	}
	return true
}
// Scrap fetches the IMDb title page for id and builds an IMDbEntry from it.
//
// The entry data is decoded from the application/ld+json script element(s)
// in the page head, and the year is extracted from the text of the page's
// title element. An error is returned if id is not a valid IMDb ID or if no
// ld+json script could be decoded; in the latter case the error wraps the
// request error, if there was one.
func Scrap(id string) (IMDbEntry, error) {
	if !IsValidIMDbID(id) {
		return IMDbEntry{}, errors.New("not a valid IMDb ID")
	}

	var (
		year      int64
		foundJSON bool
		j         IMDbJSON
	)

	c := colly.NewCollector(colly.MaxDepth(1), colly.AllowedDomains("www.imdb.com"))
	c.OnHTML("head", func(e *colly.HTMLElement) {
		e.ForEach("script", func(_ int, script *colly.HTMLElement) {
			if script.Attr("type") != "application/ld+json" {
				return
			}
			// Only ever set foundJSON to true: a later script that fails to
			// decode must not erase an earlier successful decode.
			if err := json.Unmarshal([]byte(script.Text), &j); err == nil {
				foundJSON = true
			}
		})
		e.ForEach("title", func(_ int, title *colly.HTMLElement) {
			// The scripts above have already been processed, so j holds
			// whatever was decoded. Prefer the alternate name when present.
			name := j.Name
			if j.AlternateName != "" {
				name = j.AlternateName
			}
			year = extractYearFromIMDbTitle(title.Text, name)
		})
	})

	// Keep the request error so a failed scrape can report its cause instead
	// of silently discarding it.
	visitErr := c.Visit(buildScrapingURL(id))
	if !foundJSON {
		if visitErr != nil {
			return IMDbEntry{}, fmt.Errorf("could not scrape IMDb: %w", visitErr)
		}
		return IMDbEntry{}, errors.New("could not scrape IMDb")
	}

	return j.TransformIntoIMDbEntry(id, year), nil
}
// buildScrapingURL returns the URL of the IMDb title page for id.
func buildScrapingURL(id string) string {
	const titlePageFormat = "https://www.imdb.com/title/%s/"
	return fmt.Sprintf(titlePageFormat, id)
}
// convertIMDbRuntimeIntoMinutes converts a runtime string such as "PT2H22M",
// "PT2H" or "PT45M" into a number of minutes. It returns 0 if s does not
// contain "PT".
//
// Number parsing is delegated to utils.ConvertStringToIntOrZeroOnError,
// which presumably yields 0 for non-numeric input (judging by its name;
// verify against the utils package).
func convertIMDbRuntimeIntoMinutes(s string) int64 {
	if !strings.Contains(s, "PT") {
		return 0
	}

	duration := strings.Replace(s, "PT", "", -1)
	hasHours := strings.Contains(duration, "H")
	hasMinutes := strings.Contains(duration, "M")

	switch {
	case hasHours && hasMinutes:
		// "<h>H<m>M": drop the M marker, then split around H.
		parts := strings.Split(strings.Replace(duration, "M", "", -1), "H")
		hours := utils.ConvertStringToIntOrZeroOnError(parts[0])
		minutes := utils.ConvertStringToIntOrZeroOnError(parts[1])
		return (hours * 60) + minutes
	case hasHours:
		// Hours only.
		return utils.ConvertStringToIntOrZeroOnError(strings.Replace(duration, "H", "", -1)) * 60
	default:
		// Minutes only (or nothing recognisable).
		return utils.ConvertStringToIntOrZeroOnError(strings.Replace(duration, "M", "", -1))
	}
}
// extractYearFromIMDbTitle extracts a year from s, the text of an IMDb page
// title, given the entry's name in title.
//
// s is first sanitized (and, for series titles, reduced further); then each
// space-separated chunk containing both "(" and ")" is stripped of its
// parentheses and converted to a number. The first non-zero result is
// returned, or 0 if there is none.
func extractYearFromIMDbTitle(s string, title string) int64 {
	cleaned := sanitizeIMDbTitleForYearExtraction(s, title)
	if isIMDbTitleOfSeries(cleaned) {
		cleaned = prepareChunckOfIMDbTitleOfSeriesForYearExtraction(cleaned)
	}

	for _, token := range strings.Split(cleaned, " ") {
		if strings.Contains(token, "(") && strings.Contains(token, ")") {
			withoutOpen := strings.Replace(token, "(", "", -1)
			digits := strings.Replace(withoutOpen, ")", "", -1)
			if year := utils.ConvertStringToIntOrZeroOnError(digits); year != 0 {
				return year
			}
		}
	}
	return 0
}
// isIMDbTitleOfSeries reports whether title contains one of the series
// markers "(Fernsehserie " or "(TV Series ".
func isIMDbTitleOfSeries(title string) bool {
	for _, marker := range []string{"(Fernsehserie ", "(TV Series "} {
		if strings.Contains(title, marker) {
			return true
		}
	}
	return false
}
// prepareChunckOfIMDbTitleOfSeriesForYearExtraction reduces the year range
// of a series title to its start year so that it can be parsed like a
// single-year title, e.g. "(TV Series 2008–2013) - IMDb" becomes "(2008)".
//
// The series marker is replaced by "(" and the result is split at the en
// dash. If there is not exactly one en dash, the string is returned with
// only the marker replaced.
func prepareChunckOfIMDbTitleOfSeriesForYearExtraction(s string) string {
	// The range separator is an en dash (U+2013), not an ASCII hyphen. It is
	// written as an escape so it cannot be lost or confused with "-" by
	// editors and tooling. Splitting on "" would split s into single
	// characters and never isolate the start year.
	// NOTE(review): assumes IMDb separates start and end year with U+2013 —
	// confirm against a live series page.
	const yearRangeSeparator = "\u2013"

	s = strings.Replace(s, "(Fernsehserie ", "(", -1)
	s = strings.Replace(s, "(TV Series ", "(", -1)

	parts := strings.Split(s, yearRangeSeparator)
	if len(parts) != 2 {
		return s
	}
	// parts[0] is "(<start year>"; close the parenthesis again.
	return fmt.Sprintf("%s)", parts[0])
}
// sanitizeIMDbTitleForYearExtraction strips parts of the page title s that
// get in the way of year extraction: every occurrence of title followed by a
// space is removed, the markers "(Fernsehfilm " and "(Video " are reduced to
// "(", and " )" is collapsed to ")".
func sanitizeIMDbTitleForYearExtraction(s string, title string) string {
	// Applied strictly in this order; each step works on the output of the
	// previous one.
	replacements := [][2]string{
		{fmt.Sprintf("%s ", title), ""},
		{"(Fernsehfilm ", "("},
		{"(Video ", "("},
		{" )", ")"},
	}
	for _, pair := range replacements {
		s = strings.Replace(s, pair[0], pair[1], -1)
	}
	return s
}