// imdbs/imdb-scraper.go

package imdbs

import (
	"encoding/json"
	"errors"
	"fmt"
	"net/url"
	"strings"
	"unicode"

	"git.0x0001f346.de/andreas/utils"
	"github.com/gocolly/colly"
)

// GetIMDbIDFromQuery tries to extract an IMDb ID from query.
//
// query may be a bare IMDb ID (as accepted by IsValidIMDbID) or an absolute
// URL on www.imdb.com or m.imdb.com whose path contains an ID as one of its
// segments. The host is compared case-insensitively and without its port, so
// "https://WWW.IMDB.COM:443/title/tt0111161/" is accepted as well.
//
// It returns the first ID found, or an error if query does not hold one.
func GetIMDbIDFromQuery(query string) (string, error) {
	if IsValidIMDbID(query) {
		return query, nil
	}

	errNoID := errors.New("no IMDb ID in query")

	u, err := url.Parse(query)
	if err != nil || u.Scheme == "" || u.Host == "" {
		return "", errNoID
	}

	// u.Host may carry an explicit port and arbitrary letter case; neither
	// changes which site the URL points at.
	switch strings.ToLower(u.Hostname()) {
	case "www.imdb.com", "m.imdb.com":
	default:
		return "", errNoID
	}

	for _, segment := range strings.Split(u.Path, "/") {
		if IsValidIMDbID(segment) {
			return segment, nil
		}
	}

	return "", errNoID
}

// IsValidIMDbID returns true, if s is a valid IMDb id: the prefix "tt"
// followed by seven or eight ASCII digits.
func IsValidIMDbID(s string) bool {
	// A valid ID is pure ASCII, so its byte length equals its character
	// count: 2 for the prefix plus 7 or 8 digits.
	if len(s) < 9 || len(s) > 10 {
		return false
	}

	if !strings.HasPrefix(s, "tt") {
		return false
	}

	for _, r := range s[2:] {
		// unicode.IsDigit on its own also accepts digits from other scripts
		// (e.g. Arabic-Indic), which never occur in an IMDb ID.
		if r > unicode.MaxASCII || !unicode.IsDigit(r) {
			return false
		}
	}

	return true
}

// Scrap tries to scrap an entry from the IMDb.
//
// It visits the title page built from id, decodes the JSON-LD scripts found
// in the page's <head> into an IMDbJSON value and derives the year from the
// page's <title> text. The result is converted with TransformIntoIMDbEntry.
//
// An error is returned if id is not a valid IMDb ID or if no JSON-LD script
// could be decoded; in the latter case the error wraps the error reported by
// the page visit, when there was one.
func Scrap(id string) (IMDbEntry, error) {
	if !IsValidIMDbID(id) {
		return IMDbEntry{}, errors.New("not a valid IMDb ID")
	}

	var (
		year      int64
		foundJSON bool
		j         IMDbJSON
	)
	c := colly.NewCollector(colly.MaxDepth(1), colly.AllowedDomains("www.imdb.com"))

	c.OnHTML("head", func(e *colly.HTMLElement) {
		e.ForEach("script", func(_ int, script *colly.HTMLElement) {
			if script.Attr("type") != "application/ld+json" {
				return
			}
			// Only ever set the flag: a later script that fails to decode must
			// not discard metadata that an earlier script already provided.
			if err := json.Unmarshal([]byte(script.Text), &j); err == nil {
				foundJSON = true
			}
		})
		e.ForEach("title", func(_ int, title *colly.HTMLElement) {
			// Prefer the alternate name when the metadata carries one.
			name := j.Name
			if j.AlternateName != "" {
				name = j.AlternateName
			}
			year = extractYearFromIMDbTitle(title.Text, name)
		})
	})

	visitErr := c.Visit(buildScrapingURL(id))

	if !foundJSON {
		if visitErr != nil {
			// Surface the underlying cause instead of dropping it.
			return IMDbEntry{}, fmt.Errorf("could not scrape IMDb: %w", visitErr)
		}
		return IMDbEntry{}, errors.New("could not scrape IMDb")
	}

	return j.TransformIntoIMDbEntry(id, year), nil
}

// buildScrapingURL returns the URL of the IMDb title page for id.
func buildScrapingURL(id string) string {
	const titlePageFormat = "https://www.imdb.com/title/%s/"

	return fmt.Sprintf(titlePageFormat, id)
}

// convertIMDbRuntimeIntoMinutes converts a runtime of the form "PT2H22M",
// "PT2H" or "PT45M" into a number of minutes.
//
// It returns 0 if s does not contain "PT". A seconds component after the
// minutes (as in "PT1H30M15S") is ignored, so only whole minutes are counted.
// Parts that cannot be converted count as 0, relying on
// utils.ConvertStringToIntOrZeroOnError.
func convertIMDbRuntimeIntoMinutes(s string) int64 {
	if !strings.Contains(s, "PT") {
		return 0
	}

	rest := strings.Replace(s, "PT", "", -1)

	var total int64
	if i := strings.Index(rest, "H"); i >= 0 {
		total = utils.ConvertStringToIntOrZeroOnError(rest[:i]) * 60
		rest = rest[i+1:]
	}

	// Keep only the digits in front of "M"; whatever follows (seconds) must
	// not leak into the minute value.
	if i := strings.Index(rest, "M"); i >= 0 {
		rest = rest[:i]
	}

	return total + utils.ConvertStringToIntOrZeroOnError(rest)
}

// extractYearFromIMDbTitle returns the year contained in s, the text of a
// page title, or 0 if none is found. title is removed from s beforehand so
// that it cannot interfere with the search.
//
// After sanitizing (and, for series, reducing the year range to its start),
// s is split at spaces; the first word that contains both parentheses and
// converts to a non-zero number once they are removed is the result.
func extractYearFromIMDbTitle(s string, title string) int64 {
	cleaned := sanitizeIMDbTitleForYearExtraction(s, title)
	if isIMDbTitleOfSeries(cleaned) {
		cleaned = prepareChunckOfIMDbTitleOfSeriesForYearExtraction(cleaned)
	}

	dropParens := strings.NewReplacer("(", "", ")", "")
	for _, word := range strings.Split(cleaned, " ") {
		if strings.Contains(word, "(") && strings.Contains(word, ")") {
			if year := utils.ConvertStringToIntOrZeroOnError(dropParens.Replace(word)); year != 0 {
				return year
			}
		}
	}

	return 0
}

// isIMDbTitleOfSeries reports whether title contains one of the series
// markers "(Fernsehserie " or "(TV Series ".
func isIMDbTitleOfSeries(title string) bool {
	for _, marker := range [...]string{"(Fernsehserie ", "(TV Series "} {
		if strings.Contains(title, marker) {
			return true
		}
	}

	return false
}

// prepareChunckOfIMDbTitleOfSeriesForYearExtraction rewrites the title of a
// series so that its start year stands alone in parentheses.
//
// The series markers are reduced to "(". If the result contains exactly one
// en dash, everything from the dash on is replaced by ")", turning e.g.
// "(2011–2019) - IMDb" into "(2011)". Otherwise the result is returned as is.
func prepareChunckOfIMDbTitleOfSeriesForYearExtraction(s string) string {
	const dash = "–"

	stripped := strings.Replace(s, "(Fernsehserie ", "(", -1)
	stripped = strings.Replace(stripped, "(TV Series ", "(", -1)

	// Exactly one dash is required to tell start and end of the range apart.
	if strings.Count(stripped, dash) != 1 {
		return stripped
	}

	return stripped[:strings.Index(stripped, dash)] + ")"
}

// sanitizeIMDbTitleForYearExtraction prepares s, the text of a page title,
// for the year extraction.
//
// Every occurrence of title followed by a space is removed, the markers
// "(Fernsehfilm " and "(Video " are reduced to "(", and a space in front of
// a closing parenthesis is dropped. An empty title removes nothing.
func sanitizeIMDbTitleForYearExtraction(s string, title string) string {
	// With an empty title the pattern would be a single space, and removing
	// it would glue all words of s together.
	if title != "" {
		s = strings.Replace(s, title+" ", "", -1)
	}
	s = strings.Replace(s, "(Fernsehfilm ", "(", -1)
	s = strings.Replace(s, "(Video ", "(", -1)
	s = strings.Replace(s, " )", ")", -1)

	return s
}