// imdbs/imdb-scraper.go

// Package imdbs extracts IMDb title IDs from user queries and scrapes title
// metadata from IMDb title pages.
package imdbs
import (
	"encoding/json"
	"errors"
	"fmt"
	"net/url"
	"strings"
	"unicode"

	"git.0x0001f346.de/andreas/utils"
	"github.com/gocolly/colly"
)
// GetIMDbIDFromQuery tries to extract an IMDb ID from query.
//
// query may be a bare IMDb ID or an absolute URL whose host is exactly
// "www.imdb.com" or "m.imdb.com". For a URL, the first path segment that is
// a valid IMDb ID is returned. In every other case the error
// "no IMDb ID in query" is returned together with an empty string.
func GetIMDbIDFromQuery(query string) (string, error) {
	// A bare ID needs no URL handling at all.
	if IsValidIMDbID(query) {
		return query, nil
	}

	// Every failure path reports the same error.
	notFound := func() (string, error) {
		return "", errors.New("no IMDb ID in query")
	}

	parsed, parseErr := url.Parse(query)
	if parseErr != nil {
		return notFound()
	}
	if parsed.Scheme == "" || parsed.Host == "" {
		return notFound()
	}

	switch parsed.Host {
	case "www.imdb.com", "m.imdb.com":
		// Accepted hosts; continue with the path.
	default:
		return notFound()
	}

	segments := strings.Split(parsed.Path, "/")
	for i := 0; i < len(segments); i++ {
		if IsValidIMDbID(segments[i]) {
			return segments[i], nil
		}
	}
	return notFound()
}
// IsValidIMDbID reports whether s is a well-formed IMDb ID as accepted by
// this package: the prefix "tt" followed by seven or eight ASCII digits
// (nine or ten bytes in total).
func IsValidIMDbID(s string) bool {
	if len(s) < 9 || len(s) > 10 {
		return false
	}
	if !strings.HasPrefix(s, "tt") {
		return false
	}
	for _, r := range s[2:] {
		// unicode.IsDigit alone also accepts non-ASCII decimal digits (for
		// example Arabic-Indic ones), which could slip through the byte-length
		// check and end up in a scraping URL. Only ASCII digits are allowed.
		if r > unicode.MaxASCII || !unicode.IsDigit(r) {
			return false
		}
	}
	return true
}
// Scrap tries to scrap an entry from the IMDb
func Scrap(id string) (IMDbEntry, error) {
if !IsValidIMDbID(id) {
return IMDbEntry{}, errors.New("not a valid IMDb ID")
}
var year int64 = 0
foundJSON := false
j := IMDbJSON{}
2023-10-24 00:39:04 +02:00
c := colly.NewCollector(
colly.MaxDepth(1),
colly.AllowedDomains("www.imdb.com"),
colly.UserAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36"),
)
c.OnRequest(func(r *colly.Request) {
r.Headers.Set("Accept-Language", "de,en-US;q=0.7,en;q=0.3")
})
2023-10-01 09:00:33 +02:00
c.OnHTML("head", func(e *colly.HTMLElement) {
e.ForEach("script", func(_ int, script *colly.HTMLElement) {
if script.Attr("type") == "application/ld+json" {
err := json.Unmarshal([]byte(script.Text), &j)
foundJSON = err == nil
}
})
e.ForEach("title", func(_ int, title *colly.HTMLElement) {
if j.AlternateName != "" {
year = extractYearFromIMDbTitle(title.Text, j.AlternateName)
return
}
year = extractYearFromIMDbTitle(title.Text, j.Name)
})
})
c.Visit(buildScrapingURL(id))
if !foundJSON {
return IMDbEntry{}, errors.New("could not scrape IMDb")
}
entry := j.TransformIntoIMDbEntry(id, year)
return entry, nil
}
// buildScrapingURL returns the www.imdb.com title page URL for id. The id is
// inserted verbatim; no validation or escaping is performed here.
func buildScrapingURL(id string) string {
	const (
		titleURLPrefix = "https://www.imdb.com/title/"
		titleURLSuffix = "/"
	)
	return titleURLPrefix + id + titleURLSuffix
}
func convertIMDbRuntimeIntoMinutes(s string) int64 {
if !strings.Contains(s, "PT") {
return 0
}
s = strings.Replace(s, "PT", "", -1)
if strings.Contains(s, "H") {
if strings.Contains(s, "M") {
ss := strings.Split(strings.Replace(s, "M", "", -1), "H")
return (utils.ConvertStringToIntOrZeroOnError(ss[0]) * 60) + utils.ConvertStringToIntOrZeroOnError(ss[1])
2023-10-01 09:00:33 +02:00
}
return utils.ConvertStringToIntOrZeroOnError(strings.Replace(s, "H", "", -1)) * 60
2023-10-01 09:00:33 +02:00
}
return utils.ConvertStringToIntOrZeroOnError(strings.Replace(s, "M", "", -1))
2023-10-01 09:00:33 +02:00
}
func extractYearFromIMDbTitle(s string, title string) int64 {
s = sanitizeIMDbTitleForYearExtraction(s, title)
if isIMDbTitleOfSeries(s) {
s = prepareChunckOfIMDbTitleOfSeriesForYearExtraction(s)
}
for _, chunck := range strings.Split(s, " ") {
if !strings.Contains(chunck, "(") || !strings.Contains(chunck, ")") {
continue
}
year := utils.ConvertStringToIntOrZeroOnError(
2023-10-01 09:00:33 +02:00
strings.Replace(
strings.Replace(chunck, "(", "", -1),
")",
"",
-1,
),
)
if year != 0 {
return year
}
}
return 0
}
2023-10-24 00:39:04 +02:00
// getSeriesIndicators returns the prefixes (German and English) that mark a
// page title as belonging to a series or mini series. Each prefix includes
// the opening parenthesis and the trailing space.
func getSeriesIndicators() []string {
	return []string{
		"(Fernsehserie ",
		"(Miniserie ",
		"(TV Mini Series ",
		"(TV Series ",
	}
}
func isIMDbTitleOfSeries(title string) bool {
for _, indicator := range getSeriesIndicators() {
if strings.Contains(title, indicator) {
return true
}
2023-10-01 09:00:33 +02:00
}
return false
}
func prepareChunckOfIMDbTitleOfSeriesForYearExtraction(s string) string {
2023-10-24 00:39:04 +02:00
for _, indicator := range getSeriesIndicators() {
s = strings.Replace(s, indicator, "(", -1)
}
2023-10-01 09:00:33 +02:00
sSplitted := strings.Split(s, "")
if len(sSplitted) != 2 {
return s
}
return fmt.Sprintf("%s)", sSplitted[0])
}
// sanitizeIMDbTitleForYearExtraction strips the parts of a page title s that
// get in the way of year extraction:
//
//   - every occurrence of title followed by a space is removed,
//   - the markers "(Fernsehfilm " and "(Video " are reduced to "(",
//   - a space directly before a closing parenthesis is dropped.
//
// An empty title is skipped instead of being used as a pattern.
func sanitizeIMDbTitleForYearExtraction(s string, title string) string {
	// With an empty title the pattern would be a lone space, and removing it
	// would strip every space from s and glue the year chunk to its
	// neighbours.
	if title != "" {
		s = strings.Replace(s, fmt.Sprintf("%s ", title), "", -1)
	}
	s = strings.Replace(s, "(Fernsehfilm ", "(", -1)
	s = strings.Replace(s, "(Video ", "(", -1)
	s = strings.Replace(s, " )", ")", -1)
	return s
}