upgraded to regular expressions

This commit is contained in:
Andreas Schulte 2024-01-23 13:08:55 +01:00
parent add23fbe9d
commit 3dc03f9c04
Signed by: andreas
GPG Key ID: E123DA7BD8F9C8AB
2 changed files with 52 additions and 141 deletions

View File

@ -4,34 +4,22 @@ import (
"encoding/json" "encoding/json"
"errors" "errors"
"fmt" "fmt"
"net/url" "regexp"
"strconv"
"strings" "strings"
"unicode"
"git.0x0001f346.de/andreas/useragents" "git.0x0001f346.de/andreas/useragents"
"git.0x0001f346.de/andreas/utils"
"github.com/gocolly/colly" "github.com/gocolly/colly"
) )
// GetIMDbIDFromQuery tries to extract an IMDb ID from query // GetIMDbIDFromQuery tries to extract an IMDb ID from query
func GetIMDbIDFromQuery(query string) (string, error) { func GetIMDbIDFromQuery(query string) (string, error) {
if IsValidIMDbID(query) { r := regexp.MustCompile(`tt\d{7,8}`)
return query, nil
}
u, err := url.Parse(query) match := r.FindStringSubmatch(query)
if err != nil || u.Scheme == "" || u.Host == "" {
return "", errors.New("no IMDb ID in query")
}
if u.Host != "www.imdb.com" && u.Host != "m.imdb.com" { if match != nil {
return "", errors.New("no IMDb ID in query") return match[0], nil
}
for _, s := range strings.Split(u.Path, "/") {
if IsValidIMDbID(s) {
return s, nil
}
} }
return "", errors.New("no IMDb ID in query") return "", errors.New("no IMDb ID in query")
@ -39,22 +27,9 @@ func GetIMDbIDFromQuery(query string) (string, error) {
// IsValidIMDbID returns true, if s is a valid IMDb id // IsValidIMDbID returns true, if s is a valid IMDb id
func IsValidIMDbID(s string) bool { func IsValidIMDbID(s string) bool {
if len(s) < 9 || len(s) > 10 { r := regexp.MustCompile(`^tt\d{7,8}$`)
return false
}
if string([]rune(s)[0:2]) != "tt" { return r.MatchString(s)
return false
}
afterPrefix := string([]rune(s)[2:])
for _, r := range afterPrefix {
if !unicode.IsDigit(r) {
return false
}
}
return true
} }
// Scrap tries to scrap an entry from the IMDb // Scrap tries to scrap an entry from the IMDb
@ -84,11 +59,7 @@ func Scrap(id string) (IMDbEntry, error) {
} }
}) })
e.ForEach("title", func(_ int, title *colly.HTMLElement) { e.ForEach("title", func(_ int, title *colly.HTMLElement) {
if j.AlternateName != "" { year = extractYearFromIMDbTitle(title.Text)
year = extractYearFromIMDbTitle(title.Text, j.AlternateName)
return
}
year = extractYearFromIMDbTitle(title.Text, j.Name)
}) })
}) })
@ -108,89 +79,43 @@ func buildScrapingURL(id string) string {
} }
func convertIMDbRuntimeIntoMinutes(s string) int64 { func convertIMDbRuntimeIntoMinutes(s string) int64 {
if !strings.Contains(s, "PT") { r := regexp.MustCompile(`^PT(\dH)?(\d{1,2}M)?$`)
match := r.FindStringSubmatch(s)
if match == nil {
return 0 return 0
} }
s = strings.Replace(s, "PT", "", -1) var hours int64 = 0
var minutes int64 = 0
if strings.Contains(s, "H") { if len(match[1]) > 0 {
if strings.Contains(s, "M") { i, err := strconv.ParseInt(strings.Replace(match[1], "H", "", -1), 10, 64)
ss := strings.Split(strings.Replace(s, "M", "", -1), "H") if err == nil {
return (utils.ConvertStringToIntOrZeroOnError(ss[0]) * 60) + utils.ConvertStringToIntOrZeroOnError(ss[1]) hours = i
}
} }
return utils.ConvertStringToIntOrZeroOnError(strings.Replace(s, "H", "", -1)) * 60 if len(match[2]) > 0 {
i, err := strconv.ParseInt(strings.Replace(match[2], "M", "", -1), 10, 64)
if err == nil {
minutes = i
}
} }
return utils.ConvertStringToIntOrZeroOnError(strings.Replace(s, "M", "", -1)) return hours*60 + minutes
} }
func extractYearFromIMDbTitle(s string, title string) int64 { func extractYearFromIMDbTitle(s string) int64 {
s = sanitizeIMDbTitleForYearExtraction(s, title) r := regexp.MustCompile(`\(.*?(\d\d\d\d).*\)`)
if isIMDbTitleOfSeries(s) { match := r.FindStringSubmatch(s)
s = prepareChunckOfIMDbTitleOfSeriesForYearExtraction(s) if match != nil {
} year, err := strconv.ParseInt(match[1], 10, 64)
if err == nil {
for _, chunck := range strings.Split(s, " ") {
if !strings.Contains(chunck, "(") || !strings.Contains(chunck, ")") {
continue
}
year := utils.ConvertStringToIntOrZeroOnError(
strings.Replace(
strings.Replace(chunck, "(", "", -1),
")",
"",
-1,
),
)
if year != 0 {
return year return year
} }
} }
return 0 return 0
} }
func getSeriesIndicators() []string {
return []string{
"(Fernsehserie ",
"(Miniserie ",
"(TV Mini Series ",
"(TV Series ",
}
}
func isIMDbTitleOfSeries(title string) bool {
for _, indicator := range getSeriesIndicators() {
if strings.Contains(title, indicator) {
return true
}
}
return false
}
func prepareChunckOfIMDbTitleOfSeriesForYearExtraction(s string) string {
for _, indicator := range getSeriesIndicators() {
s = strings.Replace(s, indicator, "(", -1)
}
sSplitted := strings.Split(s, "–")
if len(sSplitted) != 2 {
return s
}
return fmt.Sprintf("%s)", sSplitted[0])
}
func sanitizeIMDbTitleForYearExtraction(s string, title string) string {
s = strings.Replace(s, fmt.Sprintf("%s ", title), "", -1)
s = strings.Replace(s, "(Fernsehfilm ", "(", -1)
s = strings.Replace(s, "(Video ", "(", -1)
s = strings.Replace(s, " )", ")", -1)
return s
}

View File

@ -21,6 +21,10 @@ func TestConvertIMDbRuntimeIntoMinutes(t *testing.T) {
"PT41M": 41, "PT41M": 41,
"PT2H": 120, "PT2H": 120,
"PT2H1M": 121, "PT2H1M": 121,
" PT2H1M ": 0,
"PT12H1M ": 0,
"PT121M": 0,
"": 0,
} }
for d, expectedResult := range data { for d, expectedResult := range data {
@ -33,6 +37,7 @@ func TestConvertIMDbRuntimeIntoMinutes(t *testing.T) {
func TestGetIMDbIDFromQuery(t *testing.T) { func TestGetIMDbIDFromQuery(t *testing.T) {
data := map[string]string{ data := map[string]string{
"tt2861424": "tt2861424",
" tt2861424 ": "tt2861424", " tt2861424 ": "tt2861424",
"https://www.imdb.com/title/tt2861424": "tt2861424", "https://www.imdb.com/title/tt2861424": "tt2861424",
"https://www.imdb.com/title/tt2861424/": "tt2861424", "https://www.imdb.com/title/tt2861424/": "tt2861424",
@ -40,9 +45,9 @@ func TestGetIMDbIDFromQuery(t *testing.T) {
"https://m.imdb.com/title/tt2861424": "tt2861424", "https://m.imdb.com/title/tt2861424": "tt2861424",
"https://m.imdb.com/title/tt2861424/": "tt2861424", "https://m.imdb.com/title/tt2861424/": "tt2861424",
"https://m.imdb.com/title/tt2861424/?ref_=vp_vi_tt": "tt2861424", "https://m.imdb.com/title/tt2861424/?ref_=vp_vi_tt": "tt2861424",
"https://www.google.com/title/tt2861424/": "tt2861424",
"https://www.imdb.com/": "", "https://www.imdb.com/": "",
"https://www.google.com/title/tt2861424/": "", "": "",
"tt2861424tt2861424tt2861424": "",
} }
for d, expectedResult := range data { for d, expectedResult := range data {
@ -63,53 +68,44 @@ func TestExtractYearFromIMDbTitle(t *testing.T) {
data := []tResult{ data := []tResult{
{ {
Data: "John Wick: Kapitel 4 (2023) - IMDb", Data: "John Wick: Kapitel 4 (2023) - IMDb",
Title: "John Wick: Kapitel 4",
ExpectedResult: 2023, ExpectedResult: 2023,
}, },
{ {
Data: "Matrix (1999) - IMDb", Data: "Matrix (1999) - IMDb",
Title: "Matrix",
ExpectedResult: 1999, ExpectedResult: 1999,
}, },
{ {
Data: "Thurgood (Fernsehfilm 2011) - IMDb", Data: "Thurgood (Fernsehfilm 2011) - IMDb",
Title: "Thurgood",
ExpectedResult: 2011, ExpectedResult: 2011,
}, },
{ {
Data: "Pretty/Handsome (Fernsehfilm 2008) - IMDb", Data: "Pretty/Handsome (Fernsehfilm 2008) - IMDb",
Title: "Pretty/Handsome",
ExpectedResult: 2008, ExpectedResult: 2008,
}, },
{ {
Data: "Red Planet: Deleted Scenes (Video 2000) - IMDb", Data: "Red Planet: Deleted Scenes (Video 2000) - IMDb",
Title: "Red Planet: Deleted Scenes",
ExpectedResult: 2000, ExpectedResult: 2000,
}, },
{ {
Data: "Last Night in Soho: Deleted Scenes (Video 2022) - IMDb", Data: "Last Night in Soho: Deleted Scenes (Video 2022) - IMDb",
Title: "Last Night in Soho: Deleted Scenes",
ExpectedResult: 2022, ExpectedResult: 2022,
}, },
{ {
Data: "Eine schrecklich nette Familie (Fernsehserie 1987–1997) - IMDb", Data: "Eine schrecklich nette Familie (Fernsehserie 1987–1997) - IMDb",
Title: "Eine schrecklich nette Familie",
ExpectedResult: 1987, ExpectedResult: 1987,
}, },
{ {
Data: "Rick and Morty (Fernsehserie 2013– ) - IMDb", Data: "Rick and Morty (Fernsehserie 2013– ) - IMDb",
Title: "Rick and Morty",
ExpectedResult: 2013, ExpectedResult: 2013,
}, },
{ {
Data: "Unser Kosmos: Die Reise geht weiter (Miniserie 2014) - IMDb", Data: "Unser Kosmos: Die Reise geht weiter (Miniserie 2014) - IMDb",
Title: "Unser Kosmos: Die Reise geht weiter",
ExpectedResult: 2014, ExpectedResult: 2014,
}, },
} }
for _, r := range data { for _, r := range data {
result := extractYearFromIMDbTitle(r.Data, r.Title) result := extractYearFromIMDbTitle(r.Data)
if result != r.ExpectedResult { if result != r.ExpectedResult {
t.Errorf("\ngot: %d\nwanted: %d\nfor: %q", result, r.ExpectedResult, r.Data) t.Errorf("\ngot: %d\nwanted: %d\nfor: %q", result, r.ExpectedResult, r.Data)
} }
@ -119,14 +115,18 @@ func TestExtractYearFromIMDbTitle(t *testing.T) {
func TestIsValidIMDbID(t *testing.T) { func TestIsValidIMDbID(t *testing.T) {
data := map[string]bool{ data := map[string]bool{
"tt0000000": true, "tt0000000": true,
"tt99999999": true,
"tt2911666": true, "tt2911666": true,
"tt10366206": true, "tt10366206": true,
"tt0944947": true, "tt0944947": true,
"tt11737520": true, "tt11737520": true,
"tt291166": false, "tt291166": false, // too short
"tt103662060": false, "tt103662060": false, // too long (i hear that one a lot)
"ttt1036620": false, "ttt1036620": false, // invalid characters
"https://www.imdb.com/": false, "tt2911A66": false, // invalid characters
" tt2911666 ": false, // not trimmed
"https://www.imdb.com/": false, // wtf is this
"": false, // rly?!
} }
for d, expectedResult := range data { for d, expectedResult := range data {
@ -137,20 +137,6 @@ func TestIsValidIMDbID(t *testing.T) {
} }
} }
func TestPrepareChunckOfIMDbTitleOfSeriesForYearExtraction(t *testing.T) {
data := map[string]string{
"(Fernsehserie 1987–1997)": "(1987)",
"(Fernsehserie 2013– )": "(2013)",
}
for d, expectedResult := range data {
result := prepareChunckOfIMDbTitleOfSeriesForYearExtraction(d)
if result != expectedResult {
t.Errorf("\ngot: %+v\nwanted: %+v\nfor: %q", result, expectedResult, d)
}
}
}
func TestScrap(t *testing.T) { func TestScrap(t *testing.T) {
data := map[string]IMDbEntry{ data := map[string]IMDbEntry{
"tt0000000": {}, "tt0000000": {},