From 3dc03f9c046826603054fc8d2af5bd3d4d0d52c2 Mon Sep 17 00:00:00 2001 From: Andreas Schulte <0x0001f346@pm.me> Date: Tue, 23 Jan 2024 13:08:55 +0100 Subject: [PATCH] upgraded to regular expressions --- imdb-scraper.go | 141 ++++++++++--------------------------------- imdb-scraper_test.go | 52 ++++++---------- 2 files changed, 52 insertions(+), 141 deletions(-) diff --git a/imdb-scraper.go b/imdb-scraper.go index 1d149b0..38b7574 100644 --- a/imdb-scraper.go +++ b/imdb-scraper.go @@ -4,34 +4,22 @@ import ( "encoding/json" "errors" "fmt" - "net/url" + "regexp" + "strconv" "strings" - "unicode" "git.0x0001f346.de/andreas/useragents" - "git.0x0001f346.de/andreas/utils" "github.com/gocolly/colly" ) // GetIMDbIDFromQuery tries to extract an IMDb ID from query func GetIMDbIDFromQuery(query string) (string, error) { - if IsValidIMDbID(query) { - return query, nil - } + r := regexp.MustCompile(`tt\d{7,8}`) - u, err := url.Parse(query) - if err != nil || u.Scheme == "" || u.Host == "" { - return "", errors.New("no IMDb ID in query") - } + match := r.FindStringSubmatch(query) - if u.Host != "www.imdb.com" && u.Host != "m.imdb.com" { - return "", errors.New("no IMDb ID in query") - } - - for _, s := range strings.Split(u.Path, "/") { - if IsValidIMDbID(s) { - return s, nil - } + if match != nil { + return match[0], nil } return "", errors.New("no IMDb ID in query") @@ -39,22 +27,9 @@ func GetIMDbIDFromQuery(query string) (string, error) { // IsValidIMDbID returns true, if s is a valid IMDb id func IsValidIMDbID(s string) bool { - if len(s) < 9 || len(s) > 10 { - return false - } + r := regexp.MustCompile(`^tt\d{7,8}$`) - if string([]rune(s)[0:2]) != "tt" { - return false - } - - afterPrefix := string([]rune(s)[2:]) - for _, r := range afterPrefix { - if !unicode.IsDigit(r) { - return false - } - } - - return true + return r.MatchString(s) } // Scrap tries to scrap an entry from the IMDb @@ -84,11 +59,7 @@ func Scrap(id string) (IMDbEntry, error) { } }) e.ForEach("title", func(_ int, title *colly.HTMLElement) { - if j.AlternateName != "" { - year = extractYearFromIMDbTitle(title.Text, j.AlternateName) - return - } - year = extractYearFromIMDbTitle(title.Text, j.Name) + year = extractYearFromIMDbTitle(title.Text) }) }) @@ -108,89 +79,43 @@ func buildScrapingURL(id string) string { } func convertIMDbRuntimeIntoMinutes(s string) int64 { - if !strings.Contains(s, "PT") { + r := regexp.MustCompile(`^PT(\dH)?(\d{1,2}M)?$`) + + match := r.FindStringSubmatch(s) + if match == nil { return 0 } - s = strings.Replace(s, "PT", "", -1) + var hours int64 = 0 + var minutes int64 = 0 - if strings.Contains(s, "H") { - if strings.Contains(s, "M") { - ss := strings.Split(strings.Replace(s, "M", "", -1), "H") - return (utils.ConvertStringToIntOrZeroOnError(ss[0]) * 60) + utils.ConvertStringToIntOrZeroOnError(ss[1]) + if len(match[1]) > 0 { + i, err := strconv.ParseInt(strings.Replace(match[1], "H", "", -1), 10, 64) + if err == nil { + hours = i } - - return utils.ConvertStringToIntOrZeroOnError(strings.Replace(s, "H", "", -1)) * 60 } - return utils.ConvertStringToIntOrZeroOnError(strings.Replace(s, "M", "", -1)) + if len(match[2]) > 0 { + i, err := strconv.ParseInt(strings.Replace(match[2], "M", "", -1), 10, 64) + if err == nil { + minutes = i + } + } + + return hours*60 + minutes } -func extractYearFromIMDbTitle(s string, title string) int64 { - s = sanitizeIMDbTitleForYearExtraction(s, title) +func extractYearFromIMDbTitle(s string) int64 { + r := regexp.MustCompile(`\(.*?(\d\d\d\d).*\)`) - if isIMDbTitleOfSeries(s) { - s = prepareChunckOfIMDbTitleOfSeriesForYearExtraction(s) - } - - for _, chunck := range strings.Split(s, " ") { - if !strings.Contains(chunck, "(") || !strings.Contains(chunck, ")") { - continue - } - - year := utils.ConvertStringToIntOrZeroOnError( - strings.Replace( - strings.Replace(chunck, "(", "", -1), - ")", - "", - -1, - ), - ) - if year != 0 { + match := r.FindStringSubmatch(s) + if match != nil { + year, err := strconv.ParseInt(match[1], 10, 64) + if err == nil { return year } } return 0 } - -func getSeriesIndicators() []string { - return []string{ - "(Fernsehserie ", - "(Miniserie ", - "(TV Mini Series ", - "(TV Series ", - } -} - -func isIMDbTitleOfSeries(title string) bool { - for _, indicator := range getSeriesIndicators() { - if strings.Contains(title, indicator) { - return true - } - } - - return false -} - -func prepareChunckOfIMDbTitleOfSeriesForYearExtraction(s string) string { - for _, indicator := range getSeriesIndicators() { - s = strings.Replace(s, indicator, "(", -1) - } - - sSplitted := strings.Split(s, "–") - if len(sSplitted) != 2 { - return s - } - - return fmt.Sprintf("%s)", sSplitted[0]) -} - -func sanitizeIMDbTitleForYearExtraction(s string, title string) string { - s = strings.Replace(s, fmt.Sprintf("%s ", title), "", -1) - s = strings.Replace(s, "(Fernsehfilm ", "(", -1) - s = strings.Replace(s, "(Video ", "(", -1) - s = strings.Replace(s, " )", ")", -1) - - return s -} diff --git a/imdb-scraper_test.go b/imdb-scraper_test.go index fa4486c..8eed8d5 100644 --- a/imdb-scraper_test.go +++ b/imdb-scraper_test.go @@ -18,9 +18,13 @@ func TestBuildScrapingURL(t *testing.T) { func TestConvertIMDbRuntimeIntoMinutes(t *testing.T) { data := map[string]int64{ - "PT41M": 41, - "PT2H": 120, - "PT2H1M": 121, + "PT41M": 41, + "PT2H": 120, + "PT2H1M": 121, + " PT2H1M ": 0, + "PT12H1M ": 0, + "PT121M": 0, + "": 0, } for d, expectedResult := range data { @@ -34,15 +38,16 @@ func TestConvertIMDbRuntimeIntoMinutes(t *testing.T) { func TestGetIMDbIDFromQuery(t *testing.T) { data := map[string]string{ "tt2861424": "tt2861424", + " tt2861424 ": "tt2861424", "https://www.imdb.com/title/tt2861424": "tt2861424", "https://www.imdb.com/title/tt2861424/": "tt2861424", "https://www.imdb.com/title/tt2861424/?ref_=vp_vi_tt": "tt2861424", "https://m.imdb.com/title/tt2861424": "tt2861424", "https://m.imdb.com/title/tt2861424/": "tt2861424", "https://m.imdb.com/title/tt2861424/?ref_=vp_vi_tt": "tt2861424", + "https://www.google.com/title/tt2861424/": "tt2861424", "https://www.imdb.com/": "", - "https://www.google.com/title/tt2861424/": "", - "tt2861424tt2861424tt2861424": "", + "": "", } for d, expectedResult := range data { @@ -63,53 +68,44 @@ func TestExtractYearFromIMDbTitle(t *testing.T) { data := []tResult{ { Data: "John Wick: Kapitel 4 (2023) - IMDb", - Title: "John Wick: Kapitel 4", ExpectedResult: 2023, }, { Data: "Matrix (1999) - IMDb", - Title: "Matrix", ExpectedResult: 1999, }, { Data: "Thurgood (Fernsehfilm 2011) - IMDb", - Title: "Thurgood", ExpectedResult: 2011, }, { Data: "Pretty/Handsome (Fernsehfilm 2008) - IMDb", - Title: "Pretty/Handsome", ExpectedResult: 2008, }, { Data: "Red Planet: Deleted Scenes (Video 2000) - IMDb", - Title: "Red Planet: Deleted Scenes", ExpectedResult: 2000, }, { Data: "Last Night in Soho: Deleted Scenes (Video 2022) - IMDb", - Title: "Last Night in Soho: Deleted Scenes", ExpectedResult: 2022, }, { Data: "Eine schrecklich nette Familie (Fernsehserie 1987–1997) - IMDb", - Title: "Eine schrecklich nette Familie", ExpectedResult: 1987, }, { Data: "Rick and Morty (Fernsehserie 2013– ) - IMDb", - Title: "Rick and Morty", ExpectedResult: 2013, }, { Data: "Unser Kosmos: Die Reise geht weiter (Miniserie 2014) - IMDb", - Title: "Unser Kosmos: Die Reise geht weiter", ExpectedResult: 2014, }, } for _, r := range data { - result := extractYearFromIMDbTitle(r.Data, r.Title) + result := extractYearFromIMDbTitle(r.Data) if result != r.ExpectedResult { t.Errorf("\ngot: %d\nwanted: %d\nfor: %q", result, r.ExpectedResult, r.Data) } @@ -119,14 +115,18 @@ func TestExtractYearFromIMDbTitle(t *testing.T) { func TestIsValidIMDbID(t *testing.T) { data := map[string]bool{ "tt0000000": true, + "tt99999999": true, "tt2911666": true, "tt10366206": true, "tt0944947": true, "tt11737520": true, - "tt291166": false, - "tt103662060": false, - "ttt1036620": false, - "https://www.imdb.com/": false, + "tt291166": false, // too short + "tt103662060": false, // too long (i hear that one a lot) + "ttt1036620": false, // invalid characters + "tt2911A66": false, // invalid characters + " tt2911666 ": false, // not trimmed + "https://www.imdb.com/": false, // wtf is this + "": false, // rly?! } for d, expectedResult := range data { @@ -137,20 +137,6 @@ func TestIsValidIMDbID(t *testing.T) { } } -func TestPrepareChunckOfIMDbTitleOfSeriesForYearExtraction(t *testing.T) { - data := map[string]string{ - "(Fernsehserie 1987–1997)": "(1987)", - "(Fernsehserie 2013– )": "(2013)", - } - - for d, expectedResult := range data { - result := prepareChunckOfIMDbTitleOfSeriesForYearExtraction(d) - if result != expectedResult { - t.Errorf("\ngot: %+v\nwanted: %+v\nfor: %q", result, expectedResult, d) - } - } -} - func TestScrap(t *testing.T) { data := map[string]IMDbEntry{ "tt0000000": {},