From 469e2984bc92e4fa0a5478e046035f67c702093f Mon Sep 17 00:00:00 2001 From: Andreas Schulte Date: Tue, 24 Oct 2023 00:39:04 +0200 Subject: [PATCH] improved series detection --- imdb-scraper.go | 39 +++++++++++++++++++++++++-------------- imdb-scraper_test.go | 13 +++++++++++-- 2 files changed, 36 insertions(+), 16 deletions(-) diff --git a/imdb-scraper.go b/imdb-scraper.go index 294b94a..edb71b1 100644 --- a/imdb-scraper.go +++ b/imdb-scraper.go @@ -65,7 +65,15 @@ func Scrap(id string) (IMDbEntry, error) { var year int64 = 0 foundJSON := false j := IMDbJSON{} - c := colly.NewCollector(colly.MaxDepth(1), colly.AllowedDomains("www.imdb.com")) + c := colly.NewCollector( + colly.MaxDepth(1), + colly.AllowedDomains("www.imdb.com"), + colly.UserAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36"), + ) + + c.OnRequest(func(r *colly.Request) { + r.Headers.Set("Accept-Language", "de,en-US;q=0.7,en;q=0.3") + }) c.OnHTML("head", func(e *colly.HTMLElement) { e.ForEach("script", func(_ int, script *colly.HTMLElement) { @@ -145,26 +153,29 @@ func extractYearFromIMDbTitle(s string, title string) int64 { return 0 } +func getSeriesIndicators() []string { + return []string{ + "(Fernsehserie ", + "(Miniserie ", + "(TV Mini Series ", + "(TV Series ", + } +} + func isIMDbTitleOfSeries(title string) bool { - if strings.Contains(title, "(Fernsehserie ") { - return true - } - - if strings.Contains(title, "(Miniserie ") { - return true - } - - if strings.Contains(title, "(TV Series ") { - return true + for _, indicator := range getSeriesIndicators() { + if strings.Contains(title, indicator) { + return true + } } return false } func prepareChunckOfIMDbTitleOfSeriesForYearExtraction(s string) string { - s = strings.Replace(s, "(Fernsehserie ", "(", -1) - s = strings.Replace(s, "(Miniserie ", "(", -1) - s = strings.Replace(s, "(TV Series ", "(", -1) + for _, indicator := range getSeriesIndicators() { + s = strings.Replace(s, indicator, "(", -1) + } sSplitted := strings.Split(s, "–") if len(sSplitted) != 2 { diff --git a/imdb-scraper_test.go b/imdb-scraper_test.go index 2c682ae..4c16046 100644 --- a/imdb-scraper_test.go +++ b/imdb-scraper_test.go @@ -196,7 +196,7 @@ func TestScrap(t *testing.T) { Rating: 9.1, RuntimeInMins: 0, Title: "Rick and Morty", - Type: "TVSeries", + Type: "Series", Year: 2013, }, "tt0092400": { @@ -205,9 +205,18 @@ func TestScrap(t *testing.T) { Rating: 8.1, RuntimeInMins: 0, Title: "Married with Children", - Type: "TVSeries", + Type: "Series", Year: 1987, }, + "tt7366338": { + AlternateName: "", + IMDbID: "tt7366338", + Rating: 9.3, + RuntimeInMins: 0, + Title: "Chernobyl", + Type: "Series", + Year: 2019, + }, } for d, expectedResult := range data {