improved series detection

This commit is contained in:
Andreas Schulte 2023-10-24 00:39:04 +02:00
parent 4bb98a6518
commit 469e2984bc
Signed by: andreas
GPG Key ID: DCD1B6A247B69DB6
2 changed files with 36 additions and 16 deletions

View File

@ -65,7 +65,15 @@ func Scrap(id string) (IMDbEntry, error) {
var year int64 = 0
foundJSON := false
j := IMDbJSON{}
c := colly.NewCollector(colly.MaxDepth(1), colly.AllowedDomains("www.imdb.com"))
c := colly.NewCollector(
colly.MaxDepth(1),
colly.AllowedDomains("www.imdb.com"),
colly.UserAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36"),
)
c.OnRequest(func(r *colly.Request) {
r.Headers.Set("Accept-Language", "de,en-US;q=0.7,en;q=0.3")
})
c.OnHTML("head", func(e *colly.HTMLElement) {
e.ForEach("script", func(_ int, script *colly.HTMLElement) {
@ -145,26 +153,29 @@ func extractYearFromIMDbTitle(s string, title string) int64 {
return 0
}
// getSeriesIndicators returns the substrings that mark an IMDb page title as
// belonging to a series. Every indicator starts with "(" and ends with a
// space, so it only matches the opening of a parenthesised type/year chunk
// such as "(TV Series 2013...". The list holds both German-looking
// ("Fernsehserie", "Miniserie") and English ("TV Series", "TV Mini Series")
// labels.
//
// A fresh slice is returned on every call, so callers may modify the result
// without affecting each other.
func getSeriesIndicators() []string {
	return []string{
		"(Fernsehserie ",
		"(Miniserie ",
		"(TV Mini Series ",
		"(TV Series ",
	}
}

// isIMDbTitleOfSeries reports whether title contains at least one of the
// series indicators returned by getSeriesIndicators.
//
// The indicator list is the single source of truth: no label is checked
// separately here, so adding a new label to getSeriesIndicators is enough to
// have it recognised.
func isIMDbTitleOfSeries(title string) bool {
	for _, indicator := range getSeriesIndicators() {
		if strings.Contains(title, indicator) {
			return true
		}
	}
	return false
}
func prepareChunckOfIMDbTitleOfSeriesForYearExtraction(s string) string {
s = strings.Replace(s, "(Fernsehserie ", "(", -1)
s = strings.Replace(s, "(Miniserie ", "(", -1)
s = strings.Replace(s, "(TV Series ", "(", -1)
for _, indicator := range getSeriesIndicators() {
s = strings.Replace(s, indicator, "(", -1)
}
sSplitted := strings.Split(s, "")
if len(sSplitted) != 2 {

View File

@ -196,7 +196,7 @@ func TestScrap(t *testing.T) {
Rating: 9.1,
RuntimeInMins: 0,
Title: "Rick and Morty",
Type: "TVSeries",
Type: "Series",
Year: 2013,
},
"tt0092400": {
@ -205,9 +205,18 @@ func TestScrap(t *testing.T) {
Rating: 8.1,
RuntimeInMins: 0,
Title: "Married with Children",
Type: "TVSeries",
Type: "Series",
Year: 1987,
},
"tt7366338": {
AlternateName: "",
IMDbID: "tt7366338",
Rating: 9.3,
RuntimeInMins: 0,
Title: "Chernobyl",
Type: "Series",
Year: 2019,
},
}
for d, expectedResult := range data {