upgraded to regular expressions
This commit is contained in:
parent
add23fbe9d
commit
3dc03f9c04
139
imdb-scraper.go
139
imdb-scraper.go
@ -4,34 +4,22 @@ import (
|
|||||||
"encoding/json"
|
"encoding/json"
|
||||||
"errors"
|
"errors"
|
||||||
"fmt"
|
"fmt"
|
||||||
"net/url"
|
"regexp"
|
||||||
|
"strconv"
|
||||||
"strings"
|
"strings"
|
||||||
"unicode"
|
|
||||||
|
|
||||||
"git.0x0001f346.de/andreas/useragents"
|
"git.0x0001f346.de/andreas/useragents"
|
||||||
"git.0x0001f346.de/andreas/utils"
|
|
||||||
"github.com/gocolly/colly"
|
"github.com/gocolly/colly"
|
||||||
)
|
)
|
||||||
|
|
||||||
// GetIMDbIDFromQuery tries to extract an IMDb ID from query
|
// GetIMDbIDFromQuery tries to extract an IMDb ID from query
|
||||||
func GetIMDbIDFromQuery(query string) (string, error) {
|
func GetIMDbIDFromQuery(query string) (string, error) {
|
||||||
if IsValidIMDbID(query) {
|
r := regexp.MustCompile(`tt\d{7,8}`)
|
||||||
return query, nil
|
|
||||||
}
|
|
||||||
|
|
||||||
u, err := url.Parse(query)
|
match := r.FindStringSubmatch(query)
|
||||||
if err != nil || u.Scheme == "" || u.Host == "" {
|
|
||||||
return "", errors.New("no IMDb ID in query")
|
|
||||||
}
|
|
||||||
|
|
||||||
if u.Host != "www.imdb.com" && u.Host != "m.imdb.com" {
|
if match != nil {
|
||||||
return "", errors.New("no IMDb ID in query")
|
return match[0], nil
|
||||||
}
|
|
||||||
|
|
||||||
for _, s := range strings.Split(u.Path, "/") {
|
|
||||||
if IsValidIMDbID(s) {
|
|
||||||
return s, nil
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
return "", errors.New("no IMDb ID in query")
|
return "", errors.New("no IMDb ID in query")
|
||||||
@ -39,22 +27,9 @@ func GetIMDbIDFromQuery(query string) (string, error) {
|
|||||||
|
|
||||||
// IsValidIMDbID returns true, if s is a valid IMDb id
|
// IsValidIMDbID returns true, if s is a valid IMDb id
|
||||||
func IsValidIMDbID(s string) bool {
|
func IsValidIMDbID(s string) bool {
|
||||||
if len(s) < 9 || len(s) > 10 {
|
r := regexp.MustCompile(`^tt\d{7,8}$`)
|
||||||
return false
|
|
||||||
}
|
|
||||||
|
|
||||||
if string([]rune(s)[0:2]) != "tt" {
|
return r.MatchString(s)
|
||||||
return false
|
|
||||||
}
|
|
||||||
|
|
||||||
afterPrefix := string([]rune(s)[2:])
|
|
||||||
for _, r := range afterPrefix {
|
|
||||||
if !unicode.IsDigit(r) {
|
|
||||||
return false
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return true
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// Scrap tries to scrap an entry from the IMDb
|
// Scrap tries to scrap an entry from the IMDb
|
||||||
@ -84,11 +59,7 @@ func Scrap(id string) (IMDbEntry, error) {
|
|||||||
}
|
}
|
||||||
})
|
})
|
||||||
e.ForEach("title", func(_ int, title *colly.HTMLElement) {
|
e.ForEach("title", func(_ int, title *colly.HTMLElement) {
|
||||||
if j.AlternateName != "" {
|
year = extractYearFromIMDbTitle(title.Text)
|
||||||
year = extractYearFromIMDbTitle(title.Text, j.AlternateName)
|
|
||||||
return
|
|
||||||
}
|
|
||||||
year = extractYearFromIMDbTitle(title.Text, j.Name)
|
|
||||||
})
|
})
|
||||||
})
|
})
|
||||||
|
|
||||||
@ -108,89 +79,43 @@ func buildScrapingURL(id string) string {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func convertIMDbRuntimeIntoMinutes(s string) int64 {
|
func convertIMDbRuntimeIntoMinutes(s string) int64 {
|
||||||
if !strings.Contains(s, "PT") {
|
r := regexp.MustCompile(`^PT(\dH)?(\d{1,2}M)?$`)
|
||||||
|
|
||||||
|
match := r.FindStringSubmatch(s)
|
||||||
|
if match == nil {
|
||||||
return 0
|
return 0
|
||||||
}
|
}
|
||||||
|
|
||||||
s = strings.Replace(s, "PT", "", -1)
|
var hours int64 = 0
|
||||||
|
var minutes int64 = 0
|
||||||
|
|
||||||
if strings.Contains(s, "H") {
|
if len(match[1]) > 0 {
|
||||||
if strings.Contains(s, "M") {
|
i, err := strconv.ParseInt(strings.Replace(match[1], "H", "", -1), 10, 64)
|
||||||
ss := strings.Split(strings.Replace(s, "M", "", -1), "H")
|
if err == nil {
|
||||||
return (utils.ConvertStringToIntOrZeroOnError(ss[0]) * 60) + utils.ConvertStringToIntOrZeroOnError(ss[1])
|
hours = i
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return utils.ConvertStringToIntOrZeroOnError(strings.Replace(s, "H", "", -1)) * 60
|
if len(match[2]) > 0 {
|
||||||
|
i, err := strconv.ParseInt(strings.Replace(match[2], "M", "", -1), 10, 64)
|
||||||
|
if err == nil {
|
||||||
|
minutes = i
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return utils.ConvertStringToIntOrZeroOnError(strings.Replace(s, "M", "", -1))
|
return hours*60 + minutes
|
||||||
}
|
}
|
||||||
|
|
||||||
func extractYearFromIMDbTitle(s string, title string) int64 {
|
func extractYearFromIMDbTitle(s string) int64 {
|
||||||
s = sanitizeIMDbTitleForYearExtraction(s, title)
|
r := regexp.MustCompile(`\(.*?(\d\d\d\d).*\)`)
|
||||||
|
|
||||||
if isIMDbTitleOfSeries(s) {
|
match := r.FindStringSubmatch(s)
|
||||||
s = prepareChunckOfIMDbTitleOfSeriesForYearExtraction(s)
|
if match != nil {
|
||||||
}
|
year, err := strconv.ParseInt(match[1], 10, 64)
|
||||||
|
if err == nil {
|
||||||
for _, chunck := range strings.Split(s, " ") {
|
|
||||||
if !strings.Contains(chunck, "(") || !strings.Contains(chunck, ")") {
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
|
|
||||||
year := utils.ConvertStringToIntOrZeroOnError(
|
|
||||||
strings.Replace(
|
|
||||||
strings.Replace(chunck, "(", "", -1),
|
|
||||||
")",
|
|
||||||
"",
|
|
||||||
-1,
|
|
||||||
),
|
|
||||||
)
|
|
||||||
if year != 0 {
|
|
||||||
return year
|
return year
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return 0
|
return 0
|
||||||
}
|
}
|
||||||
|
|
||||||
func getSeriesIndicators() []string {
|
|
||||||
return []string{
|
|
||||||
"(Fernsehserie ",
|
|
||||||
"(Miniserie ",
|
|
||||||
"(TV Mini Series ",
|
|
||||||
"(TV Series ",
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
func isIMDbTitleOfSeries(title string) bool {
|
|
||||||
for _, indicator := range getSeriesIndicators() {
|
|
||||||
if strings.Contains(title, indicator) {
|
|
||||||
return true
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return false
|
|
||||||
}
|
|
||||||
|
|
||||||
func prepareChunckOfIMDbTitleOfSeriesForYearExtraction(s string) string {
|
|
||||||
for _, indicator := range getSeriesIndicators() {
|
|
||||||
s = strings.Replace(s, indicator, "(", -1)
|
|
||||||
}
|
|
||||||
|
|
||||||
sSplitted := strings.Split(s, "–")
|
|
||||||
if len(sSplitted) != 2 {
|
|
||||||
return s
|
|
||||||
}
|
|
||||||
|
|
||||||
return fmt.Sprintf("%s)", sSplitted[0])
|
|
||||||
}
|
|
||||||
|
|
||||||
func sanitizeIMDbTitleForYearExtraction(s string, title string) string {
|
|
||||||
s = strings.Replace(s, fmt.Sprintf("%s ", title), "", -1)
|
|
||||||
s = strings.Replace(s, "(Fernsehfilm ", "(", -1)
|
|
||||||
s = strings.Replace(s, "(Video ", "(", -1)
|
|
||||||
s = strings.Replace(s, " )", ")", -1)
|
|
||||||
|
|
||||||
return s
|
|
||||||
}
|
|
||||||
|
@ -21,6 +21,10 @@ func TestConvertIMDbRuntimeIntoMinutes(t *testing.T) {
|
|||||||
"PT41M": 41,
|
"PT41M": 41,
|
||||||
"PT2H": 120,
|
"PT2H": 120,
|
||||||
"PT2H1M": 121,
|
"PT2H1M": 121,
|
||||||
|
" PT2H1M ": 0,
|
||||||
|
"PT12H1M ": 0,
|
||||||
|
"PT121M": 0,
|
||||||
|
"": 0,
|
||||||
}
|
}
|
||||||
|
|
||||||
for d, expectedResult := range data {
|
for d, expectedResult := range data {
|
||||||
@ -33,6 +37,7 @@ func TestConvertIMDbRuntimeIntoMinutes(t *testing.T) {
|
|||||||
|
|
||||||
func TestGetIMDbIDFromQuery(t *testing.T) {
|
func TestGetIMDbIDFromQuery(t *testing.T) {
|
||||||
data := map[string]string{
|
data := map[string]string{
|
||||||
|
"tt2861424": "tt2861424",
|
||||||
" tt2861424 ": "tt2861424",
|
" tt2861424 ": "tt2861424",
|
||||||
"https://www.imdb.com/title/tt2861424": "tt2861424",
|
"https://www.imdb.com/title/tt2861424": "tt2861424",
|
||||||
"https://www.imdb.com/title/tt2861424/": "tt2861424",
|
"https://www.imdb.com/title/tt2861424/": "tt2861424",
|
||||||
@ -40,9 +45,9 @@ func TestGetIMDbIDFromQuery(t *testing.T) {
|
|||||||
"https://m.imdb.com/title/tt2861424": "tt2861424",
|
"https://m.imdb.com/title/tt2861424": "tt2861424",
|
||||||
"https://m.imdb.com/title/tt2861424/": "tt2861424",
|
"https://m.imdb.com/title/tt2861424/": "tt2861424",
|
||||||
"https://m.imdb.com/title/tt2861424/?ref_=vp_vi_tt": "tt2861424",
|
"https://m.imdb.com/title/tt2861424/?ref_=vp_vi_tt": "tt2861424",
|
||||||
|
"https://www.google.com/title/tt2861424/": "tt2861424",
|
||||||
"https://www.imdb.com/": "",
|
"https://www.imdb.com/": "",
|
||||||
"https://www.google.com/title/tt2861424/": "",
|
"": "",
|
||||||
"tt2861424tt2861424tt2861424": "",
|
|
||||||
}
|
}
|
||||||
|
|
||||||
for d, expectedResult := range data {
|
for d, expectedResult := range data {
|
||||||
@ -63,53 +68,44 @@ func TestExtractYearFromIMDbTitle(t *testing.T) {
|
|||||||
data := []tResult{
|
data := []tResult{
|
||||||
{
|
{
|
||||||
Data: "John Wick: Kapitel 4 (2023) - IMDb",
|
Data: "John Wick: Kapitel 4 (2023) - IMDb",
|
||||||
Title: "John Wick: Kapitel 4",
|
|
||||||
ExpectedResult: 2023,
|
ExpectedResult: 2023,
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
Data: "Matrix (1999) - IMDb",
|
Data: "Matrix (1999) - IMDb",
|
||||||
Title: "Matrix",
|
|
||||||
ExpectedResult: 1999,
|
ExpectedResult: 1999,
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
Data: "Thurgood (Fernsehfilm 2011) - IMDb",
|
Data: "Thurgood (Fernsehfilm 2011) - IMDb",
|
||||||
Title: "Thurgood",
|
|
||||||
ExpectedResult: 2011,
|
ExpectedResult: 2011,
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
Data: "Pretty/Handsome (Fernsehfilm 2008) - IMDb",
|
Data: "Pretty/Handsome (Fernsehfilm 2008) - IMDb",
|
||||||
Title: "Pretty/Handsome",
|
|
||||||
ExpectedResult: 2008,
|
ExpectedResult: 2008,
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
Data: "Red Planet: Deleted Scenes (Video 2000) - IMDb",
|
Data: "Red Planet: Deleted Scenes (Video 2000) - IMDb",
|
||||||
Title: "Red Planet: Deleted Scenes",
|
|
||||||
ExpectedResult: 2000,
|
ExpectedResult: 2000,
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
Data: "Last Night in Soho: Deleted Scenes (Video 2022) - IMDb",
|
Data: "Last Night in Soho: Deleted Scenes (Video 2022) - IMDb",
|
||||||
Title: "Last Night in Soho: Deleted Scenes",
|
|
||||||
ExpectedResult: 2022,
|
ExpectedResult: 2022,
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
Data: "Eine schrecklich nette Familie (Fernsehserie 1987–1997) - IMDb",
|
Data: "Eine schrecklich nette Familie (Fernsehserie 1987–1997) - IMDb",
|
||||||
Title: "Eine schrecklich nette Familie",
|
|
||||||
ExpectedResult: 1987,
|
ExpectedResult: 1987,
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
Data: "Rick and Morty (Fernsehserie 2013– ) - IMDb",
|
Data: "Rick and Morty (Fernsehserie 2013– ) - IMDb",
|
||||||
Title: "Rick and Morty",
|
|
||||||
ExpectedResult: 2013,
|
ExpectedResult: 2013,
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
Data: "Unser Kosmos: Die Reise geht weiter (Miniserie 2014) - IMDb",
|
Data: "Unser Kosmos: Die Reise geht weiter (Miniserie 2014) - IMDb",
|
||||||
Title: "Unser Kosmos: Die Reise geht weiter",
|
|
||||||
ExpectedResult: 2014,
|
ExpectedResult: 2014,
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
for _, r := range data {
|
for _, r := range data {
|
||||||
result := extractYearFromIMDbTitle(r.Data, r.Title)
|
result := extractYearFromIMDbTitle(r.Data)
|
||||||
if result != r.ExpectedResult {
|
if result != r.ExpectedResult {
|
||||||
t.Errorf("\ngot: %d\nwanted: %d\nfor: %q", result, r.ExpectedResult, r.Data)
|
t.Errorf("\ngot: %d\nwanted: %d\nfor: %q", result, r.ExpectedResult, r.Data)
|
||||||
}
|
}
|
||||||
@ -119,14 +115,18 @@ func TestExtractYearFromIMDbTitle(t *testing.T) {
|
|||||||
func TestIsValidIMDbID(t *testing.T) {
|
func TestIsValidIMDbID(t *testing.T) {
|
||||||
data := map[string]bool{
|
data := map[string]bool{
|
||||||
"tt0000000": true,
|
"tt0000000": true,
|
||||||
|
"tt99999999": true,
|
||||||
"tt2911666": true,
|
"tt2911666": true,
|
||||||
"tt10366206": true,
|
"tt10366206": true,
|
||||||
"tt0944947": true,
|
"tt0944947": true,
|
||||||
"tt11737520": true,
|
"tt11737520": true,
|
||||||
"tt291166": false,
|
"tt291166": false, // too short
|
||||||
"tt103662060": false,
|
"tt103662060": false, // too long (i hear that one a lot)
|
||||||
"ttt1036620": false,
|
"ttt1036620": false, // invalid characters
|
||||||
"https://www.imdb.com/": false,
|
"tt2911A66": false, // invalid characters
|
||||||
|
" tt2911666 ": false, // not trimmed
|
||||||
|
"https://www.imdb.com/": false, // wtf is this
|
||||||
|
"": false, // rly?!
|
||||||
}
|
}
|
||||||
|
|
||||||
for d, expectedResult := range data {
|
for d, expectedResult := range data {
|
||||||
@ -137,20 +137,6 @@ func TestIsValidIMDbID(t *testing.T) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func TestPrepareChunckOfIMDbTitleOfSeriesForYearExtraction(t *testing.T) {
|
|
||||||
data := map[string]string{
|
|
||||||
"(Fernsehserie 1987–1997)": "(1987)",
|
|
||||||
"(Fernsehserie 2013– )": "(2013)",
|
|
||||||
}
|
|
||||||
|
|
||||||
for d, expectedResult := range data {
|
|
||||||
result := prepareChunckOfIMDbTitleOfSeriesForYearExtraction(d)
|
|
||||||
if result != expectedResult {
|
|
||||||
t.Errorf("\ngot: %+v\nwanted: %+v\nfor: %q", result, expectedResult, d)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
func TestScrap(t *testing.T) {
|
func TestScrap(t *testing.T) {
|
||||||
data := map[string]IMDbEntry{
|
data := map[string]IMDbEntry{
|
||||||
"tt0000000": {},
|
"tt0000000": {},
|
||||||
|
Loading…
Reference in New Issue
Block a user