Improve internal/reader/scraper/scraper.go a bit
- Make findContentUsingCustomRules more idiomatic: in Go, when a function returns a non-nil error, its other return values may be garbage. Moreover, ignoring errors is bad practice. - getPredefinedScraperRules now runs in constant time, instead of iterating over a list with around 50 items in it.
This commit is contained in:
parent
5b2558bf92
commit
c2d2f31438
2 changed files with 10 additions and 9 deletions
|
@ -78,10 +78,9 @@ func findContentUsingCustomRules(page io.Reader, rules string) (string, error) {
|
||||||
|
|
||||||
contents := ""
|
contents := ""
|
||||||
document.Find(rules).Each(func(i int, s *goquery.Selection) {
|
document.Find(rules).Each(func(i int, s *goquery.Selection) {
|
||||||
var content string
|
if content, err := goquery.OuterHtml(s); err == nil {
|
||||||
|
contents += content
|
||||||
content, _ = goquery.OuterHtml(s)
|
}
|
||||||
contents += content
|
|
||||||
})
|
})
|
||||||
|
|
||||||
return contents, nil
|
return contents, nil
|
||||||
|
@ -89,13 +88,11 @@ func findContentUsingCustomRules(page io.Reader, rules string) (string, error) {
|
||||||
|
|
||||||
func getPredefinedScraperRules(websiteURL string) string {
|
func getPredefinedScraperRules(websiteURL string) string {
|
||||||
urlDomain := urllib.Domain(websiteURL)
|
urlDomain := urllib.Domain(websiteURL)
|
||||||
|
urlDomain = strings.TrimPrefix(urlDomain, "www.")
|
||||||
|
|
||||||
for domain, rules := range predefinedRules {
|
if rules, ok := predefinedRules[urlDomain]; ok {
|
||||||
if strings.Contains(urlDomain, domain) {
|
return rules
|
||||||
return rules
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
return ""
|
return ""
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -19,6 +19,10 @@ func TestGetPredefinedRules(t *testing.T) {
|
||||||
t.Error("Unable to find rule for linux.com")
|
t.Error("Unable to find rule for linux.com")
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if getPredefinedScraperRules("https://linux.com/") == "" {
|
||||||
|
t.Error("Unable to find rule for linux.com")
|
||||||
|
}
|
||||||
|
|
||||||
if getPredefinedScraperRules("https://example.org/") != "" {
|
if getPredefinedScraperRules("https://example.org/") != "" {
|
||||||
t.Error("A rule not defined should not return anything")
|
t.Error("A rule not defined should not return anything")
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in a new issue