Slightly improve internal/reader/scraper/scraper.go
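- Make findContentUsingCustomRules more idiomatic: in Go, a function that returns a non-nil error may return garbage in its other return values, so the content is now only used when the error is nil. Moreover, ignoring errors is bad practice.
- getPredefinedScraperRules now runs in constant time, using a direct map lookup instead of iterating over around 50 entries. (Note: the lookup is now an exact match on the domain rather than a substring match.)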
This commit is contained in:
parent
5b2558bf92
commit
c2d2f31438
2 changed files with 10 additions and 9 deletions
|
@ -78,10 +78,9 @@ func findContentUsingCustomRules(page io.Reader, rules string) (string, error) {
|
|||
|
||||
contents := ""
|
||||
document.Find(rules).Each(func(i int, s *goquery.Selection) {
|
||||
var content string
|
||||
|
||||
content, _ = goquery.OuterHtml(s)
|
||||
contents += content
|
||||
if content, err := goquery.OuterHtml(s); err == nil {
|
||||
contents += content
|
||||
}
|
||||
})
|
||||
|
||||
return contents, nil
|
||||
|
@ -89,13 +88,11 @@ func findContentUsingCustomRules(page io.Reader, rules string) (string, error) {
|
|||
|
||||
func getPredefinedScraperRules(websiteURL string) string {
|
||||
urlDomain := urllib.Domain(websiteURL)
|
||||
urlDomain = strings.TrimPrefix(urlDomain, "www.")
|
||||
|
||||
for domain, rules := range predefinedRules {
|
||||
if strings.Contains(urlDomain, domain) {
|
||||
return rules
|
||||
}
|
||||
if rules, ok := predefinedRules[urlDomain]; ok {
|
||||
return rules
|
||||
}
|
||||
|
||||
return ""
|
||||
}
|
||||
|
||||
|
|
|
@ -19,6 +19,10 @@ func TestGetPredefinedRules(t *testing.T) {
|
|||
t.Error("Unable to find rule for linux.com")
|
||||
}
|
||||
|
||||
if getPredefinedScraperRules("https://linux.com/") == "" {
|
||||
t.Error("Unable to find rule for linux.com")
|
||||
}
|
||||
|
||||
if getPredefinedScraperRules("https://example.org/") != "" {
|
||||
t.Error("A rule not defined should not return anything")
|
||||
}
|
||||
|
|
Loading…
Reference in a new issue