Allow the scraper to parse XHTML documents
Previously, only "text/html" was accepted.
This commit is contained in:
parent
1ff9950a55
commit
3b6e44c331
2 changed files with 28 additions and 1 deletions
|
@ -34,7 +34,7 @@ func Fetch(websiteURL, rules, userAgent string) (string, error) {
|
|||
return "", errors.New("scraper: unable to download web page")
|
||||
}
|
||||
|
||||
if !strings.Contains(response.ContentType, "text/html") {
|
||||
if !isWhitelistedContentType(response.ContentType) {
|
||||
return "", fmt.Errorf("scraper: this resource is not a HTML document (%s)", response.ContentType)
|
||||
}
|
||||
|
||||
|
@ -99,3 +99,9 @@ func getPredefinedScraperRules(websiteURL string) string {
|
|||
|
||||
return ""
|
||||
}
|
||||
|
||||
// isWhitelistedContentType reports whether the given Content-Type header
// value identifies an HTML or XHTML document. The comparison is
// case-insensitive and tolerates trailing parameters such as a charset.
func isWhitelistedContentType(contentType string) bool {
	normalized := strings.ToLower(contentType)
	for _, prefix := range []string{"text/html", "application/xhtml+xml"} {
		if strings.HasPrefix(normalized, prefix) {
			return true
		}
	}
	return false
}
|
||||
|
|
|
@ -19,3 +19,24 @@ func TestGetPredefinedRules(t *testing.T) {
|
|||
t.Error("A rule not defined should not return anything")
|
||||
}
|
||||
}
|
||||
|
||||
func TestWhitelistedContentTypes(t *testing.T) {
|
||||
scenarios := map[string]bool{
|
||||
"text/html": true,
|
||||
"TeXt/hTmL": true,
|
||||
"application/xhtml+xml": true,
|
||||
"text/html; charset=utf-8": true,
|
||||
"application/xhtml+xml; charset=utf-8": true,
|
||||
"text/css": false,
|
||||
"application/javascript": false,
|
||||
"image/png": false,
|
||||
"application/pdf": false,
|
||||
}
|
||||
|
||||
for inputValue, expectedResult := range scenarios {
|
||||
actualResult := isWhitelistedContentType(inputValue)
|
||||
if actualResult != expectedResult {
|
||||
t.Errorf(`Unexpected result for content type whitelist, got "%v" instead of "%v"`, actualResult, expectedResult)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in a new issue