Minor internal/reader/readability/readability.go speedup

- Don't use a capturing group in `divToPElementsRegexp`
- Remove a duplicate condition
- Replace the sentence regex with a fixed comparison of the last byte and a `strings.Contains` call
This commit is contained in:
jvoisin 2024-02-29 04:01:17 +01:00 committed by Frédéric Guillot
parent f12d5131b0
commit 4db138d4b8

View file

@ -21,8 +21,7 @@ const (
)
var (
divToPElementsRegexp = regexp.MustCompile(`(?i)<(a|blockquote|dl|div|img|ol|p|pre|table|ul)`)
sentenceRegexp = regexp.MustCompile(`\.( |$)`)
divToPElementsRegexp = regexp.MustCompile(`(?i)<(?:a|blockquote|dl|div|img|ol|p|pre|table|ul)`)
blacklistCandidatesRegexp = regexp.MustCompile(`(?i)popupbody|-ad|g-plus`)
okMaybeItsACandidateRegexp = regexp.MustCompile(`(?i)and|article|body|column|main|shadow`)
@ -114,9 +113,11 @@ func getArticle(topCandidate *candidate, candidates candidateList) string {
content := s.Text()
contentLength := len(content)
if contentLength >= 80 && linkDensity < .25 {
if contentLength >= 80 {
if linkDensity < .25 {
append = true
} else if contentLength < 80 && linkDensity == 0 && sentenceRegexp.MatchString(content) {
}
} else if linkDensity == 0 && (content[len(content)-1] == '.' || strings.Contains(content, ". ")) {
append = true
}
}