From 97765b93a964e65aab2106a1132637f7661ed4ae Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fr=C3=A9d=C3=A9ric=20Guillot?= Date: Thu, 29 Feb 2024 18:34:15 -0800 Subject: [PATCH] Revert "Minor internal/reader/readability/readability.go speedup" This reverts commit 4db138d4b87c988eed6dbe2fc72cf1a13d393d8b. The reverted change evaluated content[len(content)-1] without first checking that content is non-empty, so an element whose text is empty caused the following panic: ``` panic: runtime error: index out of range [-1] goroutine 49 [running]: miniflux.app/v2/internal/reader/readability.getArticle.func1(0x8?, 0xc000b56570) /home/fred/repos/miniflux/v2/internal/reader/readability/readability.go:120 +0x2ac github.com/PuerkitoBio/goquery.(*Selection).Each(0xc000b56510, 0xc000892fa8) /home/fred/go/pkg/mod/github.com/!puerkito!bio/goquery@v1.9.0/iteration.go:10 +0x62 miniflux.app/v2/internal/reader/readability.getArticle(0xc00044f1f0, 0xc000a04a50) /home/fred/repos/miniflux/v2/internal/reader/readability/readability.go:101 +0x15d miniflux.app/v2/internal/reader/readability.ExtractContent({0x1005d00?, 0xc0001522d0?}) /home/fred/repos/miniflux/v2/internal/reader/readability/readability.go:91 +0x211 miniflux.app/v2/internal/reader/scraper.ScrapeWebsite(0xc000893688?, {0xc0007ce720, 0x54}, {0x0, 0x0}) /home/fred/repos/miniflux/v2/internal/reader/scraper/scraper.go:63 +0x859 miniflux.app/v2/internal/reader/processor.ProcessFeedEntries(0xc000133188, 0xc000502c40, 0xc0003e6360, 0x0) /home/fred/repos/miniflux/v2/internal/reader/processor/processor.go:77 +0x8ea miniflux.app/v2/internal/reader/handler.RefreshFeed(0xc000133188, 0x10cf, 0x52d5c, 0x0) /home/fred/repos/miniflux/v2/internal/reader/handler/handler.go:301 +0x1485 miniflux.app/v2/internal/cli.refreshFeeds.func1(0x0) /home/fred/repos/miniflux/v2/internal/cli/refresh_feeds.go:59 +0x2d7 created by miniflux.app/v2/internal/cli.refreshFeeds in goroutine 1 /home/fred/repos/miniflux/v2/internal/cli/refresh_feeds.go:50 +0x5d5 ``` --- internal/reader/readability/readability.go | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/internal/reader/readability/readability.go 
b/internal/reader/readability/readability.go index 443f2138..ec127bca 100644 --- a/internal/reader/readability/readability.go +++ b/internal/reader/readability/readability.go @@ -21,7 +21,8 @@ const ( ) var ( - divToPElementsRegexp = regexp.MustCompile(`(?i)<(?:a|blockquote|dl|div|img|ol|p|pre|table|ul)`) + divToPElementsRegexp = regexp.MustCompile(`(?i)<(a|blockquote|dl|div|img|ol|p|pre|table|ul)`) + sentenceRegexp = regexp.MustCompile(`\.( |$)`) blacklistCandidatesRegexp = regexp.MustCompile(`(?i)popupbody|-ad|g-plus`) okMaybeItsACandidateRegexp = regexp.MustCompile(`(?i)and|article|body|column|main|shadow`) @@ -113,11 +114,9 @@ func getArticle(topCandidate *candidate, candidates candidateList) string { content := s.Text() contentLength := len(content) - if contentLength >= 80 { - if linkDensity < .25 { - append = true - } - } else if linkDensity == 0 && (content[len(content)-1] == '.' || strings.Contains(content, ". ")) { + if contentLength >= 80 && linkDensity < .25 { + append = true + } else if contentLength < 80 && linkDensity == 0 && sentenceRegexp.MatchString(content) { append = true } }