Minor simplifications of the rewriter

- Inline some one-line functions
- Transform a free-standing function into a method
- Massively simplify `removeClickbait`
- Use a proper constant instead of a magic number in `applyFuncOnTextContent`
This commit is contained in:
jvoisin 2024-03-17 18:38:57 +01:00 committed by Frédéric Guillot
parent 02a074ed26
commit c29ca0e313
2 changed files with 51 additions and 68 deletions

View file

@ -14,6 +14,8 @@ import (
"miniflux.app/v2/internal/config"
nethtml "golang.org/x/net/html"
"github.com/PuerkitoBio/goquery"
"github.com/yuin/goldmark"
goldmarkhtml "github.com/yuin/goldmark/renderer/html"
@ -301,10 +303,6 @@ func replaceTextLinks(input string) string {
return textLinkRegex.ReplaceAllString(input, `<a href="${1}">${1}</a>`)
}
// replaceLineFeeds returns input with every "\n" character replaced by an
// HTML <br> tag. Carriage returns and all other characters are left as is.
func replaceLineFeeds(input string) string {
	lines := strings.Split(input, "\n")
	return strings.Join(lines, "<br>")
}
func replaceCustom(entryContent string, searchTerm string, replaceTerm string) string {
re, err := regexp.Compile(searchTerm)
if err == nil {
@ -334,7 +332,7 @@ func addCastopodEpisode(entryURL, entryContent string) string {
func applyFuncOnTextContent(entryContent string, selector string, repl func(string) string) string {
var treatChildren func(i int, s *goquery.Selection)
treatChildren = func(i int, s *goquery.Selection) {
if s.Nodes[0].Type == 1 {
if s.Nodes[0].Type == nethtml.TextNode {
s.ReplaceWithHtml(repl(s.Nodes[0].Data))
} else {
s.Contents().Each(treatChildren)
@ -457,17 +455,3 @@ func removeTables(entryContent string) string {
output, _ := doc.Find("body").First().Html()
return output
}
// removeClickbait tones down shouting titles. The title is split on
// whitespace; in every word made of more than one rune the first rune is kept
// unchanged (so an initial capital survives) and the rest is lowercased.
// Words are joined back with single spaces, which collapses whitespace runs
// and drops leading and trailing whitespace.
func removeClickbait(entryTitle string) string {
	words := strings.Fields(entryTitle)
	for idx, word := range words {
		letters := []rune(word)
		if len(letters) <= 1 {
			// Single-rune words are kept exactly as they were.
			continue
		}
		head, tail := string(letters[:1]), string(letters[1:])
		words[idx] = head + strings.ToLower(tail)
	}
	return strings.Join(words, " ")
}

View file

@ -11,6 +11,9 @@ import (
"miniflux.app/v2/internal/model"
"miniflux.app/v2/internal/urllib"
"golang.org/x/text/cases"
"golang.org/x/text/language"
)
type rule struct {
@ -18,50 +21,7 @@ type rule struct {
args []string
}
// Rewriter modifies the given entry by running a list of rewrite rules on it.
//
// The rule list comes from getPredefinedRewriteRules(entryURL) unless
// customRewriteRules is non-empty, in which case the custom text is used
// instead. The "add_pdf_download_link" rule is always appended last. The rule
// set is logged at debug level before any rule is applied.
func Rewriter(entryURL string, entry *model.Entry, customRewriteRules string) {
	rulesText := getPredefinedRewriteRules(entryURL)
	if customRewriteRules != "" {
		rulesText = customRewriteRules
	}

	rewriteRules := append(parseRules(rulesText), rule{name: "add_pdf_download_link"})

	slog.Debug("Rewrite rules applied",
		slog.Any("rules", rewriteRules),
		slog.String("entry_url", entryURL),
	)

	for idx := range rewriteRules {
		applyRule(entryURL, entry, rewriteRules[idx])
	}
}
// parseRules tokenizes rulesText into a list of rules. Every identifier
// starts a new rule carrying that name; every double-quoted string literal is
// unquoted and appended to the arguments of the most recent rule. A string
// that appears before any identifier is dropped, and all other tokens are
// ignored. The result is nil when no identifier is found.
func parseRules(rulesText string) (rules []rule) {
	var lexer scanner.Scanner
	lexer.Init(strings.NewReader(rulesText))
	// Init resets Mode to scanner.GoTokens, so a Mode assigned before Init
	// would be discarded. The effective mode is spelled out here.
	lexer.Mode = scanner.GoTokens

	for tok := lexer.Scan(); tok != scanner.EOF; tok = lexer.Scan() {
		switch tok {
		case scanner.Ident:
			rules = append(rules, rule{name: lexer.TokenText()})
		case scanner.String:
			if len(rules) == 0 {
				// No rule yet to attach this argument to.
				continue
			}
			// The Unquote error is discarded: a literal that cannot be
			// unquoted is recorded as an empty argument.
			arg, _ := strconv.Unquote(lexer.TokenText())
			current := &rules[len(rules)-1]
			current.args = append(current.args, arg)
		}
	}
	return rules
}
func applyRule(entryURL string, entry *model.Entry, rule rule) {
func (rule rule) applyRule(entryURL string, entry *model.Entry) {
switch rule.name {
case "add_image_title":
entry.Content = addImageTitle(entryURL, entry.Content)
@ -82,7 +42,7 @@ func applyRule(entryURL string, entry *model.Entry, rule rule) {
case "add_pdf_download_link":
entry.Content = addPDFLink(entryURL, entry.Content)
case "nl2br":
entry.Content = replaceLineFeeds(entry.Content)
entry.Content = strings.ReplaceAll(entry.Content, "\n", "<br>")
case "convert_text_link", "convert_text_links":
entry.Content = replaceTextLinks(entry.Content)
case "fix_medium_images":
@ -122,11 +82,11 @@ func applyRule(entryURL string, entry *model.Entry, rule rule) {
case "add_castopod_episode":
entry.Content = addCastopodEpisode(entryURL, entry.Content)
case "base64_decode":
selector := "body"
if len(rule.args) >= 1 {
entry.Content = applyFuncOnTextContent(entry.Content, rule.args[0], decodeBase64Content)
} else {
entry.Content = applyFuncOnTextContent(entry.Content, "body", decodeBase64Content)
selector = rule.args[0]
}
entry.Content = applyFuncOnTextContent(entry.Content, selector, decodeBase64Content)
case "add_hn_links_using_hack":
entry.Content = addHackerNewsLinksUsing(entry.Content, "hack")
case "add_hn_links_using_opener":
@ -136,7 +96,46 @@ func applyRule(entryURL string, entry *model.Entry, rule rule) {
case "remove_tables":
entry.Content = removeTables(entry.Content)
case "remove_clickbait":
entry.Title = removeClickbait(entry.Title)
entry.Title = cases.Title(language.English).String(strings.ToLower(entry.Title))
}
}
// Rewriter modifies the given entry by running a list of rewrite rules on it.
//
// The rule list comes from getPredefinedRewriteRules(entryURL) unless
// customRewriteRules is non-empty, in which case the custom text is used
// instead. The "add_pdf_download_link" rule is always appended last. The rule
// set is logged at debug level before any rule is applied.
func Rewriter(entryURL string, entry *model.Entry, customRewriteRules string) {
	rulesText := getPredefinedRewriteRules(entryURL)
	if customRewriteRules != "" {
		rulesText = customRewriteRules
	}

	rewriteRules := append(parseRules(rulesText), rule{name: "add_pdf_download_link"})

	slog.Debug("Rewrite rules applied",
		slog.Any("rules", rewriteRules),
		slog.String("entry_url", entryURL),
	)

	for idx := range rewriteRules {
		rewriteRules[idx].applyRule(entryURL, entry)
	}
}
// parseRules tokenizes rulesText into a list of rules. Every identifier
// starts a new rule carrying that name; every double-quoted string literal is
// unquoted and appended to the arguments of the most recent rule. A string
// that appears before any identifier is dropped, and all other tokens are
// ignored. The result is nil when no identifier is found.
func parseRules(rulesText string) (rules []rule) {
	var lexer scanner.Scanner
	lexer.Init(strings.NewReader(rulesText))
	// Init resets Mode to scanner.GoTokens, so a Mode assigned before Init
	// would be discarded. The effective mode is spelled out here.
	lexer.Mode = scanner.GoTokens

	for tok := lexer.Scan(); tok != scanner.EOF; tok = lexer.Scan() {
		switch tok {
		case scanner.Ident:
			rules = append(rules, rule{name: lexer.TokenText()})
		case scanner.String:
			if len(rules) == 0 {
				// No rule yet to attach this argument to.
				continue
			}
			// The Unquote error is discarded: a literal that cannot be
			// unquoted is recorded as an empty argument.
			arg, _ := strconv.Unquote(lexer.TokenText())
			current := &rules[len(rules)-1]
			current.args = append(current.args, arg)
		}
	}
	return rules
}