Add a rewrite rule to remove clickbait titles

This commit is contained in:
Romain de Laage 2023-04-08 11:02:36 +02:00 committed by Frédéric Guillot
parent 8161085714
commit 33c4b5188c
4 changed files with 379 additions and 170 deletions

View file

@ -85,7 +85,7 @@ func ProcessFeedEntries(store *storage.Storage, feed *model.Feed, user *model.Us
}
}
entry.Content = rewrite.Rewriter(url, entry.Content, feed.RewriteRules)
rewrite.Rewriter(url, entry, feed.RewriteRules)
// The sanitizer should always run at the end of the process to make sure unsafe HTML is filtered.
entry.Content = sanitizer.Sanitize(url, entry.Content)
@ -168,14 +168,14 @@ func ProcessEntryWebPage(feed *model.Feed, entry *model.Entry, user *model.User)
return scraperErr
}
content = rewrite.Rewriter(url, content, entry.Feed.RewriteRules)
content = sanitizer.Sanitize(url, content)
if content != "" {
entry.Content = content
entry.ReadingTime = calculateReadingTime(content, user)
}
rewrite.Rewriter(url, entry, entry.Feed.RewriteRules)
entry.Content = sanitizer.Sanitize(url, entry.Content)
return nil
}

View file

@ -367,3 +367,17 @@ func removeTables(entryContent string) string {
output, _ := doc.Find("body").First().Html()
return output
}
// removeClickbait tones down shouting titles such as "THIS IS AMAZING".
//
// The title is split on whitespace; for every word longer than one rune the
// first rune is kept untouched and the remainder is lower-cased. The words are
// then joined back with single spaces, so leading, trailing and repeated
// whitespace in the input is collapsed.
func removeClickbait(entryTitle string) string {
	// strings.Fields returns a fresh slice, so it can be rewritten in place.
	words := strings.Fields(entryTitle)
	for i, w := range words {
		chars := []rune(w)
		if len(chars) <= 1 {
			// Single-rune words are left exactly as they were.
			continue
		}
		// Preserve the leading rune so an initial capital letter survives.
		words[i] = string(chars[:1]) + strings.ToLower(string(chars[1:]))
	}
	return strings.Join(words, " ")
}

View file

@ -10,6 +10,7 @@ import (
"text/scanner"
"miniflux.app/logger"
"miniflux.app/model"
"miniflux.app/url"
)
@ -19,7 +20,7 @@ type rule struct {
}
// Rewriter modifies item contents with a set of rewriting rules.
func Rewriter(entryURL, entryContent, customRewriteRules string) string {
func Rewriter(entryURL string, entry *model.Entry, customRewriteRules string) {
rulesList := getPredefinedRewriteRules(entryURL)
if customRewriteRules != "" {
rulesList = customRewriteRules
@ -31,10 +32,8 @@ func Rewriter(entryURL, entryContent, customRewriteRules string) string {
logger.Debug(`[Rewrite] Applying rules %v for %q`, rules, entryURL)
for _, rule := range rules {
entryContent = applyRule(entryURL, entryContent, rule)
applyRule(entryURL, entry, rule)
}
return entryContent
}
func parseRules(rulesText string) (rules []rule) {
@ -60,61 +59,61 @@ func parseRules(rulesText string) (rules []rule) {
}
}
func applyRule(entryURL, entryContent string, rule rule) string {
func applyRule(entryURL string, entry *model.Entry, rule rule) {
switch rule.name {
case "add_image_title":
entryContent = addImageTitle(entryURL, entryContent)
entry.Content = addImageTitle(entryURL, entry.Content)
case "add_mailto_subject":
entryContent = addMailtoSubject(entryURL, entryContent)
entry.Content = addMailtoSubject(entryURL, entry.Content)
case "add_dynamic_image":
entryContent = addDynamicImage(entryURL, entryContent)
entry.Content = addDynamicImage(entryURL, entry.Content)
case "add_youtube_video":
entryContent = addYoutubeVideo(entryURL, entryContent)
entry.Content = addYoutubeVideo(entryURL, entry.Content)
case "add_invidious_video":
entryContent = addInvidiousVideo(entryURL, entryContent)
entry.Content = addInvidiousVideo(entryURL, entry.Content)
case "add_youtube_video_using_invidious_player":
entryContent = addYoutubeVideoUsingInvidiousPlayer(entryURL, entryContent)
entry.Content = addYoutubeVideoUsingInvidiousPlayer(entryURL, entry.Content)
case "add_youtube_video_from_id":
entryContent = addYoutubeVideoFromId(entryContent)
entry.Content = addYoutubeVideoFromId(entry.Content)
case "add_pdf_download_link":
entryContent = addPDFLink(entryURL, entryContent)
entry.Content = addPDFLink(entryURL, entry.Content)
case "nl2br":
entryContent = replaceLineFeeds(entryContent)
entry.Content = replaceLineFeeds(entry.Content)
case "convert_text_link", "convert_text_links":
entryContent = replaceTextLinks(entryContent)
entry.Content = replaceTextLinks(entry.Content)
case "fix_medium_images":
entryContent = fixMediumImages(entryURL, entryContent)
entry.Content = fixMediumImages(entryURL, entry.Content)
case "use_noscript_figure_images":
entryContent = useNoScriptImages(entryURL, entryContent)
entry.Content = useNoScriptImages(entryURL, entry.Content)
case "replace":
// Format: replace("search-term"|"replace-term")
if len(rule.args) >= 2 {
entryContent = replaceCustom(entryContent, rule.args[0], rule.args[1])
entry.Content = replaceCustom(entry.Content, rule.args[0], rule.args[1])
} else {
logger.Debug("[Rewrite] Cannot find search and replace terms for replace rule %s", rule)
}
case "remove":
// Format: remove("#selector > .element, .another")
if len(rule.args) >= 1 {
entryContent = removeCustom(entryContent, rule.args[0])
entry.Content = removeCustom(entry.Content, rule.args[0])
} else {
logger.Debug("[Rewrite] Cannot find selector for remove rule %s", rule)
}
case "add_castopod_episode":
entryContent = addCastopodEpisode(entryURL, entryContent)
entry.Content = addCastopodEpisode(entryURL, entry.Content)
case "base64_decode":
if len(rule.args) >= 1 {
entryContent = applyFuncOnTextContent(entryContent, rule.args[0], decodeBase64Content)
entry.Content = applyFuncOnTextContent(entry.Content, rule.args[0], decodeBase64Content)
} else {
entryContent = applyFuncOnTextContent(entryContent, "body", decodeBase64Content)
entry.Content = applyFuncOnTextContent(entry.Content, "body", decodeBase64Content)
}
case "parse_markdown":
entryContent = parseMarkdown(entryContent)
entry.Content = parseMarkdown(entry.Content)
case "remove_tables":
entryContent = removeTables(entryContent)
entry.Content = removeTables(entry.Content)
case "remove_clickbait":
entry.Title = removeClickbait(entry.Title)
}
return entryContent
}
func getPredefinedRewriteRules(entryURL string) string {

View file

@ -8,6 +8,8 @@ import (
"reflect"
"strings"
"testing"
"miniflux.app/model"
)
func TestParseRules(t *testing.T) {
@ -46,178 +48,301 @@ func TestReplaceTextLinks(t *testing.T) {
}
func TestRewriteWithNoMatchingRule(t *testing.T) {
output := Rewriter("https://example.org/article", `Some text.`, ``)
expected := `Some text.`
controlEntry := &model.Entry{
Title: `A title`,
Content: `Some text.`,
}
testEntry := &model.Entry{
Title: `A title`,
Content: `Some text.`,
}
Rewriter("https://example.org/article", testEntry, ``)
if expected != output {
t.Errorf(`Not expected output: got "%s" instead of "%s"`, output, expected)
if !reflect.DeepEqual(testEntry, controlEntry) {
t.Errorf(`Not expected output: got "%+v" instead of "%+v"`, testEntry, controlEntry)
}
}
func TestRewriteWithYoutubeLink(t *testing.T) {
output := Rewriter("https://www.youtube.com/watch?v=1234", "Video Description", ``)
expected := `<iframe width="650" height="350" frameborder="0" src="https://www.youtube-nocookie.com/embed/1234" allowfullscreen></iframe><br>Video Description`
controlEntry := &model.Entry{
Title: `A title`,
Content: `<iframe width="650" height="350" frameborder="0" src="https://www.youtube-nocookie.com/embed/1234" allowfullscreen></iframe><br>Video Description`,
}
testEntry := &model.Entry{
Title: `A title`,
Content: `Video Description`,
}
Rewriter("https://www.youtube.com/watch?v=1234", testEntry, ``)
if expected != output {
t.Errorf(`Not expected output: got "%s" instead of "%s"`, output, expected)
if !reflect.DeepEqual(testEntry, controlEntry) {
t.Errorf(`Not expected output: got "%+v" instead of "%+v"`, testEntry, controlEntry)
}
}
func TestRewriteWithInexistingCustomRule(t *testing.T) {
output := Rewriter("https://www.youtube.com/watch?v=1234", `Video Description`, `some rule`)
expected := `Video Description`
if expected != output {
t.Errorf(`Not expected output: got "%s" instead of "%s"`, output, expected)
controlEntry := &model.Entry{
Title: `A title`,
Content: `Video Description`,
}
testEntry := &model.Entry{
Title: `A title`,
Content: `Video Description`,
}
Rewriter("https://www.youtube.com/watch?v=1234", testEntry, `some rule`)
if !reflect.DeepEqual(testEntry, controlEntry) {
t.Errorf(`Not expected output: got "%+v" instead of "%+v"`, testEntry, controlEntry)
}
}
func TestRewriteWithXkcdLink(t *testing.T) {
description := `<img src="https://imgs.xkcd.com/comics/thermostat.png" title="Your problem is so terrible, I worry that, if I help you, I risk drawing the attention of whatever god of technology inflicted it on you." alt="Your problem is so terrible, I worry that, if I help you, I risk drawing the attention of whatever god of technology inflicted it on you." />`
output := Rewriter("https://xkcd.com/1912/", description, ``)
expected := `<figure><img src="https://imgs.xkcd.com/comics/thermostat.png" alt="Your problem is so terrible, I worry that, if I help you, I risk drawing the attention of whatever god of technology inflicted it on you."/><figcaption><p>Your problem is so terrible, I worry that, if I help you, I risk drawing the attention of whatever god of technology inflicted it on you.</p></figcaption></figure>`
if expected != output {
t.Errorf(`Not expected output: got "%s" instead of "%s"`, output, expected)
controlEntry := &model.Entry{
Title: `A title`,
Content: `<figure><img src="https://imgs.xkcd.com/comics/thermostat.png" alt="Your problem is so terrible, I worry that, if I help you, I risk drawing the attention of whatever god of technology inflicted it on you."/><figcaption><p>Your problem is so terrible, I worry that, if I help you, I risk drawing the attention of whatever god of technology inflicted it on you.</p></figcaption></figure>`,
}
testEntry := &model.Entry{
Title: `A title`,
Content: `<img src="https://imgs.xkcd.com/comics/thermostat.png" title="Your problem is so terrible, I worry that, if I help you, I risk drawing the attention of whatever god of technology inflicted it on you." alt="Your problem is so terrible, I worry that, if I help you, I risk drawing the attention of whatever god of technology inflicted it on you." />`,
}
Rewriter("https://xkcd.com/1912/", testEntry, ``)
if !reflect.DeepEqual(testEntry, controlEntry) {
t.Errorf(`Not expected output: got "%+v" instead of "%+v"`, testEntry, controlEntry)
}
}
func TestRewriteWithXkcdLinkHtmlInjection(t *testing.T) {
description := `<img src="https://imgs.xkcd.com/comics/thermostat.png" title="<foo>" alt="<foo>" />`
output := Rewriter("https://xkcd.com/1912/", description, ``)
expected := `<figure><img src="https://imgs.xkcd.com/comics/thermostat.png" alt="&lt;foo&gt;"/><figcaption><p>&lt;foo&gt;</p></figcaption></figure>`
if expected != output {
t.Errorf(`Not expected output: got "%s" instead of "%s"`, output, expected)
controlEntry := &model.Entry{
Title: `A title`,
Content: `<figure><img src="https://imgs.xkcd.com/comics/thermostat.png" alt="&lt;foo&gt;"/><figcaption><p>&lt;foo&gt;</p></figcaption></figure>`,
}
testEntry := &model.Entry{
Title: `A title`,
Content: `<img src="https://imgs.xkcd.com/comics/thermostat.png" title="<foo>" alt="<foo>" />`,
}
Rewriter("https://xkcd.com/1912/", testEntry, ``)
if !reflect.DeepEqual(testEntry, controlEntry) {
t.Errorf(`Not expected output: got "%+v" instead of "%+v"`, testEntry, controlEntry)
}
}
func TestRewriteWithXkcdLinkAndImageNoTitle(t *testing.T) {
description := `<img src="https://imgs.xkcd.com/comics/thermostat.png" alt="Your problem is so terrible, I worry that, if I help you, I risk drawing the attention of whatever god of technology inflicted it on you." />`
output := Rewriter("https://xkcd.com/1912/", description, ``)
expected := description
if expected != output {
t.Errorf(`Not expected output: got "%s" instead of "%s"`, output, expected)
controlEntry := &model.Entry{
Title: `A title`,
Content: `<img src="https://imgs.xkcd.com/comics/thermostat.png" alt="Your problem is so terrible, I worry that, if I help you, I risk drawing the attention of whatever god of technology inflicted it on you." />`,
}
testEntry := &model.Entry{
Title: `A title`,
Content: `<img src="https://imgs.xkcd.com/comics/thermostat.png" alt="Your problem is so terrible, I worry that, if I help you, I risk drawing the attention of whatever god of technology inflicted it on you." />`,
}
Rewriter("https://xkcd.com/1912/", testEntry, ``)
if !reflect.DeepEqual(testEntry, controlEntry) {
t.Errorf(`Not expected output: got "%+v" instead of "%+v"`, testEntry, controlEntry)
}
}
func TestRewriteWithXkcdLinkAndNoImage(t *testing.T) {
description := "test"
output := Rewriter("https://xkcd.com/1912/", description, ``)
expected := description
if expected != output {
t.Errorf(`Not expected output: got "%s" instead of "%s"`, output, expected)
controlEntry := &model.Entry{
Title: `A title`,
Content: `test`,
}
testEntry := &model.Entry{
Title: `A title`,
Content: `test`,
}
Rewriter("https://xkcd.com/1912/", testEntry, ``)
if !reflect.DeepEqual(testEntry, controlEntry) {
t.Errorf(`Not expected output: got "%+v" instead of "%+v"`, testEntry, controlEntry)
}
}
func TestRewriteWithXkcdAndNoImage(t *testing.T) {
description := "test"
output := Rewriter("https://xkcd.com/1912/", description, ``)
expected := description
controlEntry := &model.Entry{
Title: `A title`,
Content: `test`,
}
testEntry := &model.Entry{
Title: `A title`,
Content: `test`,
}
Rewriter("https://xkcd.com/1912/", testEntry, ``)
if expected != output {
t.Errorf(`Not expected output: got "%s" instead of "%s"`, output, expected)
if !reflect.DeepEqual(testEntry, controlEntry) {
t.Errorf(`Not expected output: got "%+v" instead of "%+v"`, testEntry, controlEntry)
}
}
func TestRewriteMailtoLink(t *testing.T) {
description := `<a href="mailto:ryan@qwantz.com?subject=blah%20blah">contact</a>`
output := Rewriter("https://www.qwantz.com/", description, ``)
expected := `<a href="mailto:ryan@qwantz.com?subject=blah%20blah">contact [blah blah]</a>`
if expected != output {
t.Errorf(`Not expected output: got "%s" instead of "%s"`, output, expected)
controlEntry := &model.Entry{
Title: `A title`,
Content: `<a href="mailto:ryan@qwantz.com?subject=blah%20blah">contact [blah blah]</a>`,
}
testEntry := &model.Entry{
Title: `A title`,
Content: `<a href="mailto:ryan@qwantz.com?subject=blah%20blah">contact</a>`,
}
Rewriter("https://www.qwantz.com/", testEntry, ``)
if !reflect.DeepEqual(testEntry, controlEntry) {
t.Errorf(`Not expected output: got "%+v" instead of "%+v"`, testEntry, controlEntry)
}
}
func TestRewriteWithPDFLink(t *testing.T) {
description := "test"
output := Rewriter("https://example.org/document.pdf", description, ``)
expected := `<a href="https://example.org/document.pdf">PDF</a><br>test`
controlEntry := &model.Entry{
Title: `A title`,
Content: `<a href="https://example.org/document.pdf">PDF</a><br>test`,
}
testEntry := &model.Entry{
Title: `A title`,
Content: `test`,
}
Rewriter("https://example.org/document.pdf", testEntry, ``)
if expected != output {
t.Errorf(`Not expected output: got "%s" instead of "%s"`, output, expected)
if !reflect.DeepEqual(testEntry, controlEntry) {
t.Errorf(`Not expected output: got "%+v" instead of "%+v"`, testEntry, controlEntry)
}
}
func TestRewriteWithNoLazyImage(t *testing.T) {
description := `<img src="https://example.org/image.jpg" alt="Image"><noscript><p>Some text</p></noscript>`
output := Rewriter("https://example.org/article", description, "add_dynamic_image")
expected := description
controlEntry := &model.Entry{
Title: `A title`,
Content: `<img src="https://example.org/image.jpg" alt="Image"><noscript><p>Some text</p></noscript>`,
}
testEntry := &model.Entry{
Title: `A title`,
Content: `<img src="https://example.org/image.jpg" alt="Image"><noscript><p>Some text</p></noscript>`,
}
Rewriter("https://example.org/article", testEntry, "add_dynamic_image")
if expected != output {
t.Errorf(`Not expected output: got "%s" instead of "%s"`, output, expected)
if !reflect.DeepEqual(testEntry, controlEntry) {
t.Errorf(`Not expected output: got "%+v" instead of "%+v"`, testEntry, controlEntry)
}
}
func TestRewriteWithLazyImage(t *testing.T) {
description := `<img src="" data-url="https://example.org/image.jpg" alt="Image"><noscript><img src="https://example.org/fallback.jpg" alt="Fallback"></noscript>`
output := Rewriter("https://example.org/article", description, "add_dynamic_image")
expected := `<img src="https://example.org/image.jpg" data-url="https://example.org/image.jpg" alt="Image"/><noscript><img src="https://example.org/fallback.jpg" alt="Fallback"></noscript>`
controlEntry := &model.Entry{
Title: `A title`,
Content: `<img src="https://example.org/image.jpg" data-url="https://example.org/image.jpg" alt="Image"/><noscript><img src="https://example.org/fallback.jpg" alt="Fallback"></noscript>`,
}
testEntry := &model.Entry{
Title: `A title`,
Content: `<img src="" data-url="https://example.org/image.jpg" alt="Image"><noscript><img src="https://example.org/fallback.jpg" alt="Fallback"></noscript>`,
}
Rewriter("https://example.org/article", testEntry, "add_dynamic_image")
if expected != output {
t.Errorf(`Not expected output: got "%s" instead of "%s"`, output, expected)
if !reflect.DeepEqual(testEntry, controlEntry) {
t.Errorf(`Not expected output: got "%+v" instead of "%+v"`, testEntry, controlEntry)
}
}
func TestRewriteWithLazyDivImage(t *testing.T) {
description := `<div data-url="https://example.org/image.jpg" alt="Image"></div><noscript><img src="https://example.org/fallback.jpg" alt="Fallback"></noscript>`
output := Rewriter("https://example.org/article", description, "add_dynamic_image")
expected := `<img src="https://example.org/image.jpg" alt="Image"/><noscript><img src="https://example.org/fallback.jpg" alt="Fallback"></noscript>`
controlEntry := &model.Entry{
Title: `A title`,
Content: `<img src="https://example.org/image.jpg" alt="Image"/><noscript><img src="https://example.org/fallback.jpg" alt="Fallback"></noscript>`,
}
testEntry := &model.Entry{
Title: `A title`,
Content: `<div data-url="https://example.org/image.jpg" alt="Image"></div><noscript><img src="https://example.org/fallback.jpg" alt="Fallback"></noscript>`,
}
Rewriter("https://example.org/article", testEntry, "add_dynamic_image")
if expected != output {
t.Errorf(`Not expected output: got "%s" instead of "%s"`, output, expected)
if !reflect.DeepEqual(testEntry, controlEntry) {
t.Errorf(`Not expected output: got "%+v" instead of "%+v"`, testEntry, controlEntry)
}
}
func TestRewriteWithUnknownLazyNoScriptImage(t *testing.T) {
description := `<img src="" data-non-candidate="https://example.org/image.jpg" alt="Image"><noscript><img src="https://example.org/fallback.jpg" alt="Fallback"></noscript>`
output := Rewriter("https://example.org/article", description, "add_dynamic_image")
expected := `<img src="" data-non-candidate="https://example.org/image.jpg" alt="Image"/><img src="https://example.org/fallback.jpg" alt="Fallback"/>`
controlEntry := &model.Entry{
Title: `A title`,
Content: `<img src="" data-non-candidate="https://example.org/image.jpg" alt="Image"/><img src="https://example.org/fallback.jpg" alt="Fallback"/>`,
}
testEntry := &model.Entry{
Title: `A title`,
Content: `<img src="" data-non-candidate="https://example.org/image.jpg" alt="Image"><noscript><img src="https://example.org/fallback.jpg" alt="Fallback"></noscript>`,
}
Rewriter("https://example.org/article", testEntry, "add_dynamic_image")
if expected != output {
t.Errorf(`Not expected output: got "%s" instead of "%s"`, output, expected)
if !reflect.DeepEqual(testEntry, controlEntry) {
t.Errorf(`Not expected output: got "%+v" instead of "%+v"`, testEntry, controlEntry)
}
}
func TestRewriteWithLazySrcset(t *testing.T) {
description := `<img srcset="" data-srcset="https://example.org/image.jpg" alt="Image">`
output := Rewriter("https://example.org/article", description, "add_dynamic_image")
expected := `<img srcset="https://example.org/image.jpg" data-srcset="https://example.org/image.jpg" alt="Image"/>`
controlEntry := &model.Entry{
Title: `A title`,
Content: `<img srcset="https://example.org/image.jpg" data-srcset="https://example.org/image.jpg" alt="Image"/>`,
}
testEntry := &model.Entry{
Title: `A title`,
Content: `<img srcset="" data-srcset="https://example.org/image.jpg" alt="Image">`,
}
Rewriter("https://example.org/article", testEntry, "add_dynamic_image")
if expected != output {
t.Errorf(`Not expected output: got "%s" instead of "%s"`, output, expected)
if !reflect.DeepEqual(testEntry, controlEntry) {
t.Errorf(`Not expected output: got "%+v" instead of "%+v"`, testEntry, controlEntry)
}
}
func TestRewriteWithImageAndLazySrcset(t *testing.T) {
description := `<img src="meow" srcset="" data-srcset="https://example.org/image.jpg" alt="Image">`
output := Rewriter("https://example.org/article", description, "add_dynamic_image")
expected := `<img src="meow" srcset="https://example.org/image.jpg" data-srcset="https://example.org/image.jpg" alt="Image"/>`
controlEntry := &model.Entry{
Title: `A title`,
Content: `<img src="meow" srcset="https://example.org/image.jpg" data-srcset="https://example.org/image.jpg" alt="Image"/>`,
}
testEntry := &model.Entry{
Title: `A title`,
Content: `<img src="meow" srcset="" data-srcset="https://example.org/image.jpg" alt="Image">`,
}
Rewriter("https://example.org/article", testEntry, "add_dynamic_image")
if expected != output {
t.Errorf(`Not expected output: got "%s" instead of "%s"`, output, expected)
if !reflect.DeepEqual(testEntry, controlEntry) {
t.Errorf(`Not expected output: got "%+v" instead of "%+v"`, testEntry, controlEntry)
}
}
func TestNewLineRewriteRule(t *testing.T) {
description := "A\nB\nC"
output := Rewriter("https://example.org/article", description, "nl2br")
expected := `A<br>B<br>C`
controlEntry := &model.Entry{
Title: `A title`,
Content: `A<br>B<br>C`,
}
testEntry := &model.Entry{
Title: `A title`,
Content: "A\nB\nC",
}
Rewriter("https://example.org/article", testEntry, "nl2br")
if expected != output {
t.Errorf(`Not expected output: got %q instead of %q`, output, expected)
if !reflect.DeepEqual(testEntry, controlEntry) {
t.Errorf(`Not expected output: got "%+v" instead of "%+v"`, testEntry, controlEntry)
}
}
func TestConvertTextLinkRewriteRule(t *testing.T) {
description := "Test: http://example.org/a/b"
output := Rewriter("https://example.org/article", description, "convert_text_link")
expected := `Test: <a href="http://example.org/a/b">http://example.org/a/b</a>`
controlEntry := &model.Entry{
Title: `A title`,
Content: `Test: <a href="http://example.org/a/b">http://example.org/a/b</a>`,
}
testEntry := &model.Entry{
Title: `A title`,
Content: `Test: http://example.org/a/b`,
}
Rewriter("https://example.org/article", testEntry, "convert_text_link")
if expected != output {
t.Errorf(`Not expected output: got %q instead of %q`, output, expected)
if !reflect.DeepEqual(testEntry, controlEntry) {
t.Errorf(`Not expected output: got "%+v" instead of "%+v"`, testEntry, controlEntry)
}
}
func TestMediumImage(t *testing.T) {
content := `
controlEntry := &model.Entry{
Title: `A title`,
Content: `<img alt="Image for post" class="t u v if aj" src="https://miro.medium.com/max/2560/1*ephLSqSzQYLvb7faDwzRbw.jpeg" width="1280" height="720" srcset="https://miro.medium.com/max/552/1*ephLSqSzQYLvb7faDwzRbw.jpeg 276w, https://miro.medium.com/max/1104/1*ephLSqSzQYLvb7faDwzRbw.jpeg 552w, https://miro.medium.com/max/1280/1*ephLSqSzQYLvb7faDwzRbw.jpeg 640w, https://miro.medium.com/max/1400/1*ephLSqSzQYLvb7faDwzRbw.jpeg 700w" sizes="700px"/>`,
}
testEntry := &model.Entry{
Title: `A title`,
Content: `
<figure class="ht hu hv hw hx hy cy cz paragraph-image">
<div class="hz ia ib ic aj">
<div class="cy cz hs">
@ -235,103 +360,174 @@ func TestMediumImage(t *testing.T) {
</div>
</div>
</figure>
`
expected := `<img alt="Image for post" class="t u v if aj" src="https://miro.medium.com/max/2560/1*ephLSqSzQYLvb7faDwzRbw.jpeg" width="1280" height="720" srcset="https://miro.medium.com/max/552/1*ephLSqSzQYLvb7faDwzRbw.jpeg 276w, https://miro.medium.com/max/1104/1*ephLSqSzQYLvb7faDwzRbw.jpeg 552w, https://miro.medium.com/max/1280/1*ephLSqSzQYLvb7faDwzRbw.jpeg 640w, https://miro.medium.com/max/1400/1*ephLSqSzQYLvb7faDwzRbw.jpeg 700w" sizes="700px"/>`
output := Rewriter("https://example.org/article", content, "fix_medium_images")
output = strings.TrimSpace(output)
`,
}
Rewriter("https://example.org/article", testEntry, "fix_medium_images")
testEntry.Content = strings.TrimSpace(testEntry.Content)
if expected != output {
t.Errorf(`Not expected output: %s`, output)
if !reflect.DeepEqual(testEntry, controlEntry) {
t.Errorf(`Not expected output: got "%+v" instead of "%+v"`, testEntry, controlEntry)
}
}
func TestRewriteNoScriptImageWithoutNoScriptTag(t *testing.T) {
content := `<figure><img src="https://developer.mozilla.org/static/img/favicon144.png" alt="The beautiful MDN logo."><figcaption>MDN Logo</figcaption></figure>`
expected := `<figure><img src="https://developer.mozilla.org/static/img/favicon144.png" alt="The beautiful MDN logo."/><figcaption>MDN Logo</figcaption></figure>`
output := Rewriter("https://example.org/article", content, "use_noscript_figure_images")
output = strings.TrimSpace(output)
controlEntry := &model.Entry{
Title: `A title`,
Content: `<figure><img src="https://developer.mozilla.org/static/img/favicon144.png" alt="The beautiful MDN logo."/><figcaption>MDN Logo</figcaption></figure>`,
}
testEntry := &model.Entry{
Title: `A title`,
Content: `<figure><img src="https://developer.mozilla.org/static/img/favicon144.png" alt="The beautiful MDN logo."><figcaption>MDN Logo</figcaption></figure>`,
}
Rewriter("https://example.org/article", testEntry, "use_noscript_figure_images")
testEntry.Content = strings.TrimSpace(testEntry.Content)
if expected != output {
t.Errorf(`Not expected output: %s`, output)
if !reflect.DeepEqual(testEntry, controlEntry) {
t.Errorf(`Not expected output: got "%+v" instead of "%+v"`, testEntry, controlEntry)
}
}
func TestRewriteNoScriptImageWithNoScriptTag(t *testing.T) {
content := `<figure><img src="https://developer.mozilla.org/static/img/favicon144.png" alt="The beautiful MDN logo."><noscript><img src="http://example.org/logo.svg"></noscript><figcaption>MDN Logo</figcaption></figure>`
expected := `<figure><img src="http://example.org/logo.svg"/><figcaption>MDN Logo</figcaption></figure>`
output := Rewriter("https://example.org/article", content, "use_noscript_figure_images")
output = strings.TrimSpace(output)
controlEntry := &model.Entry{
Title: `A title`,
Content: `<figure><img src="http://example.org/logo.svg"/><figcaption>MDN Logo</figcaption></figure>`,
}
testEntry := &model.Entry{
Title: `A title`,
Content: `<figure><img src="https://developer.mozilla.org/static/img/favicon144.png" alt="The beautiful MDN logo."><noscript><img src="http://example.org/logo.svg"></noscript><figcaption>MDN Logo</figcaption></figure>`,
}
Rewriter("https://example.org/article", testEntry, "use_noscript_figure_images")
testEntry.Content = strings.TrimSpace(testEntry.Content)
if expected != output {
t.Errorf(`Not expected output: %s`, output)
if !reflect.DeepEqual(testEntry, controlEntry) {
t.Errorf(`Not expected output: got "%+v" instead of "%+v"`, testEntry, controlEntry)
}
}
func TestRewriteReplaceCustom(t *testing.T) {
content := `<img src="http://example.org/logo.svg"><img src="https://example.org/article/picture.svg">`
expected := `<img src="http://example.org/logo.svg"><img src="https://example.org/article/picture.png">`
output := Rewriter("https://example.org/article", content, `replace("article/(.*).svg"|"article/$1.png")`)
controlEntry := &model.Entry{
Title: `A title`,
Content: `<img src="http://example.org/logo.svg"><img src="https://example.org/article/picture.png">`,
}
testEntry := &model.Entry{
Title: `A title`,
Content: `<img src="http://example.org/logo.svg"><img src="https://example.org/article/picture.svg">`,
}
Rewriter("https://example.org/article", testEntry, `replace("article/(.*).svg"|"article/$1.png")`)
if expected != output {
t.Errorf(`Not expected output: %s`, output)
if !reflect.DeepEqual(testEntry, controlEntry) {
t.Errorf(`Not expected output: got "%+v" instead of "%+v"`, testEntry, controlEntry)
}
}
func TestRewriteRemoveCustom(t *testing.T) {
content := `<div>Lorem Ipsum <span class="spam">I dont want to see this</span><span class="ads keep">Super important info</span></div>`
expected := `<div>Lorem Ipsum <span class="ads keep">Super important info</span></div>`
output := Rewriter("https://example.org/article", content, `remove(".spam, .ads:not(.keep)")`)
controlEntry := &model.Entry{
Title: `A title`,
Content: `<div>Lorem Ipsum <span class="ads keep">Super important info</span></div>`,
}
testEntry := &model.Entry{
Title: `A title`,
Content: `<div>Lorem Ipsum <span class="spam">I dont want to see this</span><span class="ads keep">Super important info</span></div>`,
}
Rewriter("https://example.org/article", testEntry, `remove(".spam, .ads:not(.keep)")`)
if expected != output {
t.Errorf(`Not expected output: %s`, output)
if !reflect.DeepEqual(testEntry, controlEntry) {
t.Errorf(`Not expected output: got "%+v" instead of "%+v"`, testEntry, controlEntry)
}
}
func TestRewriteAddCastopodEpisode(t *testing.T) {
output := Rewriter("https://podcast.demo/@demo/episodes/test", "Episode Description", `add_castopod_episode`)
expected := `<iframe width="650" frameborder="0" src="https://podcast.demo/@demo/episodes/test/embed/light"></iframe><br>Episode Description`
controlEntry := &model.Entry{
Title: `A title`,
Content: `<iframe width="650" frameborder="0" src="https://podcast.demo/@demo/episodes/test/embed/light"></iframe><br>Episode Description`,
}
testEntry := &model.Entry{
Title: `A title`,
Content: `Episode Description`,
}
Rewriter("https://podcast.demo/@demo/episodes/test", testEntry, `add_castopod_episode`)
if expected != output {
t.Errorf(`Not expected output: got "%s" instead of "%s"`, output, expected)
if !reflect.DeepEqual(testEntry, controlEntry) {
t.Errorf(`Not expected output: got "%+v" instead of "%+v"`, testEntry, controlEntry)
}
}
func TestRewriteBase64Decode(t *testing.T) {
content := `VGhpcyBpcyBzb21lIGJhc2U2NCBlbmNvZGVkIGNvbnRlbnQ=`
expected := `This is some base64 encoded content`
output := Rewriter("https://example.org/article", content, `base64_decode`)
controlEntry := &model.Entry{
Title: `A title`,
Content: `This is some base64 encoded content`,
}
testEntry := &model.Entry{
Title: `A title`,
Content: `VGhpcyBpcyBzb21lIGJhc2U2NCBlbmNvZGVkIGNvbnRlbnQ=`,
}
Rewriter("https://example.org/article", testEntry, `base64_decode`)
if expected != output {
t.Errorf(`Not expected output: got "%s" instead of "%s"`, output, expected)
if !reflect.DeepEqual(testEntry, controlEntry) {
t.Errorf(`Not expected output: got "%+v" instead of "%+v"`, testEntry, controlEntry)
}
}
func TestRewriteBase64DecodeInHTML(t *testing.T) {
content := `<div>Lorem Ipsum not valid base64<span class="base64">VGhpcyBpcyBzb21lIGJhc2U2NCBlbmNvZGVkIGNvbnRlbnQ=</span></div>`
expected := `<div>Lorem Ipsum not valid base64<span class="base64">This is some base64 encoded content</span></div>`
output := Rewriter("https://example.org/article", content, `base64_decode`)
controlEntry := &model.Entry{
Title: `A title`,
Content: `<div>Lorem Ipsum not valid base64<span class="base64">This is some base64 encoded content</span></div>`,
}
testEntry := &model.Entry{
Title: `A title`,
Content: `<div>Lorem Ipsum not valid base64<span class="base64">VGhpcyBpcyBzb21lIGJhc2U2NCBlbmNvZGVkIGNvbnRlbnQ=</span></div>`,
}
Rewriter("https://example.org/article", testEntry, `base64_decode`)
if expected != output {
t.Errorf(`Not expected output: got "%s" instead of "%s"`, output, expected)
if !reflect.DeepEqual(testEntry, controlEntry) {
t.Errorf(`Not expected output: got "%+v" instead of "%+v"`, testEntry, controlEntry)
}
}
func TestRewriteBase64DecodeArgs(t *testing.T) {
content := `<div>Lorem Ipsum<span class="base64">VGhpcyBpcyBzb21lIGJhc2U2NCBlbmNvZGVkIGNvbnRlbnQ=</span></div>`
expected := `<div>Lorem Ipsum<span class="base64">This is some base64 encoded content</span></div>`
output := Rewriter("https://example.org/article", content, `base64_decode(".base64")`)
controlEntry := &model.Entry{
Title: `A title`,
Content: `<div>Lorem Ipsum<span class="base64">This is some base64 encoded content</span></div>`,
}
testEntry := &model.Entry{
Title: `A title`,
Content: `<div>Lorem Ipsum<span class="base64">VGhpcyBpcyBzb21lIGJhc2U2NCBlbmNvZGVkIGNvbnRlbnQ=</span></div>`,
}
Rewriter("https://example.org/article", testEntry, `base64_decode(".base64")`)
if expected != output {
t.Errorf(`Not expected output: got "%s" instead of "%s"`, output, expected)
if !reflect.DeepEqual(testEntry, controlEntry) {
t.Errorf(`Not expected output: got "%+v" instead of "%+v"`, testEntry, controlEntry)
}
}
func TestRewriteRemoveTables(t *testing.T) {
content := `<table class="container"><tbody><tr><td><p>Test</p><table class="row"><tbody><tr><td><p>Hello World!</p></td><td><p>Test</p></td></tr></tbody></table></td></tr></tbody></table>`
expected := `<p>Test</p><p>Hello World!</p><p>Test</p>`
output := Rewriter("https://example.org/article", content, `remove_tables`)
controlEntry := &model.Entry{
Title: `A title`,
Content: `<p>Test</p><p>Hello World!</p><p>Test</p>`,
}
testEntry := &model.Entry{
Title: `A title`,
Content: `<table class="container"><tbody><tr><td><p>Test</p><table class="row"><tbody><tr><td><p>Hello World!</p></td><td><p>Test</p></td></tr></tbody></table></td></tr></tbody></table>`,
}
Rewriter("https://example.org/article", testEntry, `remove_tables`)
if expected != output {
t.Errorf(`Not expected output: got "%s" instead of "%s"`, output, expected)
if !reflect.DeepEqual(testEntry, controlEntry) {
t.Errorf(`Not expected output: got "%+v" instead of "%+v"`, testEntry, controlEntry)
}
}
// TestRemoveClickbait runs the `remove_clickbait` rule through Rewriter and
// checks that an all-caps title is rewritten while the content is left as is.
func TestRemoveClickbait(t *testing.T) {
	// Entry handed to Rewriter; it is mutated in place.
	got := &model.Entry{
		Title:   `THIS IS AMAZING`,
		Content: `Some description`,
	}
	// Entry state expected once the rule has been applied.
	want := &model.Entry{
		Title:   `This Is Amazing`,
		Content: `Some description`,
	}

	Rewriter("https://example.org/article", got, `remove_clickbait`)

	if reflect.DeepEqual(got, want) {
		return
	}
	t.Errorf(`Not expected output: got "%+v" instead of "%+v"`, got, want)
}