Add rewrite rule to fix Medium.com images
This commit is contained in:
parent
d75ff0c5ab
commit
31435ef83e
6 changed files with 89 additions and 40 deletions
|
@ -76,7 +76,7 @@ func ExtractContent(page io.Reader) (string, error) {
|
||||||
return "", err
|
return "", err
|
||||||
}
|
}
|
||||||
|
|
||||||
document.Find("script,style,noscript").Each(func(i int, s *goquery.Selection) {
|
document.Find("script,style").Each(func(i int, s *goquery.Selection) {
|
||||||
removeNodes(s)
|
removeNodes(s)
|
||||||
})
|
})
|
||||||
|
|
||||||
|
|
|
@ -139,6 +139,21 @@ func addDynamicImage(entryURL, entryContent string) string {
|
||||||
return entryContent
|
return entryContent
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func fixMediumImages(entryURL, entryContent string) string {
|
||||||
|
doc, err := goquery.NewDocumentFromReader(strings.NewReader(entryContent))
|
||||||
|
if err != nil {
|
||||||
|
return entryContent
|
||||||
|
}
|
||||||
|
|
||||||
|
doc.Find("figure.paragraph-image").Each(func(i int, paragraphImage *goquery.Selection) {
|
||||||
|
noscriptElement := paragraphImage.Find("noscript")
|
||||||
|
paragraphImage.ReplaceWithHtml(noscriptElement.Text())
|
||||||
|
})
|
||||||
|
|
||||||
|
output, _ := doc.Find("body").First().Html()
|
||||||
|
return output
|
||||||
|
}
|
||||||
|
|
||||||
func addYoutubeVideo(entryURL, entryContent string) string {
|
func addYoutubeVideo(entryURL, entryContent string) string {
|
||||||
matches := youtubeRegex.FindStringSubmatch(entryURL)
|
matches := youtubeRegex.FindStringSubmatch(entryURL)
|
||||||
|
|
||||||
|
|
|
@ -43,6 +43,8 @@ func Rewriter(entryURL, entryContent, customRewriteRules string) string {
|
||||||
entryContent = replaceLineFeeds(entryContent)
|
entryContent = replaceLineFeeds(entryContent)
|
||||||
case "convert_text_link", "convert_text_links":
|
case "convert_text_link", "convert_text_links":
|
||||||
entryContent = replaceTextLinks(entryContent)
|
entryContent = replaceTextLinks(entryContent)
|
||||||
|
case "fix_medium_images":
|
||||||
|
entryContent = fixMediumImages(entryURL, entryContent)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -4,7 +4,10 @@
|
||||||
|
|
||||||
package rewrite // import "miniflux.app/reader/rewrite"
|
package rewrite // import "miniflux.app/reader/rewrite"
|
||||||
|
|
||||||
import "testing"
|
import (
|
||||||
|
"strings"
|
||||||
|
"testing"
|
||||||
|
)
|
||||||
|
|
||||||
func TestReplaceTextLinks(t *testing.T) {
|
func TestReplaceTextLinks(t *testing.T) {
|
||||||
scenarios := map[string]string{
|
scenarios := map[string]string{
|
||||||
|
@ -176,3 +179,32 @@ func TestConvertTextLinkRewriteRule(t *testing.T) {
|
||||||
t.Errorf(`Not expected output: got %q instead of %q`, output, expected)
|
t.Errorf(`Not expected output: got %q instead of %q`, output, expected)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestMediumImage(t *testing.T) {
|
||||||
|
content := `
|
||||||
|
<figure class="ht hu hv hw hx hy cy cz paragraph-image">
|
||||||
|
<div class="hz ia ib ic aj">
|
||||||
|
<div class="cy cz hs">
|
||||||
|
<div class="ii s ib ij">
|
||||||
|
<div class="ik il s">
|
||||||
|
<div class="id ie t u v if aj bk ig ih">
|
||||||
|
<img alt="Image for post" class="t u v if aj im in io" src="https://miro.medium.com/max/60/1*ephLSqSzQYLvb7faDwzRbw.jpeg?q=20" width="1280" height="720"/>
|
||||||
|
</div>
|
||||||
|
<img alt="Image for post" class="id ie t u v if aj c" width="1280" height="720"/>
|
||||||
|
<noscript>
|
||||||
|
<img alt="Image for post" class="t u v if aj" src="https://miro.medium.com/max/2560/1*ephLSqSzQYLvb7faDwzRbw.jpeg" width="1280" height="720" srcSet="https://miro.medium.com/max/552/1*ephLSqSzQYLvb7faDwzRbw.jpeg 276w, https://miro.medium.com/max/1104/1*ephLSqSzQYLvb7faDwzRbw.jpeg 552w, https://miro.medium.com/max/1280/1*ephLSqSzQYLvb7faDwzRbw.jpeg 640w, https://miro.medium.com/max/1400/1*ephLSqSzQYLvb7faDwzRbw.jpeg 700w" sizes="700px"/>
|
||||||
|
</noscript>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</figure>
|
||||||
|
`
|
||||||
|
expected := `<img alt="Image for post" class="t u v if aj" src="https://miro.medium.com/max/2560/1*ephLSqSzQYLvb7faDwzRbw.jpeg" width="1280" height="720" srcset="https://miro.medium.com/max/552/1*ephLSqSzQYLvb7faDwzRbw.jpeg 276w, https://miro.medium.com/max/1104/1*ephLSqSzQYLvb7faDwzRbw.jpeg 552w, https://miro.medium.com/max/1280/1*ephLSqSzQYLvb7faDwzRbw.jpeg 640w, https://miro.medium.com/max/1400/1*ephLSqSzQYLvb7faDwzRbw.jpeg 700w" sizes="700px"/>`
|
||||||
|
output := Rewriter("https://example.org/article", content, "fix_medium_images")
|
||||||
|
output = strings.TrimSpace(output)
|
||||||
|
|
||||||
|
if expected != output {
|
||||||
|
t.Errorf(`Not expected output: %s`, output)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
|
@ -30,4 +30,5 @@ var predefinedRules = map[string]string{
|
||||||
"invidio.us": "add_invidious_video",
|
"invidio.us": "add_invidious_video",
|
||||||
"xkcd.com": "add_image_title",
|
"xkcd.com": "add_image_title",
|
||||||
"framatube.org": "nl2br,convert_text_link",
|
"framatube.org": "nl2br,convert_text_link",
|
||||||
|
"medium.com": "fix_medium_images",
|
||||||
}
|
}
|
||||||
|
|
|
@ -7,43 +7,42 @@ package scraper // import "miniflux.app/reader/scraper"
|
||||||
// List of predefined scraper rules (alphabetically sorted)
|
// List of predefined scraper rules (alphabetically sorted)
|
||||||
// domain => CSS selectors
|
// domain => CSS selectors
|
||||||
var predefinedRules = map[string]string{
|
var predefinedRules = map[string]string{
|
||||||
"bbc.co.uk": "div.vxp-column--single, div.story-body__inner, ul.gallery-images__list",
|
"bbc.co.uk": "div.vxp-column--single, div.story-body__inner, ul.gallery-images__list",
|
||||||
"cbc.ca": ".story-content",
|
"cbc.ca": ".story-content",
|
||||||
"darkreading.com": "#article-main:not(header)",
|
"darkreading.com": "#article-main:not(header)",
|
||||||
"developpez.com": "div[itemprop=articleBody]",
|
"developpez.com": "div[itemprop=articleBody]",
|
||||||
"dilbert.com": "span.comic-title-name, img.img-comic",
|
"dilbert.com": "span.comic-title-name, img.img-comic",
|
||||||
"financialsamurai.com": "article",
|
"financialsamurai.com": "article",
|
||||||
"francetvinfo.fr": ".text",
|
"francetvinfo.fr": ".text",
|
||||||
"github.com": "article.entry-content",
|
"github.com": "article.entry-content",
|
||||||
"heise.de": "header .article-content__lead, header .article-image, div.article-layout__content.article-content",
|
"heise.de": "header .article-content__lead, header .article-image, div.article-layout__content.article-content",
|
||||||
"igen.fr": "section.corps",
|
"igen.fr": "section.corps",
|
||||||
"ing.dk": "section.body",
|
"ing.dk": "section.body",
|
||||||
"lapresse.ca": ".amorce, .entry",
|
"lapresse.ca": ".amorce, .entry",
|
||||||
"lemonde.fr": "article",
|
"lemonde.fr": "article",
|
||||||
"lepoint.fr": ".art-text",
|
"lepoint.fr": ".art-text",
|
||||||
"lesjoiesducode.fr": ".blog-post-content img",
|
"lesjoiesducode.fr": ".blog-post-content img",
|
||||||
"lesnumeriques.com": ".text",
|
"lesnumeriques.com": ".text",
|
||||||
"linux.com": "div.content, div[property]",
|
"linux.com": "div.content, div[property]",
|
||||||
"medium.com": ".section-content",
|
"mac4ever.com": "div[itemprop=articleBody]",
|
||||||
"mac4ever.com": "div[itemprop=articleBody]",
|
"monwindows.com": ".blog-post-body",
|
||||||
"monwindows.com": ".blog-post-body",
|
"npr.org": "#storytext",
|
||||||
"npr.org": "#storytext",
|
"oneindia.com": ".io-article-body",
|
||||||
"oneindia.com": ".io-article-body",
|
"opensource.com": "div[property]",
|
||||||
"opensource.com": "div[property]",
|
"osnews.com": "div.newscontent1",
|
||||||
"osnews.com": "div.newscontent1",
|
"phoronix.com": "div.content",
|
||||||
"phoronix.com": "div.content",
|
"pseudo-sciences.org": "#art_main",
|
||||||
"pseudo-sciences.org": "#art_main",
|
"raywenderlich.com": "article",
|
||||||
"raywenderlich.com": "article",
|
"slate.fr": ".field-items",
|
||||||
"slate.fr": ".field-items",
|
"techcrunch.com": "div.article-entry",
|
||||||
"techcrunch.com": "div.article-entry",
|
"theoatmeal.com": "div#comic",
|
||||||
"theoatmeal.com": "div#comic",
|
"theregister.co.uk": "#body",
|
||||||
"theregister.co.uk": "#body",
|
"turnoff.us": "article.post-content",
|
||||||
"turnoff.us": "article.post-content",
|
"universfreebox.com": "#corps_corps",
|
||||||
"universfreebox.com": "#corps_corps",
|
"version2.dk": "section.body",
|
||||||
"version2.dk": "section.body",
|
"wdwnt.com": "div.entry-content",
|
||||||
"wdwnt.com": "div.entry-content",
|
"wired.com": "main figure, article",
|
||||||
"wired.com": "main figure, article",
|
"zeit.de": ".summary, .article-body",
|
||||||
"zeit.de": ".summary, .article-body",
|
"zdnet.com": "div.storyBody",
|
||||||
"zdnet.com": "div.storyBody",
|
"openingsource.org": "article.suxing-popup-gallery",
|
||||||
"openingsource.org": "article.suxing-popup-gallery",
|
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in a new issue