From dcf87bd6424961d5ec0b58cea3295848ab581da6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jouni=20K=2E=20Sepp=C3=A4nen?= Date: Mon, 3 Jan 2022 17:41:21 +0200 Subject: [PATCH] Add scrape and rewrite rules for quantamagazine This is a somewhat complex React site, so the rules could be a little fragile. Text content seems to always be inside .outer--content, and most h6 elements are fluff like "read later" or pointers to other articles. However, h6.byline and h6.post__title__kicker are relevant to the current article. Figure captions are sometimes inside both figure and div.outer--content elements, sometimes only inside figure, so take both and remove the intersection. The figure elements sometimes contain multiple copies of images or videos, and we just take them all. Math articles seem to use MathJax, which we don't add. --- reader/rewrite/rules.go | 1 + reader/scraper/rules.go | 1 + 2 files changed, 2 insertions(+) diff --git a/reader/rewrite/rules.go b/reader/rewrite/rules.go index 845973a3..fb615546 100644 --- a/reader/rewrite/rules.go +++ b/reader/rewrite/rules.go @@ -26,6 +26,7 @@ var predefinedRules = map[string]string{ "oglaf.com": "add_image_title", "optipess.com": "add_image_title", "peebleslab.com": "add_image_title", + "quantamagazine.org": `remove("h6:not(.byline,.post__title__kicker), #comments, .next-post__content, .footer__section, figure .outer--content")`, "sentfromthemoon.com": "add_image_title", "thedoghousediaries.com": "add_image_title", "treelobsters.com": "add_image_title", diff --git a/reader/scraper/rules.go b/reader/scraper/rules.go index dfcc4396..0352b6bd 100644 --- a/reader/scraper/rules.go +++ b/reader/scraper/rules.go @@ -33,6 +33,7 @@ var predefinedRules = map[string]string{ "osnews.com": "div.newscontent1", "phoronix.com": "div.content", "pseudo-sciences.org": "#art_main", + "quantamagazine.org": ".outer--content, figure", "raywenderlich.com": "article", "slate.fr": ".field-items", "techcrunch.com": "div.article-entry",