Add scraper rules

This commit is contained in:
Frédéric Guillot 2017-12-10 20:51:04 -08:00
parent 7a35c58f53
commit 87ccad5c7f
16 changed files with 140 additions and 34 deletions

View file

@ -1,5 +1,5 @@
// Code generated by go generate; DO NOT EDIT.
// 2017-12-10 18:56:24.387844114 -0800 PST m=+0.029823201
// 2017-12-10 20:08:14.447304303 -0800 PST m=+0.040286758
package locale
@ -167,12 +167,13 @@ var translations = map[string]string{
"Activate Fever API": "Activer l'API de Fever",
"Fever Username": "Nom d'utilisateur pour l'API de Fever",
"Fever Password": "Mot de passe pour l'API de Fever",
"Fetch original content": "Récupérer le contenu original"
"Fetch original content": "Récupérer le contenu original",
"Scraper Rules": "Règles pour récupérer le contenu original"
}
`,
}
var translationsChecksums = map[string]string{
"en_US": "6fe95384260941e8a5a3c695a655a932e0a8a6a572c1e45cb2b1ae8baa01b897",
"fr_FR": "fd629b171aefa50dd0a6100acaac8fbecbdf1a1d53e3fce984234565ec5bb5d5",
"fr_FR": "4426cea875ee2c9acb1a2b0619cb82f3a32f71aabe5d07657eaf2f6b7387c5f9",
}

View file

@ -151,5 +151,6 @@
"Activate Fever API": "Activer l'API de Fever",
"Fever Username": "Nom d'utilisateur pour l'API de Fever",
"Fever Password": "Mot de passe pour l'API de Fever",
"Fetch original content": "Récupérer le contenu original"
"Fetch original content": "Récupérer le contenu original",
"Scraper Rules": "Règles pour récupérer le contenu original"
}

View file

@ -22,6 +22,7 @@ type Feed struct {
LastModifiedHeader string `json:"last_modified_header,omitempty"`
ParsingErrorMsg string `json:"parsing_error_message,omitempty"`
ParsingErrorCount int `json:"parsing_error_count,omitempty"`
ScraperRules string `json:"scraper_rules"`
Category *Category `json:"category,omitempty"`
Entries Entries `json:"entries,omitempty"`
Icon *FeedIcon `json:"icon,omitempty"`

16
reader/scraper/rules.go Normal file
View file

@ -0,0 +1,16 @@
// Copyright 2017 Frédéric Guillot. All rights reserved.
// Use of this source code is governed by the Apache 2.0
// license that can be found in the LICENSE file.
package scraper
// predefinedRules lists the built-in scraper rules, keyed by website domain.
// Each value is the CSS selector (or a comma-separated list of selectors)
// used to pick the relevant content out of a page fetched from that domain.
//
// Keep the entries sorted alphabetically by domain.
var predefinedRules = map[string]string{
"lemonde.fr": "div#articleBody",
"lesjoiesducode.fr": ".blog-post-content img",
"linux.com": "div.content, div[property]",
"opensource.com": "div[property]",
"phoronix.com": "div.content",
"techcrunch.com": "div.article-entry",
}

View file

@ -6,14 +6,19 @@ package scraper
import (
"errors"
"io"
"log"
"strings"
"github.com/PuerkitoBio/goquery"
"github.com/miniflux/miniflux2/http"
"github.com/miniflux/miniflux2/reader/readability"
"github.com/miniflux/miniflux2/reader/sanitizer"
"github.com/miniflux/miniflux2/url"
)
// Fetch downloads a web page and returns relevant contents.
func Fetch(websiteURL string) (string, error) {
func Fetch(websiteURL, rules string) (string, error) {
client := http.NewClient(websiteURL)
response, err := client.Get()
if err != nil {
@ -29,10 +34,57 @@ func Fetch(websiteURL string) (string, error) {
return "", err
}
content, err := readability.ExtractContent(page)
var content string
if rules == "" {
rules = getPredefinedScraperRules(websiteURL)
}
if rules != "" {
log.Printf(`[Scraper] Using rules "%s" for "%s"`, rules, websiteURL)
content, err = scrapContent(page, rules)
} else {
log.Printf(`[Scraper] Using readability for "%s"`, websiteURL)
content, err = readability.ExtractContent(page)
}
if err != nil {
return "", err
}
return sanitizer.Sanitize(websiteURL, content), nil
}
// scrapContent parses page as an HTML document and returns the concatenated
// inner HTML of every element matched by rules (a CSS selector expression).
//
// When a matched element is an <img>, the inner HTML of its parent is used
// instead, so that the image tag itself ends up in the output.
//
// An error is returned only when the document cannot be parsed.
func scrapContent(page io.Reader, rules string) (string, error) {
	doc, err := goquery.NewDocumentFromReader(page)
	if err != nil {
		return "", err
	}

	var fragments []string
	doc.Find(rules).Each(func(_ int, match *goquery.Selection) {
		// For some inline elements, we take the parent instead.
		source := match
		if match.Is("img") {
			source = match.Parent()
		}

		// Rendering errors are deliberately ignored (best effort): whatever
		// Html returns is appended as-is.
		fragment, _ := source.Html()
		fragments = append(fragments, fragment)
	})

	return strings.Join(fragments, ""), nil
}
// getPredefinedScraperRules returns the predefined scraper rules (CSS
// selectors) registered for the website that websiteURL belongs to, or an
// empty string when no predefined rule applies.
//
// A rule applies when the URL's host is exactly the rule's domain or one of
// its subdomains (e.g. "www.linux.com" matches "linux.com", but
// "notlinux.com" does not). The comparison is case-insensitive. If several
// rules apply, the most specific (longest) domain wins, so the result does
// not depend on map iteration order.
func getPredefinedScraperRules(websiteURL string) string {
	host := strings.ToLower(url.Domain(websiteURL))

	// Drop an optional ":port" suffix so that "example.com:8080" still matches.
	// NOTE(review): assumes url.Domain returns the host part of the URL,
	// possibly including the port — confirm against the url package.
	if i := strings.LastIndex(host, ":"); i >= 0 {
		host = host[:i]
	}

	bestDomain, bestRules := "", ""
	for domain, rules := range predefinedRules {
		// Match on a label boundary only; a plain substring test would also
		// accept unrelated hosts that merely contain the domain.
		if host != domain && !strings.HasSuffix(host, "."+domain) {
			continue
		}
		if len(domain) > len(bestDomain) {
			bestDomain, bestRules = domain, rules
		}
	}

	return bestRules
}

View file

@ -0,0 +1,21 @@
// Copyright 2017 Frédéric Guillot. All rights reserved.
// Use of this source code is governed by the Apache 2.0
// license that can be found in the LICENSE file.
package scraper
import "testing"
// TestGetPredefinedRules checks that predefined rules are found for known
// websites and that an unknown website yields no rule.
func TestGetPredefinedRules(t *testing.T) {
	// Websites that must resolve to a non-empty predefined rule.
	knownSites := []struct {
		websiteURL string
		failure    string
	}{
		{"http://www.phoronix.com/", "Unable to find rule for phoronix.com"},
		{"https://www.linux.com/", "Unable to find rule for linux.com"},
	}

	for _, site := range knownSites {
		if rules := getPredefinedScraperRules(site.websiteURL); rules == "" {
			t.Error(site.failure)
		}
	}

	// A website without any predefined rule must yield an empty string.
	if rules := getPredefinedScraperRules("https://example.org/"); rules != "" {
		t.Error("A rule not defined should not return anything")
	}
}

View file

@ -45,6 +45,9 @@
<label for="form-feed-url">{{ t "Feed URL" }}</label>
<input type="url" name="feed_url" id="form-feed-url" placeholder="https://domain.tld/" value="{{ .form.FeedURL }}" required>
<label for="form-scraper-rules">{{ t "Scraper Rules" }}</label>
<input type="text" name="scraper_rules" id="form-scraper-rules" value="{{ .form.ScraperRules }}">
<label for="form-category">{{ t "Category" }}</label>
<select id="form-category" name="category_id">
{{ range .categories }}

View file

@ -1,5 +1,5 @@
// Code generated by go generate; DO NOT EDIT.
// 2017-12-10 18:56:24.375327888 -0800 PST m=+0.017306975
// 2017-12-10 20:08:14.428877093 -0800 PST m=+0.021859548
package template
@ -395,6 +395,9 @@ var templateViewsMap = map[string]string{
<label for="form-feed-url">{{ t "Feed URL" }}</label>
<input type="url" name="feed_url" id="form-feed-url" placeholder="https://domain.tld/" value="{{ .form.FeedURL }}" required>
<label for="form-scraper-rules">{{ t "Scraper Rules" }}</label>
<input type="text" name="scraper_rules" id="form-scraper-rules" value="{{ .form.ScraperRules }}">
<label for="form-category">{{ t "Category" }}</label>
<select id="form-category" name="category_id">
{{ range .categories }}
@ -1181,7 +1184,7 @@ var templateViewsMapChecksums = map[string]string{
"create_category": "2b82af5d2dcd67898dc5daa57a6461e6ff8121a6089b2a2a1be909f35e4a2275",
"create_user": "45e226df757126d5fe7c464e295e9a34f07952cfdb71e31e49839850d35af139",
"edit_category": "cee720faadcec58289b707ad30af623d2ee66c1ce23a732965463250d7ff41c5",
"edit_feed": "c5bc4c22bf7e8348d880395250545595d21fb8c8e723fc5d7cca68e25d250884",
"edit_feed": "b3c7dd5e93d58e051abcd59da31217d8e9b50587014b895d1b7c9172247b35f8",
"edit_user": "82d9749d76ddbd2352816d813c4b1f6d92f2222de678b4afe5821090246735c7",
"entry": "ebcf9bb35812dd02759718f7f7411267e6a6c8efd59a9aa0a0e735bcb88efeff",
"feed_entries": "547c19eb36b20e350ce70ed045173b064cdcd6b114afb241c9f2dda9d88fcc27",

View file

@ -40,18 +40,14 @@ func (c *Controller) FetchContent(ctx *core.Context, request *core.Request, resp
return
}
content, err := scraper.Fetch(entry.URL)
content, err := scraper.Fetch(entry.URL, entry.Feed.ScraperRules)
if err != nil {
response.JSON().ServerError(err)
return
}
if len(content) > len(entry.Content) {
entry.Content = content
c.store.UpdateEntryContent(entry)
} else {
content = entry.Content
}
response.JSON().Created(map[string]string{"content": content})
}

View file

@ -220,6 +220,7 @@ func (c *Controller) getFeedFormTemplateArgs(ctx *core.Context, user *model.User
SiteURL: feed.SiteURL,
FeedURL: feed.FeedURL,
Title: feed.Title,
ScraperRules: feed.ScraperRules,
CategoryID: feed.Category.ID,
}
} else {

View file

@ -17,6 +17,7 @@ type FeedForm struct {
FeedURL string
SiteURL string
Title string
ScraperRules string
CategoryID int64
}
@ -34,6 +35,7 @@ func (f FeedForm) Merge(feed *model.Feed) *model.Feed {
feed.Title = f.Title
feed.SiteURL = f.SiteURL
feed.FeedURL = f.FeedURL
feed.ScraperRules = f.ScraperRules
feed.ParsingErrorCount = 0
feed.ParsingErrorMsg = ""
return feed
@ -50,6 +52,7 @@ func NewFeedForm(r *http.Request) *FeedForm {
FeedURL: r.FormValue("feed_url"),
SiteURL: r.FormValue("site_url"),
Title: r.FormValue("title"),
ScraperRules: r.FormValue("scraper_rules"),
CategoryID: int64(categoryID),
}
}

1
sql/schema_version_6.sql Normal file
View file

@ -0,0 +1 @@
alter table feeds add column scraper_rules text default '';

View file

@ -1,5 +1,5 @@
// Code generated by go generate; DO NOT EDIT.
// 2017-12-10 18:56:24.36359961 -0800 PST m=+0.005578697
// 2017-12-10 20:08:14.411225368 -0800 PST m=+0.004207823
package sql
@ -136,6 +136,8 @@ alter table users add column entry_direction entry_sorting_direction default 'as
fever_token text default '',
primary key(user_id)
)
`,
"schema_version_6": `alter table feeds add column scraper_rules text default '';
`,
}
@ -145,4 +147,5 @@ var SqlMapChecksums = map[string]string{
"schema_version_3": "a54745dbc1c51c000f74d4e5068f1e2f43e83309f023415b1749a47d5c1e0f12",
"schema_version_4": "216ea3a7d3e1704e40c797b5dc47456517c27dbb6ca98bf88812f4f63d74b5d9",
"schema_version_5": "46397e2f5f2c82116786127e9f6a403e975b14d2ca7b652a48cd1ba843e6a27c",
"schema_version_6": "9d05b4fb223f0e60efc716add5048b0ca9c37511cf2041721e20505d6d798ce4",
}

View file

@ -152,7 +152,7 @@ func (e *EntryQueryBuilder) GetEntries() (model.Entries, error) {
SELECT
e.id, e.user_id, e.feed_id, e.hash, e.published_at at time zone '%s', e.title, e.url, e.author, e.content, e.status,
f.title as feed_title, f.feed_url, f.site_url, f.checked_at,
f.category_id, c.title as category_title,
f.category_id, c.title as category_title, f.scraper_rules,
fi.icon_id
FROM entries e
LEFT JOIN feeds f ON f.id=e.feed_id
@ -197,6 +197,7 @@ func (e *EntryQueryBuilder) GetEntries() (model.Entries, error) {
&entry.Feed.CheckedAt,
&entry.Feed.Category.ID,
&entry.Feed.Category.Title,
&entry.Feed.ScraperRules,
&iconID,
)

View file

@ -52,7 +52,7 @@ func (s *Storage) Feeds(userID int64) (model.Feeds, error) {
feeds := make(model.Feeds, 0)
query := `SELECT
f.id, f.feed_url, f.site_url, f.title, f.etag_header, f.last_modified_header,
f.user_id, f.checked_at, f.parsing_error_count, f.parsing_error_msg,
f.user_id, f.checked_at, f.parsing_error_count, f.parsing_error_msg, f.scraper_rules,
f.category_id, c.title as category_title,
fi.icon_id
FROM feeds f
@ -84,6 +84,7 @@ func (s *Storage) Feeds(userID int64) (model.Feeds, error) {
&feed.CheckedAt,
&feed.ParsingErrorCount,
&errorMsg,
&feed.ScraperRules,
&feed.Category.ID,
&feed.Category.Title,
&iconID,
@ -122,7 +123,7 @@ func (s *Storage) FeedByID(userID, feedID int64) (*model.Feed, error) {
query := `
SELECT
f.id, f.feed_url, f.site_url, f.title, f.etag_header, f.last_modified_header,
f.user_id, f.checked_at, f.parsing_error_count, f.parsing_error_msg,
f.user_id, f.checked_at, f.parsing_error_count, f.parsing_error_msg, f.scraper_rules,
f.category_id, c.title as category_title
FROM feeds f
LEFT JOIN categories c ON c.id=f.category_id
@ -139,6 +140,7 @@ func (s *Storage) FeedByID(userID, feedID int64) (*model.Feed, error) {
&feed.CheckedAt,
&feed.ParsingErrorCount,
&feed.ParsingErrorMsg,
&feed.ScraperRules,
&feed.Category.ID,
&feed.Category.Title,
)
@ -195,8 +197,8 @@ func (s *Storage) UpdateFeed(feed *model.Feed) (err error) {
query := `UPDATE feeds SET
feed_url=$1, site_url=$2, title=$3, category_id=$4, etag_header=$5, last_modified_header=$6, checked_at=$7,
parsing_error_msg=$8, parsing_error_count=$9
WHERE id=$10 AND user_id=$11`
parsing_error_msg=$8, parsing_error_count=$9, scraper_rules=$10
WHERE id=$11 AND user_id=$12`
_, err = s.db.Exec(query,
feed.FeedURL,
@ -208,6 +210,7 @@ func (s *Storage) UpdateFeed(feed *model.Feed) (err error) {
feed.CheckedAt,
feed.ParsingErrorMsg,
feed.ParsingErrorCount,
feed.ScraperRules,
feed.ID,
feed.UserID,
)

View file

@ -12,7 +12,7 @@ import (
"github.com/miniflux/miniflux2/sql"
)
const schemaVersion = 5
const schemaVersion = 6
// Migrate runs database migrations.
func (s *Storage) Migrate() {