Add scraper rules
This commit is contained in:
parent
7a35c58f53
commit
87ccad5c7f
16 changed files with 140 additions and 34 deletions
|
@ -1,5 +1,5 @@
|
|||
// Code generated by go generate; DO NOT EDIT.
|
||||
// 2017-12-10 18:56:24.387844114 -0800 PST m=+0.029823201
|
||||
// 2017-12-10 20:08:14.447304303 -0800 PST m=+0.040286758
|
||||
|
||||
package locale
|
||||
|
||||
|
@ -167,12 +167,13 @@ var translations = map[string]string{
|
|||
"Activate Fever API": "Activer l'API de Fever",
|
||||
"Fever Username": "Nom d'utilisateur pour l'API de Fever",
|
||||
"Fever Password": "Mot de passe pour l'API de Fever",
|
||||
"Fetch original content": "Récupérer le contenu original"
|
||||
"Fetch original content": "Récupérer le contenu original",
|
||||
"Scraper Rules": "Règles pour récupérer le contenu original"
|
||||
}
|
||||
`,
|
||||
}
|
||||
|
||||
var translationsChecksums = map[string]string{
|
||||
"en_US": "6fe95384260941e8a5a3c695a655a932e0a8a6a572c1e45cb2b1ae8baa01b897",
|
||||
"fr_FR": "fd629b171aefa50dd0a6100acaac8fbecbdf1a1d53e3fce984234565ec5bb5d5",
|
||||
"fr_FR": "4426cea875ee2c9acb1a2b0619cb82f3a32f71aabe5d07657eaf2f6b7387c5f9",
|
||||
}
|
||||
|
|
|
@ -151,5 +151,6 @@
|
|||
"Activate Fever API": "Activer l'API de Fever",
|
||||
"Fever Username": "Nom d'utilisateur pour l'API de Fever",
|
||||
"Fever Password": "Mot de passe pour l'API de Fever",
|
||||
"Fetch original content": "Récupérer le contenu original"
|
||||
"Fetch original content": "Récupérer le contenu original",
|
||||
"Scraper Rules": "Règles pour récupérer le contenu original"
|
||||
}
|
||||
|
|
|
@ -22,6 +22,7 @@ type Feed struct {
|
|||
LastModifiedHeader string `json:"last_modified_header,omitempty"`
|
||||
ParsingErrorMsg string `json:"parsing_error_message,omitempty"`
|
||||
ParsingErrorCount int `json:"parsing_error_count,omitempty"`
|
||||
ScraperRules string `json:"scraper_rules"`
|
||||
Category *Category `json:"category,omitempty"`
|
||||
Entries Entries `json:"entries,omitempty"`
|
||||
Icon *FeedIcon `json:"icon,omitempty"`
|
||||
|
|
16
reader/scraper/rules.go
Normal file
16
reader/scraper/rules.go
Normal file
|
@ -0,0 +1,16 @@
|
|||
// Copyright 2017 Frédéric Guillot. All rights reserved.
|
||||
// Use of this source code is governed by the Apache 2.0
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package scraper
|
||||
|
||||
// predefinedRules associates a website domain with the CSS selector(s) used
// to scrape its content (domain => CSS selectors).
// Keep the entries alphabetically sorted by domain.
var predefinedRules = map[string]string{
	"lemonde.fr":        "div#articleBody",
	"lesjoiesducode.fr": ".blog-post-content img",
	"linux.com":         "div.content, div[property]",
	"opensource.com":    "div[property]",
	"phoronix.com":      "div.content",
	"techcrunch.com":    "div.article-entry",
}
|
|
@ -6,14 +6,19 @@ package scraper
|
|||
|
||||
import (
|
||||
"errors"
|
||||
"io"
|
||||
"log"
|
||||
"strings"
|
||||
|
||||
"github.com/PuerkitoBio/goquery"
|
||||
"github.com/miniflux/miniflux2/http"
|
||||
"github.com/miniflux/miniflux2/reader/readability"
|
||||
"github.com/miniflux/miniflux2/reader/sanitizer"
|
||||
"github.com/miniflux/miniflux2/url"
|
||||
)
|
||||
|
||||
// Fetch downloads a web page and returns relevant contents.
|
||||
func Fetch(websiteURL string) (string, error) {
|
||||
func Fetch(websiteURL, rules string) (string, error) {
|
||||
client := http.NewClient(websiteURL)
|
||||
response, err := client.Get()
|
||||
if err != nil {
|
||||
|
@ -29,10 +34,57 @@ func Fetch(websiteURL string) (string, error) {
|
|||
return "", err
|
||||
}
|
||||
|
||||
content, err := readability.ExtractContent(page)
|
||||
var content string
|
||||
if rules == "" {
|
||||
rules = getPredefinedScraperRules(websiteURL)
|
||||
}
|
||||
|
||||
if rules != "" {
|
||||
log.Printf(`[Scraper] Using rules "%s" for "%s"`, rules, websiteURL)
|
||||
content, err = scrapContent(page, rules)
|
||||
} else {
|
||||
log.Printf(`[Scraper] Using readability for "%s"`, websiteURL)
|
||||
content, err = readability.ExtractContent(page)
|
||||
}
|
||||
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
|
||||
return sanitizer.Sanitize(websiteURL, content), nil
|
||||
}
|
||||
|
||||
func scrapContent(page io.Reader, rules string) (string, error) {
|
||||
document, err := goquery.NewDocumentFromReader(page)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
|
||||
contents := ""
|
||||
document.Find(rules).Each(func(i int, s *goquery.Selection) {
|
||||
var content string
|
||||
|
||||
// For some inline elements, we get the parent.
|
||||
if s.Is("img") {
|
||||
content, _ = s.Parent().Html()
|
||||
} else {
|
||||
content, _ = s.Html()
|
||||
}
|
||||
|
||||
contents += content
|
||||
})
|
||||
|
||||
return contents, nil
|
||||
}
|
||||
|
||||
func getPredefinedScraperRules(websiteURL string) string {
|
||||
urlDomain := url.Domain(websiteURL)
|
||||
|
||||
for domain, rules := range predefinedRules {
|
||||
if strings.Contains(urlDomain, domain) {
|
||||
return rules
|
||||
}
|
||||
}
|
||||
|
||||
return ""
|
||||
}
|
||||
|
|
21
reader/scraper/scraper_test.go
Normal file
21
reader/scraper/scraper_test.go
Normal file
|
@ -0,0 +1,21 @@
|
|||
// Copyright 2017 Frédéric Guillot. All rights reserved.
|
||||
// Use of this source code is governed by the Apache 2.0
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package scraper
|
||||
|
||||
import "testing"
|
||||
|
||||
func TestGetPredefinedRules(t *testing.T) {
|
||||
if getPredefinedScraperRules("http://www.phoronix.com/") == "" {
|
||||
t.Error("Unable to find rule for phoronix.com")
|
||||
}
|
||||
|
||||
if getPredefinedScraperRules("https://www.linux.com/") == "" {
|
||||
t.Error("Unable to find rule for linux.com")
|
||||
}
|
||||
|
||||
if getPredefinedScraperRules("https://example.org/") != "" {
|
||||
t.Error("A rule not defined should not return anything")
|
||||
}
|
||||
}
|
|
@ -45,6 +45,9 @@
|
|||
<label for="form-feed-url">{{ t "Feed URL" }}</label>
|
||||
<input type="url" name="feed_url" id="form-feed-url" placeholder="https://domain.tld/" value="{{ .form.FeedURL }}" required>
|
||||
|
||||
<label for="form-scraper-rules">{{ t "Scraper Rules" }}</label>
|
||||
<input type="text" name="scraper_rules" id="form-scraper-rules" value="{{ .form.ScraperRules }}">
|
||||
|
||||
<label for="form-category">{{ t "Category" }}</label>
|
||||
<select id="form-category" name="category_id">
|
||||
{{ range .categories }}
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
// Code generated by go generate; DO NOT EDIT.
|
||||
// 2017-12-10 18:56:24.375327888 -0800 PST m=+0.017306975
|
||||
// 2017-12-10 20:08:14.428877093 -0800 PST m=+0.021859548
|
||||
|
||||
package template
|
||||
|
||||
|
@ -395,6 +395,9 @@ var templateViewsMap = map[string]string{
|
|||
<label for="form-feed-url">{{ t "Feed URL" }}</label>
|
||||
<input type="url" name="feed_url" id="form-feed-url" placeholder="https://domain.tld/" value="{{ .form.FeedURL }}" required>
|
||||
|
||||
<label for="form-scraper-rules">{{ t "Scraper Rules" }}</label>
|
||||
<input type="text" name="scraper_rules" id="form-scraper-rules" value="{{ .form.ScraperRules }}">
|
||||
|
||||
<label for="form-category">{{ t "Category" }}</label>
|
||||
<select id="form-category" name="category_id">
|
||||
{{ range .categories }}
|
||||
|
@ -1181,7 +1184,7 @@ var templateViewsMapChecksums = map[string]string{
|
|||
"create_category": "2b82af5d2dcd67898dc5daa57a6461e6ff8121a6089b2a2a1be909f35e4a2275",
|
||||
"create_user": "45e226df757126d5fe7c464e295e9a34f07952cfdb71e31e49839850d35af139",
|
||||
"edit_category": "cee720faadcec58289b707ad30af623d2ee66c1ce23a732965463250d7ff41c5",
|
||||
"edit_feed": "c5bc4c22bf7e8348d880395250545595d21fb8c8e723fc5d7cca68e25d250884",
|
||||
"edit_feed": "b3c7dd5e93d58e051abcd59da31217d8e9b50587014b895d1b7c9172247b35f8",
|
||||
"edit_user": "82d9749d76ddbd2352816d813c4b1f6d92f2222de678b4afe5821090246735c7",
|
||||
"entry": "ebcf9bb35812dd02759718f7f7411267e6a6c8efd59a9aa0a0e735bcb88efeff",
|
||||
"feed_entries": "547c19eb36b20e350ce70ed045173b064cdcd6b114afb241c9f2dda9d88fcc27",
|
||||
|
|
|
@ -40,18 +40,14 @@ func (c *Controller) FetchContent(ctx *core.Context, request *core.Request, resp
|
|||
return
|
||||
}
|
||||
|
||||
content, err := scraper.Fetch(entry.URL)
|
||||
content, err := scraper.Fetch(entry.URL, entry.Feed.ScraperRules)
|
||||
if err != nil {
|
||||
response.JSON().ServerError(err)
|
||||
return
|
||||
}
|
||||
|
||||
if len(content) > len(entry.Content) {
|
||||
entry.Content = content
|
||||
c.store.UpdateEntryContent(entry)
|
||||
} else {
|
||||
content = entry.Content
|
||||
}
|
||||
|
||||
response.JSON().Created(map[string]string{"content": content})
|
||||
}
|
||||
|
|
|
@ -220,6 +220,7 @@ func (c *Controller) getFeedFormTemplateArgs(ctx *core.Context, user *model.User
|
|||
SiteURL: feed.SiteURL,
|
||||
FeedURL: feed.FeedURL,
|
||||
Title: feed.Title,
|
||||
ScraperRules: feed.ScraperRules,
|
||||
CategoryID: feed.Category.ID,
|
||||
}
|
||||
} else {
|
||||
|
|
|
@ -17,6 +17,7 @@ type FeedForm struct {
|
|||
FeedURL string
|
||||
SiteURL string
|
||||
Title string
|
||||
ScraperRules string
|
||||
CategoryID int64
|
||||
}
|
||||
|
||||
|
@ -34,6 +35,7 @@ func (f FeedForm) Merge(feed *model.Feed) *model.Feed {
|
|||
feed.Title = f.Title
|
||||
feed.SiteURL = f.SiteURL
|
||||
feed.FeedURL = f.FeedURL
|
||||
feed.ScraperRules = f.ScraperRules
|
||||
feed.ParsingErrorCount = 0
|
||||
feed.ParsingErrorMsg = ""
|
||||
return feed
|
||||
|
@ -50,6 +52,7 @@ func NewFeedForm(r *http.Request) *FeedForm {
|
|||
FeedURL: r.FormValue("feed_url"),
|
||||
SiteURL: r.FormValue("site_url"),
|
||||
Title: r.FormValue("title"),
|
||||
ScraperRules: r.FormValue("scraper_rules"),
|
||||
CategoryID: int64(categoryID),
|
||||
}
|
||||
}
|
||||
|
|
1
sql/schema_version_6.sql
Normal file
1
sql/schema_version_6.sql
Normal file
|
@ -0,0 +1 @@
|
|||
alter table feeds add column scraper_rules text default '';
|
|
@ -1,5 +1,5 @@
|
|||
// Code generated by go generate; DO NOT EDIT.
|
||||
// 2017-12-10 18:56:24.36359961 -0800 PST m=+0.005578697
|
||||
// 2017-12-10 20:08:14.411225368 -0800 PST m=+0.004207823
|
||||
|
||||
package sql
|
||||
|
||||
|
@ -136,6 +136,8 @@ alter table users add column entry_direction entry_sorting_direction default 'as
|
|||
fever_token text default '',
|
||||
primary key(user_id)
|
||||
)
|
||||
`,
|
||||
"schema_version_6": `alter table feeds add column scraper_rules text default '';
|
||||
`,
|
||||
}
|
||||
|
||||
|
@ -145,4 +147,5 @@ var SqlMapChecksums = map[string]string{
|
|||
"schema_version_3": "a54745dbc1c51c000f74d4e5068f1e2f43e83309f023415b1749a47d5c1e0f12",
|
||||
"schema_version_4": "216ea3a7d3e1704e40c797b5dc47456517c27dbb6ca98bf88812f4f63d74b5d9",
|
||||
"schema_version_5": "46397e2f5f2c82116786127e9f6a403e975b14d2ca7b652a48cd1ba843e6a27c",
|
||||
"schema_version_6": "9d05b4fb223f0e60efc716add5048b0ca9c37511cf2041721e20505d6d798ce4",
|
||||
}
|
||||
|
|
|
@ -152,7 +152,7 @@ func (e *EntryQueryBuilder) GetEntries() (model.Entries, error) {
|
|||
SELECT
|
||||
e.id, e.user_id, e.feed_id, e.hash, e.published_at at time zone '%s', e.title, e.url, e.author, e.content, e.status,
|
||||
f.title as feed_title, f.feed_url, f.site_url, f.checked_at,
|
||||
f.category_id, c.title as category_title,
|
||||
f.category_id, c.title as category_title, f.scraper_rules,
|
||||
fi.icon_id
|
||||
FROM entries e
|
||||
LEFT JOIN feeds f ON f.id=e.feed_id
|
||||
|
@ -197,6 +197,7 @@ func (e *EntryQueryBuilder) GetEntries() (model.Entries, error) {
|
|||
&entry.Feed.CheckedAt,
|
||||
&entry.Feed.Category.ID,
|
||||
&entry.Feed.Category.Title,
|
||||
&entry.Feed.ScraperRules,
|
||||
&iconID,
|
||||
)
|
||||
|
||||
|
|
|
@ -52,7 +52,7 @@ func (s *Storage) Feeds(userID int64) (model.Feeds, error) {
|
|||
feeds := make(model.Feeds, 0)
|
||||
query := `SELECT
|
||||
f.id, f.feed_url, f.site_url, f.title, f.etag_header, f.last_modified_header,
|
||||
f.user_id, f.checked_at, f.parsing_error_count, f.parsing_error_msg,
|
||||
f.user_id, f.checked_at, f.parsing_error_count, f.parsing_error_msg, f.scraper_rules,
|
||||
f.category_id, c.title as category_title,
|
||||
fi.icon_id
|
||||
FROM feeds f
|
||||
|
@ -84,6 +84,7 @@ func (s *Storage) Feeds(userID int64) (model.Feeds, error) {
|
|||
&feed.CheckedAt,
|
||||
&feed.ParsingErrorCount,
|
||||
&errorMsg,
|
||||
&feed.ScraperRules,
|
||||
&feed.Category.ID,
|
||||
&feed.Category.Title,
|
||||
&iconID,
|
||||
|
@ -122,7 +123,7 @@ func (s *Storage) FeedByID(userID, feedID int64) (*model.Feed, error) {
|
|||
query := `
|
||||
SELECT
|
||||
f.id, f.feed_url, f.site_url, f.title, f.etag_header, f.last_modified_header,
|
||||
f.user_id, f.checked_at, f.parsing_error_count, f.parsing_error_msg,
|
||||
f.user_id, f.checked_at, f.parsing_error_count, f.parsing_error_msg, f.scraper_rules,
|
||||
f.category_id, c.title as category_title
|
||||
FROM feeds f
|
||||
LEFT JOIN categories c ON c.id=f.category_id
|
||||
|
@ -139,6 +140,7 @@ func (s *Storage) FeedByID(userID, feedID int64) (*model.Feed, error) {
|
|||
&feed.CheckedAt,
|
||||
&feed.ParsingErrorCount,
|
||||
&feed.ParsingErrorMsg,
|
||||
&feed.ScraperRules,
|
||||
&feed.Category.ID,
|
||||
&feed.Category.Title,
|
||||
)
|
||||
|
@ -195,8 +197,8 @@ func (s *Storage) UpdateFeed(feed *model.Feed) (err error) {
|
|||
|
||||
query := `UPDATE feeds SET
|
||||
feed_url=$1, site_url=$2, title=$3, category_id=$4, etag_header=$5, last_modified_header=$6, checked_at=$7,
|
||||
parsing_error_msg=$8, parsing_error_count=$9
|
||||
WHERE id=$10 AND user_id=$11`
|
||||
parsing_error_msg=$8, parsing_error_count=$9, scraper_rules=$10
|
||||
WHERE id=$11 AND user_id=$12`
|
||||
|
||||
_, err = s.db.Exec(query,
|
||||
feed.FeedURL,
|
||||
|
@ -208,6 +210,7 @@ func (s *Storage) UpdateFeed(feed *model.Feed) (err error) {
|
|||
feed.CheckedAt,
|
||||
feed.ParsingErrorMsg,
|
||||
feed.ParsingErrorCount,
|
||||
feed.ScraperRules,
|
||||
feed.ID,
|
||||
feed.UserID,
|
||||
)
|
||||
|
|
|
@ -12,7 +12,7 @@ import (
|
|||
"github.com/miniflux/miniflux2/sql"
|
||||
)
|
||||
|
||||
const schemaVersion = 5
|
||||
const schemaVersion = 6
|
||||
|
||||
// Migrate run database migrations.
|
||||
func (s *Storage) Migrate() {
|
||||
|
|
Loading…
Reference in a new issue