Add the possibility to enable the crawler for feeds

Frédéric Guillot 2017-12-12 19:19:36 -08:00
parent 33445e5b68
commit ef097f02fe
22 changed files with 77 additions and 25 deletions


@@ -1,5 +1,5 @@
 // Code generated by go generate; DO NOT EDIT.
-// 2017-12-11 22:04:47.860104328 -0800 PST m=+0.042425898
+// 2017-12-12 19:14:08.438401734 -0800 PST m=+0.020484380
 package locale

Binary file not shown.


@@ -24,6 +24,7 @@ type Feed struct {
 	ParsingErrorCount int        `json:"parsing_error_count,omitempty"`
 	ScraperRules      string     `json:"scraper_rules"`
 	RewriteRules      string     `json:"rewrite_rules"`
+	Crawler           bool       `json:"crawler"`
 	Category          *Category  `json:"category,omitempty"`
 	Entries           Entries    `json:"entries,omitempty"`
 	Icon              *FeedIcon  `json:"icon,omitempty"`
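
Because the new Crawler field carries no omitempty tag, it always appears in serialized feeds. A minimal sketch of the resulting JSON shape (the struct below is an illustrative trimmed-down copy of model.Feed, not the full type):

package main

import (
	"encoding/json"
	"fmt"
)

// Illustrative subset of model.Feed, reduced to the fields shown above.
type Feed struct {
	ScraperRules string `json:"scraper_rules"`
	RewriteRules string `json:"rewrite_rules"`
	Crawler      bool   `json:"crawler"`
}

func main() {
	out, _ := json.Marshal(Feed{Crawler: true})
	fmt.Println(string(out)) // {"scraper_rules":"","rewrite_rules":"","crawler":true}
}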


@@ -33,7 +33,7 @@ type Handler struct {
 }
 
 // CreateFeed fetch, parse and store a new feed.
-func (h *Handler) CreateFeed(userID, categoryID int64, url string) (*model.Feed, error) {
+func (h *Handler) CreateFeed(userID, categoryID int64, url string, crawler bool) (*model.Feed, error) {
 	defer helper.ExecutionTime(time.Now(), fmt.Sprintf("[Handler:CreateFeed] feedUrl=%s", url))
 
 	if !h.store.CategoryExists(userID, categoryID) {
@@ -65,6 +65,7 @@ func (h *Handler) CreateFeed(userID, categoryID int64, url string) (*model.Feed,
 	}
 
 	feedProcessor := processor.NewFeedProcessor(subscription)
+	feedProcessor.WithCrawler(crawler)
 	feedProcessor.Process()
 
 	subscription.Category = &model.Category{ID: categoryID}
@@ -72,6 +73,7 @@ func (h *Handler) CreateFeed(userID, categoryID int64, url string) (*model.Feed,
 	subscription.LastModifiedHeader = response.LastModified
 	subscription.FeedURL = response.EffectiveURL
 	subscription.UserID = userID
+	subscription.Crawler = crawler
 
 	err = h.store.CreateFeed(subscription)
 	if err != nil {
@@ -143,6 +145,7 @@ func (h *Handler) RefreshFeed(userID, feedID int64) error {
 	feedProcessor := processor.NewFeedProcessor(subscription)
 	feedProcessor.WithScraperRules(originalFeed.ScraperRules)
 	feedProcessor.WithRewriteRules(originalFeed.RewriteRules)
+	feedProcessor.WithCrawler(originalFeed.Crawler)
 	feedProcessor.Process()
 
 	originalFeed.EtagHeader = response.ETag
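
CreateFeed and RefreshFeed both thread the flag into the processor, so the crawler setting takes effect at subscription time and again on every refresh. A hedged usage sketch of the new signature (the helper below is illustrative, not part of this commit):

// subscribeWithCrawler creates a feed with the crawler enabled, so each new
// entry's content is fetched from the original page rather than the feed payload.
func subscribeWithCrawler(h *feed.Handler, userID, categoryID int64, url string) (*model.Feed, error) {
	return h.CreateFeed(userID, categoryID, url, true) // final bool is the new crawler flag
}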


@@ -5,9 +5,12 @@
 package processor
 
 import (
+	"log"
+
 	"github.com/miniflux/miniflux2/model"
 	"github.com/miniflux/miniflux2/reader/rewrite"
 	"github.com/miniflux/miniflux2/reader/sanitizer"
+	"github.com/miniflux/miniflux2/reader/scraper"
 )
 
 // FeedProcessor handles the processing of feed contents.
@@ -15,6 +18,12 @@ type FeedProcessor struct {
 	feed         *model.Feed
 	scraperRules string
 	rewriteRules string
+	crawler      bool
 }
 
+// WithCrawler enables the crawler.
+func (f *FeedProcessor) WithCrawler(value bool) {
+	f.crawler = value
+}
+
 // WithScraperRules adds scraper rules to the processing.
@@ -30,6 +39,15 @@ func (f *FeedProcessor) WithRewriteRules(rules string) {
 // Process applies rewrite and scraper rules.
 func (f *FeedProcessor) Process() {
 	for _, entry := range f.feed.Entries {
+		if f.crawler {
+			content, err := scraper.Fetch(entry.URL, f.scraperRules)
+			if err != nil {
+				log.Println("[FeedProcessor]", err)
+			} else {
+				entry.Content = content
+			}
+		}
+
 		entry.Content = sanitizer.Sanitize(entry.URL, entry.Content)
 		entry.Content = rewrite.Rewriter(entry.URL, entry.Content, f.rewriteRules)
 	}
@@ -37,5 +55,5 @@ func (f *FeedProcessor) Process() {
 
 // NewFeedProcessor returns a new FeedProcessor.
 func NewFeedProcessor(feed *model.Feed) *FeedProcessor {
-	return &FeedProcessor{feed: feed}
+	return &FeedProcessor{feed: feed, crawler: false}
 }
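
Note the ordering inside Process: when the crawler is on, the scraped page replaces entry.Content first, and the sanitizer and rewrite rules then run on whichever content is present, so crawled pages get the same cleanup as feed-supplied content; on a fetch error the processor only logs and keeps the original entry. A sketch of the resulting wiring (mirrors the handler code above; the subscription value and rules string are illustrative):

feedProcessor := processor.NewFeedProcessor(subscription)
feedProcessor.WithCrawler(true)           // fetch each entry's original page
feedProcessor.WithScraperRules("article") // optional, forwarded to scraper.Fetch
feedProcessor.Process()                   // scrape, then sanitize, then rewrite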


@@ -13,7 +13,6 @@ import (
 	"github.com/PuerkitoBio/goquery"
 	"github.com/miniflux/miniflux2/http"
 	"github.com/miniflux/miniflux2/reader/readability"
-	"github.com/miniflux/miniflux2/reader/sanitizer"
 	"github.com/miniflux/miniflux2/url"
 )
 
@@ -34,11 +33,11 @@ func Fetch(websiteURL, rules string) (string, error) {
 		return "", err
 	}
 
-	var content string
 	if rules == "" {
 		rules = getPredefinedScraperRules(websiteURL)
 	}
 
+	var content string
 	if rules != "" {
 		log.Printf(`[Scraper] Using rules "%s" for "%s"`, rules, websiteURL)
 		content, err = scrapContent(page, rules)
@@ -51,7 +50,7 @@ func Fetch(websiteURL, rules string) (string, error) {
 		return "", err
 	}
 
-	return sanitizer.Sanitize(websiteURL, content), nil
+	return content, nil
 }
 
 func scrapContent(page io.Reader, rules string) (string, error) {
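
Fetch no longer sanitizes its result: it returns the raw scraped markup, and every caller is now responsible for sanitizing against the entry URL (the FeedProcessor above and the UI controller below both do this). An illustrative call under the new contract:

content, err := scraper.Fetch(entry.URL, feed.ScraperRules)
if err == nil {
	// Sanitization moved out of Fetch, so apply it explicitly before storing.
	entry.Content = sanitizer.Sanitize(entry.URL, content)
}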


@@ -20,7 +20,7 @@ func (c *Controller) CreateFeed(ctx *core.Context, request *core.Request, respon
 		return
 	}
 
-	feed, err := c.feedHandler.CreateFeed(userID, categoryID, feedURL)
+	feed, err := c.feedHandler.CreateFeed(userID, categoryID, feedURL, false)
 	if err != nil {
 		response.JSON().ServerError(errors.New("Unable to create this feed"))
 		return


@@ -1,5 +1,5 @@
 // Code generated by go generate; DO NOT EDIT.
-// 2017-12-11 22:04:47.859021405 -0800 PST m=+0.041342975
+// 2017-12-12 19:14:08.437349475 -0800 PST m=+0.019432121
 package template


@@ -36,6 +36,8 @@
 	{{ end }}
 	</select>
 
+	<label><input type="checkbox" name="crawler" value="1" {{ if .form.Crawler }}checked{{ end }}> {{ t "Fetch original content" }}</label>
+
 	<div class="buttons">
 		<button type="submit" class="button button-primary" data-label-loading="{{ t "Loading..." }}">{{ t "Find a subscription" }}</button>
 	</div>


@@ -29,6 +29,9 @@
 	</div>
 	{{ end }}
 
+	<br>
+	<label><input type="checkbox" name="crawler" value="1" {{ if .form.Crawler }}checked{{ end }}> {{ t "Fetch original content" }}</label>
+
 	<div class="buttons">
 		<button type="submit" class="button button-primary" data-label-loading="{{ t "Loading..." }}">{{ t "Subscribe" }}</button>
 	</div>


@@ -58,6 +58,8 @@
 	{{ end }}
 	</select>
 
+	<label><input type="checkbox" name="crawler" value="1" {{ if .form.Crawler }}checked{{ end }}> {{ t "Fetch original content" }}</label>
+
 	<div class="buttons">
 		<button type="submit" class="button button-primary" data-label-loading="{{ t "Loading..." }}">{{ t "Update" }}</button> {{ t "or" }} <a href="{{ route "feeds" }}">{{ t "cancel" }}</a>
 	</div>


@@ -1,5 +1,5 @@
 // Code generated by go generate; DO NOT EDIT.
-// 2017-12-11 22:04:47.843652435 -0800 PST m=+0.025974005
+// 2017-12-12 19:14:08.427613446 -0800 PST m=+0.009696092
 package template
@@ -83,6 +83,8 @@ var templateViewsMap = map[string]string{
 	{{ end }}
 	</select>
 
+	<label><input type="checkbox" name="crawler" value="1" {{ if .form.Crawler }}checked{{ end }}> {{ t "Fetch original content" }}</label>
+
 	<div class="buttons">
 		<button type="submit" class="button button-primary" data-label-loading="{{ t "Loading..." }}">{{ t "Find a subscription" }}</button>
 	</div>
@@ -238,6 +240,9 @@ var templateViewsMap = map[string]string{
 	</div>
 	{{ end }}
 
+	<br>
+	<label><input type="checkbox" name="crawler" value="1" {{ if .form.Crawler }}checked{{ end }}> {{ t "Fetch original content" }}</label>
+
 	<div class="buttons">
 		<button type="submit" class="button button-primary" data-label-loading="{{ t "Loading..." }}">{{ t "Subscribe" }}</button>
 	</div>
@@ -408,6 +413,8 @@ var templateViewsMap = map[string]string{
 	{{ end }}
 	</select>
 
+	<label><input type="checkbox" name="crawler" value="1" {{ if .form.Crawler }}checked{{ end }}> {{ t "Fetch original content" }}</label>
+
 	<div class="buttons">
 		<button type="submit" class="button button-primary" data-label-loading="{{ t "Loading..." }}">{{ t "Update" }}</button> {{ t "or" }} <a href="{{ route "feeds" }}">{{ t "cancel" }}</a>
 	</div>
@@ -1180,14 +1187,14 @@ var templateViewsMap = map[string]string{
 var templateViewsMapChecksums = map[string]string{
 	"about":               "ad2fb778fc73c39b733b3f81b13e5c7d689b041fadd24ee2d4577f545aa788ad",
-	"add_subscription":    "098ea9e492e18242bd414b22c4d8638006d113f728e5ae78c9186663f60ae3f1",
+	"add_subscription":    "053c920b0d7e109ea19dce6a448e304ce720db8633588ea04db16677f7209a7b",
 	"categories":          "ca1280cd157bb527d4fc907da67b05a8347378f6dce965b9389d4bcdf3600a11",
 	"category_entries":    "951cdacf38fcaed5cdd63a00dc800e26039236b94b556a68e4409012b0095ece",
-	"choose_subscription": "d37682743d8bbd84738a964e238103db2651f95fa340c6e285ffe2e12548d673",
+	"choose_subscription": "a325f9c976ca2b2dc148e25c8fef0cf6ccab0e04e86e604e7812bb18dc4cdde1",
 	"create_category":     "2b82af5d2dcd67898dc5daa57a6461e6ff8121a6089b2a2a1be909f35e4a2275",
 	"create_user":         "45e226df757126d5fe7c464e295e9a34f07952cfdb71e31e49839850d35af139",
 	"edit_category":       "cee720faadcec58289b707ad30af623d2ee66c1ce23a732965463250d7ff41c5",
-	"edit_feed":           "e33e64de5e2b9c12580e693d048c2fab907968d4e7cddb2055d0251efc5b75e4",
+	"edit_feed":           "7e78f0821312557ca05eb840fd52bcb60509c6da205e8ffce11eb08f65ae143d",
 	"edit_user":           "82d9749d76ddbd2352816d813c4b1f6d92f2222de678b4afe5821090246735c7",
 	"entry":               "ebcf9bb35812dd02759718f7f7411267e6a6c8efd59a9aa0a0e735bcb88efeff",
 	"feed_entries":        "547c19eb36b20e350ce70ed045173b064cdcd6b114afb241c9f2dda9d88fcc27",


@@ -8,6 +8,8 @@ import (
 	"errors"
 	"log"
 
+	"github.com/miniflux/miniflux2/reader/sanitizer"
+
 	"github.com/miniflux/miniflux2/integration"
 	"github.com/miniflux/miniflux2/model"
 	"github.com/miniflux/miniflux2/reader/scraper"
@@ -46,10 +48,10 @@ func (c *Controller) FetchContent(ctx *core.Context, request *core.Request, resp
 		return
 	}
 
-	entry.Content = content
+	entry.Content = sanitizer.Sanitize(entry.URL, content)
 	c.store.UpdateEntryContent(entry)
 
-	response.JSON().Created(map[string]string{"content": content})
+	response.JSON().Created(map[string]string{"content": entry.Content})
 }
 
 // SaveEntry send the link to external services.
// SaveEntry send the link to external services.


@@ -222,6 +222,7 @@ func (c *Controller) getFeedFormTemplateArgs(ctx *core.Context, user *model.User
 			Title:        feed.Title,
 			ScraperRules: feed.ScraperRules,
 			RewriteRules: feed.RewriteRules,
+			Crawler:      feed.Crawler,
 			CategoryID:   feed.Category.ID,
 		}
 	} else {
} else {


@@ -80,7 +80,7 @@ func (c *Controller) SubmitSubscription(ctx *core.Context, request *core.Request
 			"errorMessage": "Unable to find any subscription.",
 		}))
 	case n == 1:
-		feed, err := c.feedHandler.CreateFeed(user.ID, subscriptionForm.CategoryID, subscriptions[0].URL)
+		feed, err := c.feedHandler.CreateFeed(user.ID, subscriptionForm.CategoryID, subscriptions[0].URL, subscriptionForm.Crawler)
 		if err != nil {
 			response.HTML().Render("add_subscription", args.Merge(tplParams{
 				"form": subscriptionForm,
@@ -117,7 +117,7 @@ func (c *Controller) ChooseSubscription(ctx *core.Context, request *core.Request
 		return
 	}
 
-	feed, err := c.feedHandler.CreateFeed(user.ID, subscriptionForm.CategoryID, subscriptionForm.URL)
+	feed, err := c.feedHandler.CreateFeed(user.ID, subscriptionForm.CategoryID, subscriptionForm.URL, subscriptionForm.Crawler)
 	if err != nil {
 		response.HTML().Render("add_subscription", args.Merge(tplParams{
 			"form": subscriptionForm,


@@ -19,6 +19,7 @@ type FeedForm struct {
 	Title        string
 	ScraperRules string
 	RewriteRules string
+	Crawler      bool
 	CategoryID   int64
 }
 
@@ -38,6 +39,7 @@ func (f FeedForm) Merge(feed *model.Feed) *model.Feed {
 	feed.FeedURL = f.FeedURL
 	feed.ScraperRules = f.ScraperRules
 	feed.RewriteRules = f.RewriteRules
+	feed.Crawler = f.Crawler
 	feed.ParsingErrorCount = 0
 	feed.ParsingErrorMsg = ""
 	return feed
@@ -56,6 +58,7 @@ func NewFeedForm(r *http.Request) *FeedForm {
 		Title:        r.FormValue("title"),
 		ScraperRules: r.FormValue("scraper_rules"),
 		RewriteRules: r.FormValue("rewrite_rules"),
+		Crawler:      r.FormValue("crawler") == "1",
 		CategoryID:   int64(categoryID),
 	}
 }
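
The checkbox contract here is the usual HTML one: a checked box submits crawler=1, an unchecked box submits nothing, so FormValue returns the empty string and the comparison yields false. A minimal illustration of the mapping used above:

// r.FormValue("crawler") is "1" when the template's checkbox is checked, "" otherwise.
crawler := r.FormValue("crawler") == "1" // true only for a checked box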


@@ -15,6 +15,7 @@ import (
 type SubscriptionForm struct {
 	URL        string
 	CategoryID int64
+	Crawler    bool
 }
 
 // Validate makes sure the form values are valid.
@@ -35,6 +36,7 @@ func NewSubscriptionForm(r *http.Request) *SubscriptionForm {
 	return &SubscriptionForm{
 		URL:        r.FormValue("url"),
+		Crawler:    r.FormValue("crawler") == "1",
 		CategoryID: int64(categoryID),
 	}
 }

sql/schema_version_8.sql (new file)

@@ -0,0 +1 @@
+alter table feeds add column crawler boolean default 'f';


@@ -1,5 +1,5 @@
 // Code generated by go generate; DO NOT EDIT.
-// 2017-12-11 22:04:47.821813568 -0800 PST m=+0.004135138
+// 2017-12-12 19:14:08.420562729 -0800 PST m=+0.002645375
 package sql
@@ -140,6 +140,8 @@ alter table users add column entry_direction entry_sorting_direction default 'as
 	"schema_version_6": `alter table feeds add column scraper_rules text default '';
 `,
 	"schema_version_7": `alter table feeds add column rewrite_rules text default '';
 `,
+	"schema_version_8": `alter table feeds add column crawler boolean default 'f';
+`,
 }
@@ -151,4 +153,5 @@ var SqlMapChecksums = map[string]string{
 	"schema_version_5": "46397e2f5f2c82116786127e9f6a403e975b14d2ca7b652a48cd1ba843e6a27c",
 	"schema_version_6": "9d05b4fb223f0e60efc716add5048b0ca9c37511cf2041721e20505d6d798ce4",
 	"schema_version_7": "33f298c9aa30d6de3ca28e1270df51c2884d7596f1283a75716e2aeb634cd05c",
+	"schema_version_8": "9922073fc4032d8922617ec6a6a07ae8d4817846c138760fb96cb5608ab83bfc",
 }


@@ -152,7 +152,7 @@ func (e *EntryQueryBuilder) GetEntries() (model.Entries, error) {
 		SELECT
 		e.id, e.user_id, e.feed_id, e.hash, e.published_at at time zone '%s', e.title, e.url, e.author, e.content, e.status,
 		f.title as feed_title, f.feed_url, f.site_url, f.checked_at,
-		f.category_id, c.title as category_title, f.scraper_rules, f.rewrite_rules,
+		f.category_id, c.title as category_title, f.scraper_rules, f.rewrite_rules, f.crawler,
 		fi.icon_id
 		FROM entries e
 		LEFT JOIN feeds f ON f.id=e.feed_id
@@ -199,6 +199,7 @@ func (e *EntryQueryBuilder) GetEntries() (model.Entries, error) {
 			&entry.Feed.Category.Title,
 			&entry.Feed.ScraperRules,
 			&entry.Feed.RewriteRules,
+			&entry.Feed.Crawler,
 			&iconID,
 		)


@@ -53,7 +53,7 @@ func (s *Storage) Feeds(userID int64) (model.Feeds, error) {
 	query := `SELECT
 		f.id, f.feed_url, f.site_url, f.title, f.etag_header, f.last_modified_header,
 		f.user_id, f.checked_at, f.parsing_error_count, f.parsing_error_msg,
-		f.scraper_rules, f.rewrite_rules,
+		f.scraper_rules, f.rewrite_rules, f.crawler,
 		f.category_id, c.title as category_title,
 		fi.icon_id
 		FROM feeds f
@@ -87,6 +87,7 @@ func (s *Storage) Feeds(userID int64) (model.Feeds, error) {
 			&errorMsg,
 			&feed.ScraperRules,
 			&feed.RewriteRules,
+			&feed.Crawler,
 			&feed.Category.ID,
 			&feed.Category.Title,
 			&iconID,
@@ -126,7 +127,7 @@ func (s *Storage) FeedByID(userID, feedID int64) (*model.Feed, error) {
 		SELECT
 		f.id, f.feed_url, f.site_url, f.title, f.etag_header, f.last_modified_header,
 		f.user_id, f.checked_at, f.parsing_error_count, f.parsing_error_msg,
-		f.scraper_rules, f.rewrite_rules,
+		f.scraper_rules, f.rewrite_rules, f.crawler,
 		f.category_id, c.title as category_title
 		FROM feeds f
 		LEFT JOIN categories c ON c.id=f.category_id
@@ -145,6 +146,7 @@ func (s *Storage) FeedByID(userID, feedID int64) (*model.Feed, error) {
 		&feed.ParsingErrorMsg,
 		&feed.ScraperRules,
 		&feed.RewriteRules,
+		&feed.Crawler,
 		&feed.Category.ID,
 		&feed.Category.Title,
 	)
@@ -164,8 +166,8 @@ func (s *Storage) CreateFeed(feed *model.Feed) error {
 	defer helper.ExecutionTime(time.Now(), fmt.Sprintf("[Storage:CreateFeed] feedURL=%s", feed.FeedURL))
 	sql := `
 		INSERT INTO feeds
-		(feed_url, site_url, title, category_id, user_id, etag_header, last_modified_header)
-		VALUES ($1, $2, $3, $4, $5, $6, $7)
+		(feed_url, site_url, title, category_id, user_id, etag_header, last_modified_header, crawler)
+		VALUES ($1, $2, $3, $4, $5, $6, $7, $8)
 		RETURNING id
 	`
@@ -178,6 +180,7 @@ func (s *Storage) CreateFeed(feed *model.Feed) error {
 		feed.UserID,
 		feed.EtagHeader,
 		feed.LastModifiedHeader,
+		feed.Crawler,
 	).Scan(&feed.ID)
 
 	if err != nil {
 		return fmt.Errorf("unable to create feed: %v", err)
@@ -201,8 +204,8 @@ func (s *Storage) UpdateFeed(feed *model.Feed) (err error) {
 	query := `UPDATE feeds SET
 		feed_url=$1, site_url=$2, title=$3, category_id=$4, etag_header=$5, last_modified_header=$6, checked_at=$7,
-		parsing_error_msg=$8, parsing_error_count=$9, scraper_rules=$10, rewrite_rules=$11
-		WHERE id=$12 AND user_id=$13`
+		parsing_error_msg=$8, parsing_error_count=$9, scraper_rules=$10, rewrite_rules=$11, crawler=$12
+		WHERE id=$13 AND user_id=$14`
 
 	_, err = s.db.Exec(query,
 		feed.FeedURL,
@@ -216,6 +219,7 @@ func (s *Storage) UpdateFeed(feed *model.Feed) (err error) {
 		feed.ParsingErrorCount,
 		feed.ScraperRules,
 		feed.RewriteRules,
+		feed.Crawler,
 		feed.ID,
 		feed.UserID,
 	)
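
Since UpdateFeed now persists the flag (note the WHERE placeholders renumbered from $12/$13 to $13/$14), toggling the crawler on an existing feed is a plain load-modify-save. A hedged sketch, assuming the feed was loaded through FeedByID above:

feed, err := store.FeedByID(userID, feedID)
if err == nil {
	feed.Crawler = true // enable original-content fetching for this feed
	err = store.UpdateFeed(feed)
}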


@@ -12,7 +12,7 @@ import (
 	"github.com/miniflux/miniflux2/sql"
 )
 
-const schemaVersion = 7
+const schemaVersion = 8
 
 // Migrate run database migrations.
 func (s *Storage) Migrate() {