// Copyright 2018 Frédéric Guillot. All rights reserved.
// Use of this source code is governed by the Apache 2.0
// license that can be found in the LICENSE file.

package processor

import (
	"errors"
	"fmt"
	"math"
	"regexp"
	"strconv"
	"strings"
	"time"
	"unicode/utf8"

	"miniflux.app/config"
	"miniflux.app/http/client"
	"miniflux.app/integration"
	"miniflux.app/logger"
	"miniflux.app/metric"
	"miniflux.app/model"
	"miniflux.app/reader/browser"
	"miniflux.app/reader/rewrite"
	"miniflux.app/reader/sanitizer"
	"miniflux.app/reader/scraper"
	"miniflux.app/storage"

	"github.com/PuerkitoBio/goquery"
	"github.com/rylans/getlang"
)

var (
	// youtubeRegex captures the video ID from YouTube watch URLs.
	youtubeRegex = regexp.MustCompile(`youtube\.com/watch\?v=(.*)`)
	// iso8601Regex parses ISO 8601 durations such as "PT1H30M5S".
	iso8601Regex = regexp.MustCompile(`^P((?P<year>\d+)Y)?((?P<month>\d+)M)?((?P<week>\d+)W)?((?P<day>\d+)D)?(T((?P<hour>\d+)H)?((?P<minute>\d+)M)?((?P<second>\d+)S)?)?$`)
)

// ProcessFeedEntries downloads the original web page for entries and applies filters.
func ProcessFeedEntries(store *storage.Storage, feed *model.Feed) {
	var filteredEntries model.Entries

	for _, entry := range feed.Entries {
		logger.Debug("[Processor] Processing entry %q from feed %q", entry.URL, feed.FeedURL)

		if isBlockedEntry(feed, entry) || !isAllowedEntry(feed, entry) {
			continue
		}

		entryIsNew := !store.EntryURLExists(feed.ID, entry.URL)
		if feed.Crawler && entryIsNew {
			logger.Debug("[Processor] Crawling entry %q from feed %q", entry.URL, feed.FeedURL)

			startTime := time.Now()
			content, scraperErr := scraper.Fetch(
				entry.URL,
				feed.ScraperRules,
				feed.UserAgent,
				feed.Cookie,
				feed.AllowSelfSignedCertificates,
				feed.FetchViaProxy,
			)

			if config.Opts.HasMetricsCollector() {
				status := "success"
				if scraperErr != nil {
					status = "error"
				}
				metric.ScraperRequestDuration.WithLabelValues(status).Observe(time.Since(startTime).Seconds())
			}

			if scraperErr != nil {
				logger.Error(`[Processor] Unable to crawl this entry: %q => %v`, entry.URL, scraperErr)
			} else if content != "" {
				// We replace the entry content only if the scraper doesn't return any error.
				entry.Content = content
			}
		}

		entry.Content = rewrite.Rewriter(entry.URL, entry.Content, feed.RewriteRules)

		// The sanitizer should always run at the end of the process
		// to make sure unsafe HTML is filtered out.
		entry.Content = sanitizer.Sanitize(entry.URL, entry.Content)

		if entryIsNew {
			intg, err := store.Integration(feed.UserID)
			if err != nil {
				logger.Error("[Processor] Fetching integrations for user %d failed: %v; the refresh process will go on, but no integrations will run this time.", feed.UserID, err)
			} else if intg != nil {
				// Copy the loop variable before handing it to the goroutine.
				localEntry := entry
				go func() {
					integration.PushEntry(localEntry, intg)
				}()
			}
		}

		updateEntryReadingTime(store, feed, entry, entryIsNew)
		filteredEntries = append(filteredEntries, entry)
	}

	feed.Entries = filteredEntries
}

// isBlockedEntry returns true when the entry title matches the feed's blocklist rules.
func isBlockedEntry(feed *model.Feed, entry *model.Entry) bool {
	if feed.BlocklistRules != "" {
		match, _ := regexp.MatchString(feed.BlocklistRules, entry.Title)
		if match {
			logger.Debug("[Processor] Blocking entry %q from feed %q based on rule %q", entry.Title, feed.FeedURL, feed.BlocklistRules)
			return true
		}
	}
	return false
}

// isAllowedEntry returns true when the feed has no keeplist rules,
// or when the entry title matches them.
func isAllowedEntry(feed *model.Feed, entry *model.Entry) bool {
	if feed.KeeplistRules != "" {
		match, _ := regexp.MatchString(feed.KeeplistRules, entry.Title)
		if match {
			logger.Debug("[Processor] Allowing entry %q from feed %q based on rule %q", entry.Title, feed.FeedURL, feed.KeeplistRules)
			return true
		}
		return false
	}
	return true
}
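
// Example (illustrative patterns, not shipped defaults): setting
// BlocklistRules to `(?i)sponsored` blocks any entry whose title
// contains "sponsored", case-insensitively, while setting KeeplistRules
// to `Go|Rust` keeps only entries whose title mentions Go or Rust.
// Both fields use Go's RE2 regular-expression syntax, and matching is
// unanchored, so a pattern matches anywhere in the title.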

// ProcessEntryWebPage downloads the entry web page and applies rewrite rules.
func ProcessEntryWebPage(feed *model.Feed, entry *model.Entry) error {
	startTime := time.Now()
	content, scraperErr := scraper.Fetch(
		entry.URL,
		entry.Feed.ScraperRules,
		entry.Feed.UserAgent,
		entry.Feed.Cookie,
		feed.AllowSelfSignedCertificates,
		feed.FetchViaProxy,
	)

	if config.Opts.HasMetricsCollector() {
		status := "success"
		if scraperErr != nil {
			status = "error"
		}
		metric.ScraperRequestDuration.WithLabelValues(status).Observe(time.Since(startTime).Seconds())
	}

	if scraperErr != nil {
		return scraperErr
	}

	content = rewrite.Rewriter(entry.URL, content, entry.Feed.RewriteRules)
	content = sanitizer.Sanitize(entry.URL, content)

	if content != "" {
		entry.Content = content
		entry.ReadingTime = calculateReadingTime(content)
	}

	return nil
}

// updateEntryReadingTime sets the entry's reading time, either from the
// YouTube video duration or from an estimate based on the entry content.
func updateEntryReadingTime(store *storage.Storage, feed *model.Feed, entry *model.Entry, entryIsNew bool) {
	if shouldFetchYouTubeWatchTime(entry) {
		if entryIsNew {
			watchTime, err := fetchYouTubeWatchTime(entry.URL)
			if err != nil {
				logger.Error("[Processor] Unable to fetch YouTube watch time: %q => %v", entry.URL, err)
			}
			entry.ReadingTime = watchTime
		} else {
			entry.ReadingTime = store.GetReadTime(entry, feed)
		}
	}

	// Handle the YouTube error case and non-YouTube entries.
	if entry.ReadingTime == 0 {
		entry.ReadingTime = calculateReadingTime(entry.Content)
	}
}

// shouldFetchYouTubeWatchTime returns true when watch-time fetching is
// enabled and the entry URL points to a YouTube video.
func shouldFetchYouTubeWatchTime(entry *model.Entry) bool {
	if !config.Opts.FetchYouTubeWatchTime() {
		return false
	}
	matches := youtubeRegex.FindStringSubmatch(entry.URL)
	urlMatchesYouTubePattern := len(matches) == 2
	return urlMatchesYouTubePattern
}
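
// For example, a URL like "https://www.youtube.com/watch?v=abc123" (an
// illustrative ID) produces two submatches: the full match plus the
// captured video ID, so FindStringSubmatch returns a slice of length 2.
// Note that the pattern is deliberately loose: any query parameters
// that follow "v=" are captured as part of the ID.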

// fetchYouTubeWatchTime downloads the YouTube watch page and extracts
// the video duration, in minutes, from its metadata.
func fetchYouTubeWatchTime(url string) (int, error) {
	clt := client.NewClientWithConfig(url, config.Opts)
	response, browserErr := browser.Exec(clt)
	if browserErr != nil {
		return 0, browserErr
	}

	doc, docErr := goquery.NewDocumentFromReader(response.Body)
	if docErr != nil {
		return 0, docErr
	}

	durs, exists := doc.Find(`meta[itemprop="duration"]`).First().Attr("content")
	if !exists {
		return 0, errors.New("duration not found")
	}

	dur, err := parseISO8601(durs)
	if err != nil {
		return 0, fmt.Errorf("unable to parse duration %s: %v", durs, err)
	}

	return int(dur.Minutes()), nil
}
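
// The selector above assumes the watch page exposes the video length as
// a schema.org duration in ISO 8601 form, for example:
//
//	<meta itemprop="duration" content="PT4M13S">
//
// which parseISO8601 below converts to 4m13s, reported here as 4 minutes.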

// parseISO8601 parses an ISO 8601 duration string. Only the time
// components (hours, minutes, seconds) are supported; date components
// matched by the regex (years, months, weeks, days) are rejected.
func parseISO8601(from string) (time.Duration, error) {
	var match []string
	var d time.Duration

	if iso8601Regex.MatchString(from) {
		match = iso8601Regex.FindStringSubmatch(from)
	} else {
		return 0, errors.New("could not parse duration string")
	}

	for i, name := range iso8601Regex.SubexpNames() {
		part := match[i]
		if i == 0 || name == "" || part == "" {
			continue
		}

		val, err := strconv.ParseInt(part, 10, 64)
		if err != nil {
			return 0, err
		}

		switch name {
		case "hour":
			d += time.Duration(val) * time.Hour
		case "minute":
			d += time.Duration(val) * time.Minute
		case "second":
			d += time.Duration(val) * time.Second
		default:
			return 0, fmt.Errorf("unknown field %s", name)
		}
	}

	return d, nil
}
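
// For instance:
//
//	d, err := parseISO8601("PT1H30M5S") // d == 1h30m5s, err == nil
//
// A string with a date component, such as "P1DT2H", matches the regex
// but fails in the switch above with an "unknown field" error.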

// calculateReadingTime estimates the reading time of the content, in minutes.
func calculateReadingTime(content string) int {
	sanitizedContent := sanitizer.StripTags(content)
	languageInfo := getlang.FromString(sanitizedContent)

	var timeToReadInt int
	// getlang reports ISO 639-1 codes ("ja" for Japanese, not "jp");
	// CJK languages are counted by characters rather than words.
	if languageInfo.LanguageCode() == "ko" || languageInfo.LanguageCode() == "zh" || languageInfo.LanguageCode() == "ja" {
		timeToReadInt = int(math.Ceil(float64(utf8.RuneCountInString(sanitizedContent)) / 500))
	} else {
		nbOfWords := len(strings.Fields(sanitizedContent))
		timeToReadInt = int(math.Ceil(float64(nbOfWords) / 265))
	}

	return timeToReadInt
}
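
// As a worked example of the estimate above: a 1,000-word English
// article gives ceil(1000 / 265) = 4 minutes, while a 2,500-character
// Japanese article gives ceil(2500 / 500) = 5 minutes. The divisors are
// the reading speeds this function assumes: 265 words per minute, or
// 500 characters per minute for character-counted languages.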