miniflux/internal/reader/processor/processor.go

// SPDX-FileCopyrightText: Copyright The Miniflux Authors. All rights reserved.
// SPDX-License-Identifier: Apache-2.0

package processor

import (
	"errors"
	"fmt"
	"math"
	"regexp"
	"strconv"
	"strings"
	"time"
	"unicode/utf8"

	"miniflux.app/v2/internal/config"
	"miniflux.app/v2/internal/http/client"
	"miniflux.app/v2/internal/logger"
	"miniflux.app/v2/internal/metric"
	"miniflux.app/v2/internal/model"
	"miniflux.app/v2/internal/reader/browser"
	"miniflux.app/v2/internal/reader/rewrite"
	"miniflux.app/v2/internal/reader/sanitizer"
	"miniflux.app/v2/internal/reader/scraper"
	"miniflux.app/v2/internal/storage"

	"github.com/PuerkitoBio/goquery"
	"github.com/rylans/getlang"
)

// Regular expressions used by the processor, compiled once at package initialization.
//
//   - youtubeRegex matches URLs containing "youtube.com/watch?v=" and captures
//     everything that follows the "v=" parameter.
//   - odyseeRegex matches URLs that start with "https://odysee.com".
//   - iso8601Regex matches an ISO 8601 duration (for example "PT1H2M3S") and
//     exposes each component through a named group (year, month, week, day,
//     hour, minute, second).
//   - customReplaceRuleRegex matches a rule written as rewrite("search"|"replace")
//     and captures the search and replace terms.
var (
	youtubeRegex           = regexp.MustCompile(`youtube\.com/watch\?v=(.*)`)
	odyseeRegex            = regexp.MustCompile(`^https://odysee\.com`)
	iso8601Regex           = regexp.MustCompile(`^P((?P<year>\d+)Y)?((?P<month>\d+)M)?((?P<week>\d+)W)?((?P<day>\d+)D)?(T((?P<hour>\d+)H)?((?P<minute>\d+)M)?((?P<second>\d+)S)?)?$`)
	customReplaceRuleRegex = regexp.MustCompile(`rewrite\("(.*)"\|"(.*)"\)`)
)

// ProcessFeedEntries filters the entries of the given feed and prepares the
// ones that are kept: the original web page is optionally crawled, rewrite
// rules are applied, the content is sanitized and the reading time is set.
//
// feed.Entries is replaced by the kept entries, ordered from the last element
// of the original slice to the first one. When forceRefresh is true, entries
// already known by the store are crawled again, provided the feed crawler is
// enabled.
func ProcessFeedEntries(store *storage.Storage, feed *model.Feed, user *model.User, forceRefresh bool) {
	var keptEntries model.Entries

	// Walk the slice backwards so that older entries are processed first.
	for idx := len(feed.Entries) - 1; idx >= 0; idx-- {
		item := feed.Entries[idx]

		logger.Debug("[Processor] Processing entry %q from feed %q", item.URL, feed.FeedURL)

		// Drop entries rejected by the block list, then those not accepted by the keep list.
		if isBlockedEntry(feed, item) {
			continue
		}
		if !isAllowedEntry(feed, item) {
			continue
		}

		pageURL := getUrlFromEntry(feed, item)
		isNew := !store.EntryURLExists(feed.ID, item.URL)
		shouldCrawl := feed.Crawler && (isNew || forceRefresh)

		if shouldCrawl {
			logger.Debug("[Processor] Crawling entry %q from feed %q", pageURL, feed.FeedURL)

			crawlStart := time.Now()
			scraped, crawlErr := scraper.Fetch(
				pageURL,
				feed.ScraperRules,
				feed.UserAgent,
				feed.Cookie,
				feed.AllowSelfSignedCertificates,
				feed.FetchViaProxy,
			)

			if config.Opts.HasMetricsCollector() {
				outcome := "success"
				if crawlErr != nil {
					outcome = "error"
				}
				metric.ScraperRequestDuration.WithLabelValues(outcome).Observe(time.Since(crawlStart).Seconds())
			}

			switch {
			case crawlErr != nil:
				logger.Error(`[Processor] Unable to crawl this entry: %q => %v`, item.URL, crawlErr)
			case scraped != "":
				// The feed content is overwritten only when the scraper succeeded
				// and actually returned something.
				item.Content = scraped
			}
		}

		rewrite.Rewriter(pageURL, item, feed.RewriteRules)

		// Sanitizing is the last content transformation so that unsafe HTML
		// introduced by any previous step is filtered out.
		item.Content = sanitizer.Sanitize(pageURL, item.Content)

		updateEntryReadingTime(store, feed, item, isNew, user)
		keptEntries = append(keptEntries, item)
	}

	feed.Entries = keptEntries
}

// isBlockedEntry reports whether the entry title matches the feed's block
// list rule.
//
// It returns false when the feed has no block list rule. A rule that is not a
// valid regular expression cannot match anything: the problem is logged and
// the entry is not blocked.
func isBlockedEntry(feed *model.Feed, entry *model.Entry) bool {
	if feed.BlocklistRules == "" {
		return false
	}

	match, err := regexp.MatchString(feed.BlocklistRules, entry.Title)
	if err != nil {
		// Surface the broken rule instead of silently ignoring it.
		logger.Error("[Processor] Invalid blocklist rule %q for feed %q: %v", feed.BlocklistRules, feed.FeedURL, err)
		return false
	}

	if match {
		logger.Debug("[Processor] Blocking entry %q from feed %q based on rule %q", entry.Title, feed.FeedURL, feed.BlocklistRules)
		return true
	}
	return false
}

// isAllowedEntry reports whether the entry passes the feed's keep list rule.
//
// It returns true when the feed has no keep list rule. Otherwise the entry is
// allowed only if its title matches the rule. A rule that is not a valid
// regular expression matches nothing, so the entry is rejected as before, but
// the invalid rule is now logged so the reason entries disappear is visible.
func isAllowedEntry(feed *model.Feed, entry *model.Entry) bool {
	if feed.KeeplistRules == "" {
		return true
	}

	match, err := regexp.MatchString(feed.KeeplistRules, entry.Title)
	if err != nil {
		logger.Error("[Processor] Invalid keeplist rule %q for feed %q: %v", feed.KeeplistRules, feed.FeedURL, err)
		return false
	}

	if match {
		logger.Debug("[Processor] Allow entry %q from feed %q based on rule %q", entry.Title, feed.FeedURL, feed.KeeplistRules)
		return true
	}
	return false
}

// ProcessEntryWebPage fetches the web page of a single entry and updates the
// entry in place.
//
// When the scraper returns non-empty content, it replaces the entry content
// and the reading time is recomputed from it. Rewrite rules and the sanitizer
// are then applied to the entry content. A scraper error is returned as is and
// leaves the entry untouched.
func ProcessEntryWebPage(feed *model.Feed, entry *model.Entry, user *model.User) error {
	begin := time.Now()
	pageURL := getUrlFromEntry(feed, entry)
	entryFeed := entry.Feed

	scraped, fetchErr := scraper.Fetch(
		pageURL,
		entryFeed.ScraperRules,
		entryFeed.UserAgent,
		entryFeed.Cookie,
		feed.AllowSelfSignedCertificates,
		feed.FetchViaProxy,
	)

	// Record the scraper duration, labelled with the outcome of the request.
	if config.Opts.HasMetricsCollector() {
		outcome := "success"
		if fetchErr != nil {
			outcome = "error"
		}
		metric.ScraperRequestDuration.WithLabelValues(outcome).Observe(time.Since(begin).Seconds())
	}

	if fetchErr != nil {
		return fetchErr
	}

	if scraped != "" {
		entry.Content = scraped
		entry.ReadingTime = calculateReadingTime(scraped, user)
	}

	rewrite.Rewriter(pageURL, entry, entryFeed.RewriteRules)
	entry.Content = sanitizer.Sanitize(pageURL, entry.Content)

	return nil
}

// getUrlFromEntry returns the URL to use for the given entry after applying
// the feed's URL rewrite rule, if any.
//
// The rule is expected to look like rewrite("search"|"replace"), where search
// is a regular expression. The entry URL is returned unchanged when the feed
// has no rule, when the rule cannot be parsed, or when the search term is not
// a valid regular expression (the rule comes from user input, so an invalid
// pattern must not panic).
func getUrlFromEntry(feed *model.Feed, entry *model.Entry) string {
	if feed.UrlRewriteRules == "" {
		return entry.URL
	}

	parts := customReplaceRuleRegex.FindStringSubmatch(feed.UrlRewriteRules)
	if len(parts) < 3 {
		logger.Debug("[Processor] Cannot find search and replace terms for replace rule %s", feed.UrlRewriteRules)
		return entry.URL
	}

	re, err := regexp.Compile(parts[1])
	if err != nil {
		logger.Error("[Processor] Invalid search pattern %q in URL rewrite rule %q: %v", parts[1], feed.UrlRewriteRules, err)
		return entry.URL
	}

	rewrittenURL := re.ReplaceAllString(entry.URL, parts[2])
	logger.Debug(`[Processor] Rewriting entry URL %s to %s`, entry.URL, rewrittenURL)
	return rewrittenURL
}

// updateEntryReadingTime sets entry.ReadingTime.
//
// For entries eligible for YouTube or Odysee watch time, the duration is
// fetched from the video page when the entry is new, and read back from the
// store otherwise. When the reading time is still zero afterwards (fetch
// failure or any other entry), it is computed from the entry content.
func updateEntryReadingTime(store *storage.Storage, feed *model.Feed, entry *model.Entry, entryIsNew bool, user *model.User) {
	if shouldFetchYouTubeWatchTime(entry) {
		if !entryIsNew {
			entry.ReadingTime = store.GetReadTime(entry, feed)
		} else {
			minutes, fetchErr := fetchYouTubeWatchTime(entry.URL)
			if fetchErr != nil {
				logger.Error("[Processor] Unable to fetch YouTube watch time: %q => %v", entry.URL, fetchErr)
			}
			entry.ReadingTime = minutes
		}
	}

	if shouldFetchOdyseeWatchTime(entry) {
		if !entryIsNew {
			entry.ReadingTime = store.GetReadTime(entry, feed)
		} else {
			minutes, fetchErr := fetchOdyseeWatchTime(entry.URL)
			if fetchErr != nil {
				logger.Error("[Processor] Unable to fetch Odysee watch time: %q => %v", entry.URL, fetchErr)
			}
			entry.ReadingTime = minutes
		}
	}

	// Fallback for failed watch time lookups and for regular (non-video) entries.
	if entry.ReadingTime != 0 {
		return
	}
	entry.ReadingTime = calculateReadingTime(entry.Content, user)
}

// shouldFetchYouTubeWatchTime reports whether the YouTube watch time option is
// enabled and the entry URL matches the YouTube video URL pattern.
func shouldFetchYouTubeWatchTime(entry *model.Entry) bool {
	if config.Opts.FetchYouTubeWatchTime() {
		// A match yields exactly two elements: the whole match and the captured group.
		return len(youtubeRegex.FindStringSubmatch(entry.URL)) == 2
	}
	return false
}

// shouldFetchOdyseeWatchTime reports whether the Odysee watch time option is
// enabled and the entry URL matches the Odysee URL pattern.
func shouldFetchOdyseeWatchTime(entry *model.Entry) bool {
	return config.Opts.FetchOdyseeWatchTime() && odyseeRegex.MatchString(entry.URL)
}

func fetchYouTubeWatchTime(url string) (int, error) {
	clt := client.NewClientWithConfig(url, config.Opts)
	response, browserErr := browser.Exec(clt)
	if browserErr != nil {
		return 0, browserErr
	}

	doc, docErr := goquery.NewDocumentFromReader(response.Body)
	if docErr != nil {
		return 0, docErr
	}

	durs, exists := doc.Find(`meta[itemprop="duration"]`).First().Attr("content")
	if !exists {
		return 0, errors.New("duration has not found")
	}

	dur, err := parseISO8601(durs)
	if err != nil {
		return 0, fmt.Errorf("unable to parse duration %s: %v", durs, err)
	}

	return int(dur.Minutes()), nil
}

func fetchOdyseeWatchTime(url string) (int, error) {
	clt := client.NewClientWithConfig(url, config.Opts)
	response, browserErr := browser.Exec(clt)
	if browserErr != nil {
		return 0, browserErr
	}

	doc, docErr := goquery.NewDocumentFromReader(response.Body)
	if docErr != nil {
		return 0, docErr
	}

	durs, exists := doc.Find(`meta[property="og:video:duration"]`).First().Attr("content")
	// durs contains video watch time in seconds
	if !exists {
		return 0, errors.New("duration has not found")
	}

	dur, err := strconv.ParseInt(durs, 10, 64)
	if err != nil {
		return 0, fmt.Errorf("unable to parse duration %s: %v", durs, err)
	}

	return int(dur / 60), nil
}

// parseISO8601 parses an ISO 8601 duration string such as "PT1H2M3S" or
// "P1DT12H" and returns the corresponding time.Duration.
//
// Weeks, days, hours, minutes and seconds are supported; a day counts as 24
// hours and a week as 7 days. Years and months are rejected because their
// length depends on the calendar and cannot be expressed as a fixed duration.
//
// An error is returned when the string does not match the ISO 8601 duration
// format, when a component is not a valid integer, or when an unsupported
// component (year, month) is present.
func parseISO8601(from string) (time.Duration, error) {
	// A single submatch lookup both validates the input and extracts the components.
	match := iso8601Regex.FindStringSubmatch(from)
	if match == nil {
		return 0, errors.New("could not parse duration string")
	}

	var d time.Duration
	for i, name := range iso8601Regex.SubexpNames() {
		// Index 0 is the whole match; unnamed groups are only structural.
		if i == 0 || name == "" {
			continue
		}

		part := match[i]
		if part == "" {
			// Component not present in the input.
			continue
		}

		val, err := strconv.ParseInt(part, 10, 64)
		if err != nil {
			return 0, err
		}

		switch name {
		case "week":
			d += time.Duration(val) * 7 * 24 * time.Hour
		case "day":
			d += time.Duration(val) * 24 * time.Hour
		case "hour":
			d += time.Duration(val) * time.Hour
		case "minute":
			d += time.Duration(val) * time.Minute
		case "second":
			d += time.Duration(val) * time.Second
		default:
			// year and month have no fixed length.
			return 0, fmt.Errorf("unknown field %s", name)
		}
	}

	return d, nil
}

// calculateReadingTime estimates the time needed to read the given content,
// using the reading speeds configured on the user.
//
// HTML tags are stripped before the language is detected. For Korean, Chinese
// and Japanese content the estimate is the number of characters divided by
// user.CJKReadingSpeed; for any other language it is the number of
// whitespace-separated words divided by user.DefaultReadingSpeed. The result
// is rounded up. It returns 0 when the applicable reading speed is not
// positive, since no meaningful estimate can be computed in that case.
func calculateReadingTime(content string, user *model.User) int {
	sanitizedContent := sanitizer.StripTags(content)
	languageCode := getlang.FromString(sanitizedContent).LanguageCode()

	var units, speed int
	switch languageCode {
	// "ja" is the ISO 639-1 code for Japanese; "jp" is kept so that the
	// previous comparison still behaves the same should it ever be returned.
	case "ko", "zh", "ja", "jp":
		units = utf8.RuneCountInString(sanitizedContent)
		speed = user.CJKReadingSpeed
	default:
		units = len(strings.Fields(sanitizedContent))
		speed = user.DefaultReadingSpeed
	}

	// Dividing by zero yields +Inf or NaN, whose conversion to int is
	// implementation-dependent.
	if speed <= 0 {
		return 0
	}

	return int(math.Ceil(float64(units) / float64(speed)))
}
Replace copyright header with SPDX identifier 2023-06-19 23:42:47 +02:00			`// SPDX-FileCopyrightText: Copyright The Miniflux Authors. All rights reserved.`
			`// SPDX-License-Identifier: Apache-2.0`
Simplify feed entries filtering - Rename processor package to filter - Remove boilerplate code 2018-10-15 07:33:19 +02:00
Refactor manual entry scraper 2018-12-03 05:51:06 +01:00			`package processor`
Simplify feed entries filtering - Rename processor package to filter - Remove boilerplate code 2018-10-15 07:33:19 +02:00
			`import (`
Use YouTube video duration as read time This feature works by scraping YouTube website. To enable it, set the FETCH_YOUTUBE_WATCH_TIME environment variable to 1. Resolves #972. 2021-01-27 13:50:34 +01:00			`"errors"`
			`"fmt"`
Calculate reading time during feed processing The goal is to speed up the user interface. Detecting the language based on the content is pretty slow. 2020-11-19 02:29:40 +01:00			`"math"`
Add feed filters (Keeplist and Blocklist) 2020-10-16 23:40:56 +02:00			`"regexp"`
Use YouTube video duration as read time This feature works by scraping YouTube website. To enable it, set the FETCH_YOUTUBE_WATCH_TIME environment variable to 1. Resolves #972. 2021-01-27 13:50:34 +01:00			`"strconv"`
Calculate reading time during feed processing The goal is to speed up the user interface. Detecting the language based on the content is pretty slow. 2020-11-19 02:29:40 +01:00			`"strings"`
Add Prometheus exporter 2020-09-28 01:01:06 +02:00			`"time"`
Calculate reading time during feed processing The goal is to speed up the user interface. Detecting the language based on the content is pretty slow. 2020-11-19 02:29:40 +01:00			`"unicode/utf8"`
Add Prometheus exporter 2020-09-28 01:01:06 +02:00
Move internal packages to an internal folder For reference: https://go.dev/doc/go1.4#internalpackages 2023-08-11 04:46:45 +02:00			`"miniflux.app/v2/internal/config"`
			`"miniflux.app/v2/internal/http/client"`
			`"miniflux.app/v2/internal/logger"`
			`"miniflux.app/v2/internal/metric"`
			`"miniflux.app/v2/internal/model"`
			`"miniflux.app/v2/internal/reader/browser"`
			`"miniflux.app/v2/internal/reader/rewrite"`
			`"miniflux.app/v2/internal/reader/sanitizer"`
			`"miniflux.app/v2/internal/reader/scraper"`
			`"miniflux.app/v2/internal/storage"`
Calculate reading time during feed processing The goal is to speed up the user interface. Detecting the language based on the content is pretty slow. 2020-11-19 02:29:40 +01:00
Use YouTube video duration as read time This feature works by scraping YouTube website. To enable it, set the FETCH_YOUTUBE_WATCH_TIME environment variable to 1. Resolves #972. 2021-01-27 13:50:34 +01:00			`"github.com/PuerkitoBio/goquery"`
Calculate reading time during feed processing The goal is to speed up the user interface. Detecting the language based on the content is pretty slow. 2020-11-19 02:29:40 +01:00			`"github.com/rylans/getlang"`
Simplify feed entries filtering - Rename processor package to filter - Remove boilerplate code 2018-10-15 07:33:19 +02:00			`)`

Use YouTube video duration as read time This feature works by scraping YouTube website. To enable it, set the FETCH_YOUTUBE_WATCH_TIME environment variable to 1. Resolves #972. 2021-01-27 13:50:34 +01:00			`var (`
Add rewrite rules for article URL before fetching content 2022-07-12 06:12:26 +02:00			youtubeRegex = regexp.MustCompile(`youtube\.com/watch\?v=(.*)`)
Use Odysee video duration as read time This feature works by scraping the Odysee website. To enable it, set the FETCH_ODYSEE_WATCH_TIME environment variable to 1. 2023-03-18 11:13:58 +01:00			odyseeRegex = regexp.MustCompile(`^https://odysee\.com`)
Add rewrite rules for article URL before fetching content 2022-07-12 06:12:26 +02:00			iso8601Regex = regexp.MustCompile(`^P((?P<year>\d+)Y)?((?P<month>\d+)M)?((?P<week>\d+)W)?((?P<day>\d+)D)?(T((?P<hour>\d+)H)?((?P<minute>\d+)M)?((?P<second>\d+)S)?)?$`)
			customReplaceRuleRegex = regexp.MustCompile(`rewrite\("(.)"\\|"(.)"\)`)
Use YouTube video duration as read time This feature works by scraping YouTube website. To enable it, set the FETCH_YOUTUBE_WATCH_TIME environment variable to 1. Resolves #972. 2021-01-27 13:50:34 +01:00			`)`

Refactor manual entry scraper 2018-12-03 05:51:06 +01:00			`// ProcessFeedEntries downloads original web page for entries and apply filters.`
feat: support force refresh in feed edit and feed entries page 2023-08-08 16:12:41 +02:00			`func ProcessFeedEntries(store storage.Storage, feed model.Feed, user *model.User, forceRefresh bool) {`
Refactor entry filtering Avoid looping multiple times across entries 2020-10-20 07:07:35 +02:00			`var filteredEntries model.Entries`
Add feed filters (Keeplist and Blocklist) 2020-10-16 23:40:56 +02:00
Process older entries first Feed entries are usually ordered from most to least recent. Processing older entries first ensures that their creation timestamp is lower than that of newer entries. This is useful when we order by creation, because then we get a consistent timeline. 2023-03-01 17:58:01 +01:00			`// Process older entries first`
			`for i := len(feed.Entries) - 1; i >= 0; i-- {`
			`entry := feed.Entries[i]`

Refactor entry filtering Avoid looping multiple times across entries 2020-10-20 07:07:35 +02:00			`logger.Debug("[Processor] Processing entry %q from feed %q", entry.URL, feed.FeedURL)`

			`if isBlockedEntry(feed, entry) \|\| !isAllowedEntry(feed, entry) {`
			`continue`
			`}`

Add rewrite rules for article URL before fetching content 2022-07-12 06:12:26 +02:00			`url := getUrlFromEntry(feed, entry)`
Prevent Youtube scraping if entry already exists 2021-03-09 05:10:53 +01:00			`entryIsNew := !store.EntryURLExists(feed.ID, entry.URL)`
feat: support force refresh in feed edit and feed entries page 2023-08-08 16:12:41 +02:00			`if feed.Crawler && (entryIsNew \|\| forceRefresh) {`
Add rewrite rules for article URL before fetching content 2022-07-12 06:12:26 +02:00			`logger.Debug("[Processor] Crawling entry %q from feed %q", url, feed.FeedURL)`
Prevent Youtube scraping if entry already exists 2021-03-09 05:10:53 +01:00
			`startTime := time.Now()`
			`content, scraperErr := scraper.Fetch(`
Add rewrite rules for article URL before fetching content 2022-07-12 06:12:26 +02:00			`url,`
Prevent Youtube scraping if entry already exists 2021-03-09 05:10:53 +01:00			`feed.ScraperRules,`
			`feed.UserAgent,`
Add per feed cookies option 2021-03-23 04:27:58 +01:00			`feed.Cookie,`
Prevent Youtube scraping if entry already exists 2021-03-09 05:10:53 +01:00			`feed.AllowSelfSignedCertificates,`
add proxy arg in scraper.Fetch 2021-08-28 11:30:04 +02:00			`feed.FetchViaProxy,`
Prevent Youtube scraping if entry already exists 2021-03-09 05:10:53 +01:00			`)`

			`if config.Opts.HasMetricsCollector() {`
			`status := "success"`
Add Prometheus exporter 2020-09-28 01:01:06 +02:00			`if scraperErr != nil {`
Prevent Youtube scraping if entry already exists 2021-03-09 05:10:53 +01:00			`status = "error"`
Simplify feed entries filtering - Rename processor package to filter - Remove boilerplate code 2018-10-15 07:33:19 +02:00			`}`
Prevent Youtube scraping if entry already exists 2021-03-09 05:10:53 +01:00			`metric.ScraperRequestDuration.WithLabelValues(status).Observe(time.Since(startTime).Seconds())`
			`}`

			`if scraperErr != nil {`
			logger.Error(`[Processor] Unable to crawl this entry: %q => %v`, entry.URL, scraperErr)
			`} else if content != "" {`
			`// We replace the entry content only if the scraper doesn't return any error.`
			`entry.Content = content`
Simplify feed entries filtering - Rename processor package to filter - Remove boilerplate code 2018-10-15 07:33:19 +02:00			`}`
			`}`

Add a rewrite rule to remove clickbait titles 2023-04-08 11:02:36 +02:00			`rewrite.Rewriter(url, entry, feed.RewriteRules)`
Simplify feed entries filtering - Rename processor package to filter - Remove boilerplate code 2018-10-15 07:33:19 +02:00
			`// The sanitizer should always run at the end of the process to make sure unsafe HTML is filtered.`
Add rewrite rules for article URL before fetching content 2022-07-12 06:12:26 +02:00			`entry.Content = sanitizer.Sanitize(url, entry.Content)`
Refactor entry filtering Avoid looping multiple times across entries 2020-10-20 07:07:35 +02:00
Make reading speed user-configurable 2021-08-30 16:53:05 +02:00			`updateEntryReadingTime(store, feed, entry, entryIsNew, user)`
Refactor entry filtering Avoid looping multiple times across entries 2020-10-20 07:07:35 +02:00			`filteredEntries = append(filteredEntries, entry)`
Simplify feed entries filtering - Rename processor package to filter - Remove boilerplate code 2018-10-15 07:33:19 +02:00			`}`
Refactor entry filtering Avoid looping multiple times across entries 2020-10-20 07:07:35 +02:00
			`feed.Entries = filteredEntries`
Simplify feed entries filtering - Rename processor package to filter - Remove boilerplate code 2018-10-15 07:33:19 +02:00			`}`
Refactor manual entry scraper 2018-12-03 05:51:06 +01:00
Refactor entry filtering Avoid looping multiple times across entries 2020-10-20 07:07:35 +02:00			`func isBlockedEntry(feed model.Feed, entry model.Entry) bool {`
			`if feed.BlocklistRules != "" {`
			`match, _ := regexp.MatchString(feed.BlocklistRules, entry.Title)`
			`if match {`
			`logger.Debug("[Processor] Blocking entry %q from feed %q based on rule %q", entry.Title, feed.FeedURL, feed.BlocklistRules)`
			`return true`
Add feed filters (Keeplist and Blocklist) 2020-10-16 23:40:56 +02:00			`}`
			`}`
Refactor entry filtering Avoid looping multiple times across entries 2020-10-20 07:07:35 +02:00			`return false`
			`}`

			`func isAllowedEntry(feed model.Feed, entry model.Entry) bool {`
			`if feed.KeeplistRules != "" {`
			`match, _ := regexp.MatchString(feed.KeeplistRules, entry.Title)`
			`if match {`
			`logger.Debug("[Processor] Allow entry %q from feed %q based on rule %q", entry.Title, feed.FeedURL, feed.KeeplistRules)`
			`return true`
Add feed filters (Keeplist and Blocklist) 2020-10-16 23:40:56 +02:00			`}`
Refactor entry filtering Avoid looping multiple times across entries 2020-10-20 07:07:35 +02:00			`return false`
Add feed filters (Keeplist and Blocklist) 2020-10-16 23:40:56 +02:00			`}`
Refactor entry filtering Avoid looping multiple times across entries 2020-10-20 07:07:35 +02:00			`return true`
Add feed filters (Keeplist and Blocklist) 2020-10-16 23:40:56 +02:00			`}`

Refactor manual entry scraper 2018-12-03 05:51:06 +01:00			`// ProcessEntryWebPage downloads the entry web page and apply rewrite rules.`
Make reading speed user-configurable 2021-08-30 16:53:05 +02:00			`func ProcessEntryWebPage(feed model.Feed, entry model.Entry, user *model.User) error {`
Add Prometheus exporter 2020-09-28 01:01:06 +02:00			`startTime := time.Now()`
Add rewrite rules for article URL before fetching content 2022-07-12 06:12:26 +02:00			`url := getUrlFromEntry(feed, entry)`

Add option to allow self-signed or invalid certificates 2021-02-21 22:42:49 +01:00			`content, scraperErr := scraper.Fetch(`
Add rewrite rules for article URL before fetching content 2022-07-12 06:12:26 +02:00			`url,`
Add option to allow self-signed or invalid certificates 2021-02-21 22:42:49 +01:00			`entry.Feed.ScraperRules,`
			`entry.Feed.UserAgent,`
Add per feed cookies option 2021-03-23 04:27:58 +01:00			`entry.Feed.Cookie,`
Add option to allow self-signed or invalid certificates 2021-02-21 22:42:49 +01:00			`feed.AllowSelfSignedCertificates,`
add proxy arg in scraper.Fetch 2021-08-28 11:30:04 +02:00			`feed.FetchViaProxy,`
Add option to allow self-signed or invalid certificates 2021-02-21 22:42:49 +01:00			`)`

Add Prometheus exporter 2020-09-28 01:01:06 +02:00			`if config.Opts.HasMetricsCollector() {`
			`status := "success"`
			`if scraperErr != nil {`
			`status = "error"`
			`}`
			`metric.ScraperRequestDuration.WithLabelValues(status).Observe(time.Since(startTime).Seconds())`
			`}`

			`if scraperErr != nil {`
			`return scraperErr`
Refactor manual entry scraper 2018-12-03 05:51:06 +01:00			`}`

			`if content != "" {`
			`entry.Content = content`
Make reading speed user-configurable 2021-08-30 16:53:05 +02:00			`entry.ReadingTime = calculateReadingTime(content, user)`
Refactor manual entry scraper 2018-12-03 05:51:06 +01:00			`}`

Add a rewrite rule to remove clickbait titles 2023-04-08 11:02:36 +02:00			`rewrite.Rewriter(url, entry, entry.Feed.RewriteRules)`
			`entry.Content = sanitizer.Sanitize(url, entry.Content)`

Refactor manual entry scraper 2018-12-03 05:51:06 +01:00			`return nil`
			`}`
Calculate reading time during feed processing The goal is to speed up the user interface. Detecting the language based on the content is pretty slow. 2020-11-19 02:29:40 +01:00
Add rewrite rules for article URL before fetching content 2022-07-12 06:12:26 +02:00			`func getUrlFromEntry(feed model.Feed, entry model.Entry) string {`
			`var url = entry.URL`
			`if feed.UrlRewriteRules != "" {`
			`parts := customReplaceRuleRegex.FindStringSubmatch(feed.UrlRewriteRules)`

			`if len(parts) >= 3 {`
			`re := regexp.MustCompile(parts[1])`
			`url = re.ReplaceAllString(entry.URL, parts[2])`
			logger.Debug(`[Processor] Rewriting entry URL %s to %s`, entry.URL, url)
			`} else {`
			`logger.Debug("[Processor] Cannot find search and replace terms for replace rule %s", feed.UrlRewriteRules)`
			`}`
			`}`
			`return url`
			`}`

Make reading speed user-configurable 2021-08-30 16:53:05 +02:00			`func updateEntryReadingTime(store storage.Storage, feed model.Feed, entry model.Entry, entryIsNew bool, user model.User) {`
Prevent Youtube scraping if entry already exists 2021-03-09 05:10:53 +01:00			`if shouldFetchYouTubeWatchTime(entry) {`
			`if entryIsNew {`
			`watchTime, err := fetchYouTubeWatchTime(entry.URL)`
			`if err != nil {`
			`logger.Error("[Processor] Unable to fetch YouTube watch time: %q => %v", entry.URL, err)`
			`}`
			`entry.ReadingTime = watchTime`
			`} else {`
			`entry.ReadingTime = store.GetReadTime(entry, feed)`
			`}`
			`}`

Use Odysee video duration as read time This feature works by scraping the Odysee website. To enable it, set the FETCH_ODYSEE_WATCH_TIME environment variable to 1. 2023-03-18 11:13:58 +01:00			`if shouldFetchOdyseeWatchTime(entry) {`
			`if entryIsNew {`
			`watchTime, err := fetchOdyseeWatchTime(entry.URL)`
			`if err != nil {`
			`logger.Error("[Processor] Unable to fetch Odysee watch time: %q => %v", entry.URL, err)`
			`}`
			`entry.ReadingTime = watchTime`
			`} else {`
			`entry.ReadingTime = store.GetReadTime(entry, feed)`
			`}`
			`}`
Prevent Youtube scraping if entry already exists 2021-03-09 05:10:53 +01:00			`// Handle YT error case and non-YT entries.`
			`if entry.ReadingTime == 0 {`
Make reading speed user-configurable 2021-08-30 16:53:05 +02:00			`entry.ReadingTime = calculateReadingTime(entry.Content, user)`
Prevent Youtube scraping if entry already exists 2021-03-09 05:10:53 +01:00			`}`
			`}`

			`func shouldFetchYouTubeWatchTime(entry *model.Entry) bool {`
			`if !config.Opts.FetchYouTubeWatchTime() {`
			`return false`
			`}`
			`matches := youtubeRegex.FindStringSubmatch(entry.URL)`
			`urlMatchesYouTubePattern := len(matches) == 2`
			`return urlMatchesYouTubePattern`
			`}`

Use Odysee video duration as read time This feature works by scraping the Odysee website. To enable it, set the FETCH_ODYSEE_WATCH_TIME environment variable to 1. 2023-03-18 11:13:58 +01:00			`func shouldFetchOdyseeWatchTime(entry *model.Entry) bool {`
			`if !config.Opts.FetchOdyseeWatchTime() {`
			`return false`
			`}`
			`matches := odyseeRegex.FindStringSubmatch(entry.URL)`
			`return matches != nil`
			`}`

Use YouTube video duration as read time This feature works by scraping YouTube website. To enable it, set the FETCH_YOUTUBE_WATCH_TIME environment variable to 1. Resolves #972. 2021-01-27 13:50:34 +01:00			`func fetchYouTubeWatchTime(url string) (int, error) {`
			`clt := client.NewClientWithConfig(url, config.Opts)`
			`response, browserErr := browser.Exec(clt)`
			`if browserErr != nil {`
			`return 0, browserErr`
			`}`

			`doc, docErr := goquery.NewDocumentFromReader(response.Body)`
			`if docErr != nil {`
			`return 0, docErr`
			`}`

			durs, exists := doc.Find(`meta[itemprop="duration"]`).First().Attr("content")
			`if !exists {`
			`return 0, errors.New("duration has not found")`
			`}`

			`dur, err := parseISO8601(durs)`
			`if err != nil {`
			`return 0, fmt.Errorf("unable to parse duration %s: %v", durs, err)`
			`}`

			`return int(dur.Minutes()), nil`
			`}`

Use Odysee video duration as read time This feature works by scraping the Odysee website. To enable it, set the FETCH_ODYSEE_WATCH_TIME environment variable to 1. 2023-03-18 11:13:58 +01:00			`func fetchOdyseeWatchTime(url string) (int, error) {`
			`clt := client.NewClientWithConfig(url, config.Opts)`
			`response, browserErr := browser.Exec(clt)`
			`if browserErr != nil {`
			`return 0, browserErr`
			`}`

			`doc, docErr := goquery.NewDocumentFromReader(response.Body)`
			`if docErr != nil {`
			`return 0, docErr`
			`}`

			durs, exists := doc.Find(`meta[property="og:video:duration"]`).First().Attr("content")
			`// durs contains video watch time in seconds`
			`if !exists {`
			`return 0, errors.New("duration has not found")`
			`}`

			`dur, err := strconv.ParseInt(durs, 10, 64)`
			`if err != nil {`
			`return 0, fmt.Errorf("unable to parse duration %s: %v", durs, err)`
			`}`

			`return int(dur / 60), nil`
			`}`

Use YouTube video duration as read time This feature works by scraping YouTube website. To enable it, set the FETCH_YOUTUBE_WATCH_TIME environment variable to 1. Resolves #972. 2021-01-27 13:50:34 +01:00			`// parseISO8601 parses an ISO 8601 duration string.`
			`func parseISO8601(from string) (time.Duration, error) {`
			`var match []string`
			`var d time.Duration`

			`if iso8601Regex.MatchString(from) {`
			`match = iso8601Regex.FindStringSubmatch(from)`
			`} else {`
			`return 0, errors.New("could not parse duration string")`
			`}`

			`for i, name := range iso8601Regex.SubexpNames() {`
			`part := match[i]`
			`if i == 0 \|\| name == "" \|\| part == "" {`
			`continue`
			`}`

			`val, err := strconv.ParseInt(part, 10, 64)`
			`if err != nil {`
			`return 0, err`
			`}`

			`switch name {`
			`case "hour":`
			`d = d + (time.Duration(val) * time.Hour)`
			`case "minute":`
			`d = d + (time.Duration(val) * time.Minute)`
			`case "second":`
			`d = d + (time.Duration(val) * time.Second)`
			`default:`
			`return 0, fmt.Errorf("unknown field %s", name)`
			`}`
			`}`

			`return d, nil`
			`}`

Make reading speed user-configurable 2021-08-30 16:53:05 +02:00			`func calculateReadingTime(content string, user *model.User) int {`
Calculate reading time during feed processing The goal is to speed up the user interface. Detecting the language based on the content is pretty slow. 2020-11-19 02:29:40 +01:00			`sanitizedContent := sanitizer.StripTags(content)`
			`languageInfo := getlang.FromString(sanitizedContent)`

			`var timeToReadInt int`
			`if languageInfo.LanguageCode() == "ko" \|\| languageInfo.LanguageCode() == "zh" \|\| languageInfo.LanguageCode() == "jp" {`
Make reading speed user-configurable 2021-08-30 16:53:05 +02:00			`timeToReadInt = int(math.Ceil(float64(utf8.RuneCountInString(sanitizedContent)) / float64(user.CJKReadingSpeed)))`
Calculate reading time during feed processing The goal is to speed up the user interface. Detecting the language based on the content is pretty slow. 2020-11-19 02:29:40 +01:00			`} else {`
			`nbOfWords := len(strings.Fields(sanitizedContent))`
Make reading speed user-configurable 2021-08-30 16:53:05 +02:00			`timeToReadInt = int(math.Ceil(float64(nbOfWords) / float64(user.DefaultReadingSpeed)))`
Calculate reading time during feed processing The goal is to speed up the user interface. Detecting the language based on the content is pretty slow. 2020-11-19 02:29:40 +01:00			`}`

			`return timeToReadInt`
			`}`