Calculate reading time during feed processing

The goal is to speed up the user interface.

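Detecting the language from the content is pretty slow, so the reading time is now computed once while the feed is processed and stored with the entry, instead of being recalculated on every page render.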
This commit is contained in:
Frédéric Guillot 2020-11-18 17:29:40 -08:00 committed by fguillot
parent b1c9977711
commit de7a613098
12 changed files with 84 additions and 50 deletions

View file

@ -141,6 +141,7 @@ type Entry struct {
Author string `json:"author"` Author string `json:"author"`
ShareCode string `json:"share_code"` ShareCode string `json:"share_code"`
Starred bool `json:"starred"` Starred bool `json:"starred"`
ReadingTime int `json:"reading_time"`
Enclosures Enclosures `json:"enclosures,omitempty"` Enclosures Enclosures `json:"enclosures,omitempty"`
Feed *Feed `json:"feed,omitempty"` Feed *Feed `json:"feed,omitempty"`
} }

View file

@ -12,7 +12,7 @@ import (
"miniflux.app/logger" "miniflux.app/logger"
) )
const schemaVersion = 40 const schemaVersion = 41
// Migrate executes database migrations. // Migrate executes database migrations.
func Migrate(db *sql.DB) { func Migrate(db *sql.DB) {

View file

@ -203,6 +203,7 @@ alter table users add column entry_direction entry_sorting_direction default 'as
add column keeplist_rules text not null default '' add column keeplist_rules text not null default ''
; ;
`, `,
"schema_version_41": `alter table entries add column reading_time int not null default 0;`,
"schema_version_5": `create table integrations ( "schema_version_5": `create table integrations (
user_id int not null, user_id int not null,
pinboard_enabled bool default 'f', pinboard_enabled bool default 'f',
@ -264,6 +265,7 @@ var SqlMapChecksums = map[string]string{
"schema_version_39": "b0f90b97502921d4681a07c64d180a91a0b4ccac7d3c1dbe30519ad6f1bf1737", "schema_version_39": "b0f90b97502921d4681a07c64d180a91a0b4ccac7d3c1dbe30519ad6f1bf1737",
"schema_version_4": "216ea3a7d3e1704e40c797b5dc47456517c27dbb6ca98bf88812f4f63d74b5d9", "schema_version_4": "216ea3a7d3e1704e40c797b5dc47456517c27dbb6ca98bf88812f4f63d74b5d9",
"schema_version_40": "6a8fec92399f853ed6817aff4cfa43255dce4c19afad796e41519d09de62105e", "schema_version_40": "6a8fec92399f853ed6817aff4cfa43255dce4c19afad796e41519d09de62105e",
"schema_version_41": "128e118ce61267ea1f6ae03b63a6d4734eae87e520b00e309ad083f1f6afdfe5",
"schema_version_5": "46397e2f5f2c82116786127e9f6a403e975b14d2ca7b652a48cd1ba843e6a27c", "schema_version_5": "46397e2f5f2c82116786127e9f6a403e975b14d2ca7b652a48cd1ba843e6a27c",
"schema_version_6": "9d05b4fb223f0e60efc716add5048b0ca9c37511cf2041721e20505d6d798ce4", "schema_version_6": "9d05b4fb223f0e60efc716add5048b0ca9c37511cf2041721e20505d6d798ce4",
"schema_version_7": "33f298c9aa30d6de3ca28e1270df51c2884d7596f1283a75716e2aeb634cd05c", "schema_version_7": "33f298c9aa30d6de3ca28e1270df51c2884d7596f1283a75716e2aeb634cd05c",

View file

@ -0,0 +1 @@
alter table entries add column reading_time int not null default 0;

View file

@ -33,6 +33,7 @@ type Entry struct {
Author string `json:"author"` Author string `json:"author"`
ShareCode string `json:"share_code"` ShareCode string `json:"share_code"`
Starred bool `json:"starred"` Starred bool `json:"starred"`
ReadingTime int `json:"reading_time"`
Enclosures EnclosureList `json:"enclosures,omitempty"` Enclosures EnclosureList `json:"enclosures,omitempty"`
Feed *Feed `json:"feed,omitempty"` Feed *Feed `json:"feed,omitempty"`
} }

View file

@ -5,8 +5,11 @@
package processor package processor
import ( import (
"math"
"regexp" "regexp"
"strings"
"time" "time"
"unicode/utf8"
"miniflux.app/config" "miniflux.app/config"
"miniflux.app/logger" "miniflux.app/logger"
@ -16,6 +19,8 @@ import (
"miniflux.app/reader/sanitizer" "miniflux.app/reader/sanitizer"
"miniflux.app/reader/scraper" "miniflux.app/reader/scraper"
"miniflux.app/storage" "miniflux.app/storage"
"github.com/rylans/getlang"
) )
// ProcessFeedEntries downloads the original web page for entries and applies filters.
@ -58,6 +63,7 @@ func ProcessFeedEntries(store *storage.Storage, feed *model.Feed) {
// The sanitizer should always run at the end of the process to make sure unsafe HTML is filtered. // The sanitizer should always run at the end of the process to make sure unsafe HTML is filtered.
entry.Content = sanitizer.Sanitize(entry.URL, entry.Content) entry.Content = sanitizer.Sanitize(entry.URL, entry.Content)
entry.ReadingTime = calculateReadingTime(entry.Content)
filteredEntries = append(filteredEntries, entry) filteredEntries = append(filteredEntries, entry)
} }
@ -108,7 +114,23 @@ func ProcessEntryWebPage(entry *model.Entry) error {
if content != "" { if content != "" {
entry.Content = content entry.Content = content
entry.ReadingTime = calculateReadingTime(content)
} }
return nil return nil
} }
// calculateReadingTime estimates the time needed to read content.
//
// HTML tags are stripped from content, then the language of the remaining
// text is detected. Korean, Chinese and Japanese text is measured by its
// number of runes; any other language is measured by its number of
// whitespace-separated words. The count is divided by a fixed reading speed
// and rounded up to a whole unit (presumably minutes -- confirm against the
// "entry.estimated_reading_time" translation).
//
// It returns 0 when the stripped text contains no runes/words.
func calculateReadingTime(content string) int {
	const (
		// Reading speeds used as divisors, kept from the original implementation.
		cjkRunesPerUnit = 500
		wordsPerUnit    = 265
	)

	text := sanitizer.StripTags(content)

	switch getlang.FromString(text).LanguageCode() {
	// "ja" is the ISO 639-1 code for Japanese. The previous check only
	// compared against "jp" (a country code), so Japanese text fell through
	// to the word-count branch. "jp" is kept for backward compatibility.
	case "ko", "zh", "ja", "jp":
		return int(math.Ceil(float64(utf8.RuneCountInString(text)) / cjkRunesPerUnit))
	default:
		return int(math.Ceil(float64(len(strings.Fields(text))) / wordsPerUnit))
	}
}

View file

@ -75,11 +75,11 @@ func (s *Storage) UpdateEntryContent(entry *model.Entry) error {
UPDATE UPDATE
entries entries
SET SET
content=$1 content=$1, reading_time=$2
WHERE WHERE
id=$2 AND user_id=$3 id=$3 AND user_id=$4
` `
_, err = tx.Exec(query, entry.Content, entry.ID, entry.UserID) _, err = tx.Exec(query, entry.Content, entry.ReadingTime, entry.ID, entry.UserID)
if err != nil { if err != nil {
tx.Rollback() tx.Rollback()
return fmt.Errorf(`store: unable to update content of entry #%d: %v`, entry.ID, err) return fmt.Errorf(`store: unable to update content of entry #%d: %v`, entry.ID, err)
@ -106,9 +106,35 @@ func (s *Storage) UpdateEntryContent(entry *model.Entry) error {
func (s *Storage) createEntry(tx *sql.Tx, entry *model.Entry) error { func (s *Storage) createEntry(tx *sql.Tx, entry *model.Entry) error {
query := ` query := `
INSERT INTO entries INSERT INTO entries
(title, hash, url, comments_url, published_at, content, author, user_id, feed_id, changed_at, document_vectors) (
title,
hash,
url,
comments_url,
published_at,
content,
author,
user_id,
feed_id,
reading_time,
changed_at,
document_vectors
)
VALUES VALUES
($1, $2, $3, $4, $5, $6, $7, $8, $9, now(), setweight(to_tsvector(substring(coalesce($1, '') for 1000000)), 'A') || setweight(to_tsvector(substring(coalesce($6, '') for 1000000)), 'B')) (
$1,
$2,
$3,
$4,
$5,
$6,
$7,
$8,
$9,
$10,
now(),
setweight(to_tsvector(substring(coalesce($1, '') for 1000000)), 'A') || setweight(to_tsvector(substring(coalesce($6, '') for 1000000)), 'B')
)
RETURNING RETURNING
id, status id, status
` `
@ -123,6 +149,7 @@ func (s *Storage) createEntry(tx *sql.Tx, entry *model.Entry) error {
entry.Author, entry.Author,
entry.UserID, entry.UserID,
entry.FeedID, entry.FeedID,
entry.ReadingTime,
).Scan(&entry.ID, &entry.Status) ).Scan(&entry.ID, &entry.Status)
if err != nil { if err != nil {
@ -154,9 +181,10 @@ func (s *Storage) updateEntry(tx *sql.Tx, entry *model.Entry) error {
comments_url=$3, comments_url=$3,
content=$4, content=$4,
author=$5, author=$5,
reading_time=$6,
document_vectors = setweight(to_tsvector(substring(coalesce($1, '') for 1000000)), 'A') || setweight(to_tsvector(substring(coalesce($4, '') for 1000000)), 'B') document_vectors = setweight(to_tsvector(substring(coalesce($1, '') for 1000000)), 'A') || setweight(to_tsvector(substring(coalesce($4, '') for 1000000)), 'B')
WHERE WHERE
user_id=$6 AND feed_id=$7 AND hash=$8 user_id=$7 AND feed_id=$8 AND hash=$9
RETURNING RETURNING
id id
` `
@ -167,6 +195,7 @@ func (s *Storage) updateEntry(tx *sql.Tx, entry *model.Entry) error {
entry.CommentsURL, entry.CommentsURL,
entry.Content, entry.Content,
entry.Author, entry.Author,
entry.ReadingTime,
entry.UserID, entry.UserID,
entry.FeedID, entry.FeedID,
entry.Hash, entry.Hash,

View file

@ -226,6 +226,7 @@ func (e *EntryQueryBuilder) GetEntries() (model.Entries, error) {
e.content, e.content,
e.status, e.status,
e.starred, e.starred,
e.reading_time,
f.title as feed_title, f.title as feed_title,
f.feed_url, f.feed_url,
f.site_url, f.site_url,
@ -284,6 +285,7 @@ func (e *EntryQueryBuilder) GetEntries() (model.Entries, error) {
&entry.Content, &entry.Content,
&entry.Status, &entry.Status,
&entry.Starred, &entry.Starred,
&entry.ReadingTime,
&entry.Feed.Title, &entry.Feed.Title,
&entry.Feed.FeedURL, &entry.Feed.FeedURL,
&entry.Feed.SiteURL, &entry.Feed.SiteURL,

View file

@ -242,10 +242,10 @@ SOFTWARE.
<li> <li>
<time datetime="{{ isodate .entry.Date }}" title="{{ isodate .entry.Date }}">{{ elapsed .user.Timezone .entry.Date }}</time> <time datetime="{{ isodate .entry.Date }}" title="{{ isodate .entry.Date }}">{{ elapsed .user.Timezone .entry.Date }}</time>
</li> </li>
{{ if .user.ShowReadingTime }} {{ if and .user.ShowReadingTime (gt .entry.ReadingTime 0) }}
<li> <li>
<span> <span>
{{ plural "entry.estimated_reading_time" (timeToRead .entry.Content) (timeToRead .entry.Content) }} {{ plural "entry.estimated_reading_time" .entry.ReadingTime .entry.ReadingTime }}
</span> </span>
</li> </li>
{{ end }} {{ end }}
@ -523,7 +523,7 @@ var templateCommonMapChecksums = map[string]string{
"feed_list": "931e43d328a116318c510de5658c688cd940b934c86b6ec82a472e1f81e020ae", "feed_list": "931e43d328a116318c510de5658c688cd940b934c86b6ec82a472e1f81e020ae",
"feed_menu": "318d8662dda5ca9dfc75b909c8461e79c86fb5082df1428f67aaf856f19f4b50", "feed_menu": "318d8662dda5ca9dfc75b909c8461e79c86fb5082df1428f67aaf856f19f4b50",
"icons": "9a41753778072f286216085d8712495e2ccca20c7a24f5c982775436a3d38579", "icons": "9a41753778072f286216085d8712495e2ccca20c7a24f5c982775436a3d38579",
"item_meta": "eb72c6e2a924759af20b8ef41f2ce7495aedc053181c2e5ca1b063f9410c58b0", "item_meta": "56ab09d7dd46eeb2e2ee11ddcec0c157a5832c896dbd2887d9e2b013680b2af6",
"layout": "65767e7dbebe1f7ed42895ecd5a737b0693e4a2ec35e84e3e391f462beb11977", "layout": "65767e7dbebe1f7ed42895ecd5a737b0693e4a2ec35e84e3e391f462beb11977",
"pagination": "7b61288e86283c4cf0dc83bcbf8bf1c00c7cb29e60201c8c0b633b2450d2911f", "pagination": "7b61288e86283c4cf0dc83bcbf8bf1c00c7cb29e60201c8c0b633b2450d2911f",
"settings_menu": "e2b777630c0efdbc529800303c01d6744ed3af80ec505ac5a5b3f99c9b989156", "settings_menu": "e2b777630c0efdbc529800303c01d6744ed3af80ec505ac5a5b3f99c9b989156",

View file

@ -65,9 +65,6 @@ func (e *Engine) Render(name, language string, data interface{}) []byte {
"plural": func(key string, n int, args ...interface{}) string { "plural": func(key string, n int, args ...interface{}) string {
return printer.Plural(key, n, args...) return printer.Plural(key, n, args...)
}, },
"timeToRead": func(content string) int {
return timeToRead(content)
},
}) })
var b bytes.Buffer var b bytes.Buffer

View file

@ -11,19 +11,16 @@ import (
"net/mail" "net/mail"
"strings" "strings"
"time" "time"
"unicode/utf8"
"miniflux.app/config" "miniflux.app/config"
"miniflux.app/http/route" "miniflux.app/http/route"
"miniflux.app/locale" "miniflux.app/locale"
"miniflux.app/model" "miniflux.app/model"
"miniflux.app/proxy" "miniflux.app/proxy"
"miniflux.app/reader/sanitizer"
"miniflux.app/timezone" "miniflux.app/timezone"
"miniflux.app/url" "miniflux.app/url"
"github.com/gorilla/mux" "github.com/gorilla/mux"
"github.com/rylans/getlang"
) )
type funcMap struct { type funcMap struct {
@ -94,9 +91,6 @@ func (f *funcMap) Map() template.FuncMap {
"plural": func(key string, n int, args ...interface{}) string { "plural": func(key string, n int, args ...interface{}) string {
return "" return ""
}, },
"timeToRead": func(content string) int {
return 0
},
} }
} }
@ -195,18 +189,3 @@ func formatFileSize(b int64) string {
return fmt.Sprintf("%.1f %ciB", return fmt.Sprintf("%.1f %ciB",
float64(b)/float64(div), "KMGTPE"[exp]) float64(b)/float64(div), "KMGTPE"[exp])
} }
// timeToRead estimates the time needed to read content.
//
// The content is stripped of HTML tags and its language is detected from
// the resulting text. For Korean, Chinese and Japanese the estimate is based
// on the rune count divided by 500; for every other language it is based on
// the number of whitespace-separated words divided by 265. The result is
// rounded up to the next integer and is 0 when there is nothing to count.
func timeToRead(content string) int {
	sanitizedContent := sanitizer.StripTags(content)
	languageCode := getlang.FromString(sanitizedContent).LanguageCode()

	// Japanese is "ja" in ISO 639-1; the original condition tested only "jp",
	// which meant Japanese content was counted by words instead of runes.
	// "jp" is still accepted so existing behaviour is not narrowed.
	isCJK := languageCode == "ko" || languageCode == "zh" || languageCode == "ja" || languageCode == "jp"

	if isCJK {
		return int(math.Ceil(float64(utf8.RuneCountInString(sanitizedContent)) / 500))
	}

	nbOfWords := len(strings.Fields(sanitizedContent))
	return int(math.Ceil(float64(nbOfWords) / 265))
}

View file

@ -7,10 +7,10 @@
<li> <li>
<time datetime="{{ isodate .entry.Date }}" title="{{ isodate .entry.Date }}">{{ elapsed .user.Timezone .entry.Date }}</time> <time datetime="{{ isodate .entry.Date }}" title="{{ isodate .entry.Date }}">{{ elapsed .user.Timezone .entry.Date }}</time>
</li> </li>
{{ if .user.ShowReadingTime }} {{ if and .user.ShowReadingTime (gt .entry.ReadingTime 0) }}
<li> <li>
<span> <span>
{{ plural "entry.estimated_reading_time" (timeToRead .entry.Content) (timeToRead .entry.Content) }} {{ plural "entry.estimated_reading_time" .entry.ReadingTime .entry.ReadingTime }}
</span> </span>
</li> </li>
{{ end }} {{ end }}