From fdd1b3f18e60a53fe703d63b0bf4663c835e7277 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Fr=C3=A9d=C3=A9ric=20Guillot?=
Date: Thu, 4 Apr 2024 19:44:58 -0700
Subject: [PATCH] database: entry URLs can exceed btree index size limit

---
 internal/database/migrations.go        |  6 ++++++
 internal/reader/processor/processor.go | 16 ++++++++--------
 internal/storage/entry.go              | 14 ++++++--------
 3 files changed, 20 insertions(+), 16 deletions(-)

diff --git a/internal/database/migrations.go b/internal/database/migrations.go
index d40e5d2f..fa3c3972 100644
--- a/internal/database/migrations.go
+++ b/internal/database/migrations.go
@@ -882,4 +882,10 @@ var migrations = []func(tx *sql.Tx) error{
 		_, err = tx.Exec(sql)
 		return err
 	},
+	func(tx *sql.Tx) (err error) {
+		// Entry URLs can exceed the btree maximum index row size.
+		// Entry existence checks now use the entries_feed_id_status_hash_idx index.
+		_, err = tx.Exec(`DROP INDEX entries_feed_url_idx`)
+		return err
+	},
 }
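For context on the migration above: PostgreSQL rejects btree index rows larger than roughly one third of a page (2704 bytes with the default 8 kB page size and btree version 4), so an entry with a long enough URL could not be inserted while entries_feed_url_idx existed. Index values are compressed before the limit is checked, which is why only some long URLs trigger the failure. Below is a minimal reproduction sketch, not part of the patch; the DSN and the throwaway table are hypothetical, and random data is used so compression cannot shrink the value under the cap.

package main

import (
	"crypto/rand"
	"database/sql"
	"encoding/hex"
	"fmt"

	_ "github.com/lib/pq"
)

func main() {
	// Hypothetical DSN for a local scratch database.
	db, err := sql.Open("postgres", "postgres://localhost/scratch?sslmode=disable")
	if err != nil {
		panic(err)
	}
	defer db.Close()

	// Throwaway table with the same index shape as the dropped entries_feed_url_idx.
	if _, err := db.Exec(`CREATE TABLE t (feed_id bigint, url text)`); err != nil {
		panic(err)
	}
	if _, err := db.Exec(`CREATE UNIQUE INDEX t_feed_url_idx ON t (feed_id, url)`); err != nil {
		panic(err)
	}

	// ~4 kB of incompressible data, well past the ~2704-byte btree cap.
	buf := make([]byte, 2000)
	rand.Read(buf)
	longURL := "https://example.org/?q=" + hex.EncodeToString(buf)

	// Expected failure: pq: index row size ... exceeds btree version 4 maximum 2704 ...
	_, err = db.Exec(`INSERT INTO t (feed_id, url) VALUES (1, $1)`, longURL)
	fmt.Println(err)
}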
diff --git a/internal/reader/processor/processor.go b/internal/reader/processor/processor.go
index 913ae0b3..ab4448ef 100644
--- a/internal/reader/processor/processor.go
+++ b/internal/reader/processor/processor.go
@@ -42,8 +42,9 @@ func ProcessFeedEntries(store *storage.Storage, feed *model.Feed, user *model.Us
 
 		slog.Debug("Processing entry",
 			slog.Int64("user_id", user.ID),
-			slog.Int64("entry_id", entry.ID),
 			slog.String("entry_url", entry.URL),
+			slog.String("entry_hash", entry.Hash),
+			slog.String("entry_title", entry.Title),
 			slog.Int64("feed_id", feed.ID),
 			slog.String("feed_url", feed.FeedURL),
 		)
@@ -52,14 +53,18 @@ func ProcessFeedEntries(store *storage.Storage, feed *model.Feed, user *model.Us
 		}
 
 		websiteURL := getUrlFromEntry(feed, entry)
-		entryIsNew := !store.EntryURLExists(feed.ID, entry.URL)
+		entryIsNew := store.IsNewEntry(feed.ID, entry.Hash)
 		if feed.Crawler && (entryIsNew || forceRefresh) {
 			slog.Debug("Scraping entry",
 				slog.Int64("user_id", user.ID),
-				slog.Int64("entry_id", entry.ID),
 				slog.String("entry_url", entry.URL),
+				slog.String("entry_hash", entry.Hash),
+				slog.String("entry_title", entry.Title),
 				slog.Int64("feed_id", feed.ID),
 				slog.String("feed_url", feed.FeedURL),
+				slog.Bool("entry_is_new", entryIsNew),
+				slog.Bool("force_refresh", forceRefresh),
+				slog.String("website_url", websiteURL),
 			)
 
 			startTime := time.Now()
@@ -90,7 +95,6 @@ func ProcessFeedEntries(store *storage.Storage, feed *model.Feed, user *model.Us
 			if scraperErr != nil {
 				slog.Warn("Unable to scrape entry",
 					slog.Int64("user_id", user.ID),
-					slog.Int64("entry_id", entry.ID),
 					slog.String("entry_url", entry.URL),
 					slog.Int64("feed_id", feed.ID),
 					slog.String("feed_url", feed.FeedURL),
@@ -134,7 +138,6 @@ func isBlockedEntry(feed *model.Feed, entry *model.Entry) bool {
 
 	if compiledBlocklist.MatchString(entry.URL) || compiledBlocklist.MatchString(entry.Title) || compiledBlocklist.MatchString(entry.Author) || containsBlockedTag {
 		slog.Debug("Blocking entry based on rule",
-			slog.Int64("entry_id", entry.ID),
 			slog.String("entry_url", entry.URL),
 			slog.Int64("feed_id", feed.ID),
 			slog.String("feed_url", feed.FeedURL),
@@ -165,7 +168,6 @@ func isAllowedEntry(feed *model.Feed, entry *model.Entry) bool {
 
 	if compiledKeeplist.MatchString(entry.URL) || compiledKeeplist.MatchString(entry.Title) || compiledKeeplist.MatchString(entry.Author) || containsAllowedTag {
 		slog.Debug("Allow entry based on rule",
-			slog.Int64("entry_id", entry.ID),
 			slog.String("entry_url", entry.URL),
 			slog.Int64("feed_id", feed.ID),
 			slog.String("feed_url", feed.FeedURL),
@@ -230,7 +232,6 @@ func getUrlFromEntry(feed *model.Feed, entry *model.Entry) string {
 			re := regexp.MustCompile(parts[1])
 			url = re.ReplaceAllString(entry.URL, parts[2])
 			slog.Debug("Rewriting entry URL",
-				slog.Int64("entry_id", entry.ID),
 				slog.String("original_entry_url", entry.URL),
 				slog.String("rewritten_entry_url", url),
 				slog.Int64("feed_id", feed.ID),
@@ -238,7 +239,6 @@ func getUrlFromEntry(feed *model.Feed, entry *model.Entry) string {
 			)
 		} else {
 			slog.Debug("Cannot find search and replace terms for replace rule",
-				slog.Int64("entry_id", entry.ID),
 				slog.String("original_entry_url", entry.URL),
 				slog.String("rewritten_entry_url", url),
 				slog.Int64("feed_id", feed.ID),
diff --git a/internal/storage/entry.go b/internal/storage/entry.go
index 1a7cc6d7..867338f7 100644
--- a/internal/storage/entry.go
+++ b/internal/storage/entry.go
@@ -225,6 +225,12 @@ func (s *Storage) entryExists(tx *sql.Tx, entry *model.Entry) (bool, error) {
 	return result, nil
 }
 
+func (s *Storage) IsNewEntry(feedID int64, entryHash string) bool {
+	var result bool
+	s.db.QueryRow(`SELECT true FROM entries WHERE feed_id=$1 AND hash=$2`, feedID, entryHash).Scan(&result)
+	return !result
+}
+
 // GetReadTime fetches the read time of an entry based on its hash, and the feed id and user id from the feed.
 // It's intended to be used on entries objects created by parsing a feed as they don't contain much information.
 // The feed param helps to scope the search to a specific user and feed in order to avoid hash clashes.
@@ -575,14 +581,6 @@ func (s *Storage) MarkCategoryAsRead(userID, categoryID int64, before time.Time)
 	return nil
 }
 
-// EntryURLExists returns true if an entry with this URL already exists.
-func (s *Storage) EntryURLExists(feedID int64, entryURL string) bool {
-	var result bool
-	query := `SELECT true FROM entries WHERE feed_id=$1 AND url=$2`
-	s.db.QueryRow(query, feedID, entryURL).Scan(&result)
-	return result
-}
-
 // EntryShareCode returns the share code of the provided entry.
 // It generates a new one if not already defined.
 func (s *Storage) EntryShareCode(userID int64, entryID int64) (shareCode string, err error) {
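A note on the new IsNewEntry method: when no row matches, QueryRow(...).Scan returns sql.ErrNoRows and leaves result at its zero value (false), so return !result correctly reports the entry as new; the scan error is deliberately ignored. The following standalone sketch shows the same pattern with the error check made explicit; the DSN is hypothetical and isNewEntry is an illustrative stand-in for the storage method, not code from the patch.

package main

import (
	"database/sql"
	"errors"
	"fmt"

	_ "github.com/lib/pq"
)

// isNewEntry mirrors the patched storage method: an entry is considered
// already known when a row with the same (feed_id, hash) pair exists.
func isNewEntry(db *sql.DB, feedID int64, entryHash string) bool {
	var result bool
	err := db.QueryRow(`SELECT true FROM entries WHERE feed_id=$1 AND hash=$2`, feedID, entryHash).Scan(&result)
	if errors.Is(err, sql.ErrNoRows) {
		return true // no matching row: the entry is new
	}
	return !result
}

func main() {
	// Hypothetical DSN for a local Miniflux database.
	db, err := sql.Open("postgres", "postgres://localhost/miniflux?sslmode=disable")
	if err != nil {
		panic(err)
	}
	defer db.Close()

	fmt.Println(isNewEntry(db, 1, "0123456789abcdef"))
}

Because the hash is a fixed-length digest, the (feed_id, hash) lookup stays within the btree row size limit regardless of URL length, which is what allows the migration to drop entries_feed_url_idx.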