// SPDX-FileCopyrightText: Copyright The Miniflux Authors. All rights reserved. // SPDX-License-Identifier: Apache-2.0 package rss // import "miniflux.app/v2/internal/reader/rss" import ( "encoding/xml" "html" "log/slog" "path" "strconv" "strings" "time" "miniflux.app/v2/internal/crypto" "miniflux.app/v2/internal/model" "miniflux.app/v2/internal/reader/date" "miniflux.app/v2/internal/reader/dublincore" "miniflux.app/v2/internal/reader/media" "miniflux.app/v2/internal/reader/sanitizer" "miniflux.app/v2/internal/urllib" ) // Specs: https://www.rssboard.org/rss-specification type rssFeed struct { XMLName xml.Name `xml:"rss"` Version string `xml:"rss version,attr"` Channel rssChannel `xml:"rss channel"` } type rssChannel struct { Title string `xml:"rss title"` Link string `xml:"rss link"` ImageURL string `xml:"rss image>url"` Language string `xml:"rss language"` Description string `xml:"rss description"` PubDate string `xml:"rss pubDate"` ManagingEditor string `xml:"rss managingEditor"` Webmaster string `xml:"rss webMaster"` TimeToLive rssTTL `xml:"rss ttl"` Items []rssItem `xml:"rss item"` AtomLinks PodcastFeedElement } type rssTTL struct { Data string `xml:",chardata"` } func (r *rssTTL) Value() int { if r.Data == "" { return 0 } value, err := strconv.Atoi(r.Data) if err != nil { return 0 } return value } func (r *rssFeed) Transform(baseURL string) *model.Feed { var err error feed := new(model.Feed) siteURL := r.siteURL() feed.SiteURL, err = urllib.AbsoluteURL(baseURL, siteURL) if err != nil { feed.SiteURL = siteURL } feedURL := r.feedURL() feed.FeedURL, err = urllib.AbsoluteURL(baseURL, feedURL) if err != nil { feed.FeedURL = feedURL } feed.Title = html.UnescapeString(strings.TrimSpace(r.Channel.Title)) if feed.Title == "" { feed.Title = feed.SiteURL } feed.IconURL = strings.TrimSpace(r.Channel.ImageURL) feed.TTL = r.Channel.TimeToLive.Value() for _, item := range r.Channel.Items { entry := item.Transform() if entry.Author == "" { entry.Author = r.feedAuthor() } if entry.URL == "" { entry.URL = feed.SiteURL } else { entryURL, err := urllib.AbsoluteURL(feed.SiteURL, entry.URL) if err == nil { entry.URL = entryURL } } if entry.Title == "" { entry.Title = sanitizer.TruncateHTML(entry.Content, 100) } if entry.Title == "" { entry.Title = entry.URL } feed.Entries = append(feed.Entries, entry) } return feed } func (r *rssFeed) siteURL() string { return strings.TrimSpace(r.Channel.Link) } func (r *rssFeed) feedURL() string { for _, atomLink := range r.Channel.AtomLinks.Links { if atomLink.Rel == "self" { return strings.TrimSpace(atomLink.URL) } } return "" } func (r rssFeed) feedAuthor() string { author := r.Channel.PodcastAuthor() switch { case r.Channel.ManagingEditor != "": author = r.Channel.ManagingEditor case r.Channel.Webmaster != "": author = r.Channel.Webmaster case r.Channel.GooglePlayAuthor != "": author = r.Channel.GooglePlayAuthor case r.Channel.PodcastOwner.String() != "": author = r.Channel.PodcastOwner.String() } return sanitizer.StripTags(strings.TrimSpace(author)) } type rssGUID struct { XMLName xml.Name Data string `xml:",chardata"` IsPermaLink string `xml:"isPermaLink,attr"` } type rssAuthor struct { XMLName xml.Name Data string `xml:",chardata"` Inner string `xml:",innerxml"` } type rssEnclosure struct { URL string `xml:"url,attr"` Type string `xml:"type,attr"` Length string `xml:"length,attr"` } type rssCategory struct { XMLName xml.Name Data string `xml:",chardata"` Inner string `xml:",innerxml"` } func (enclosure *rssEnclosure) Size() int64 { if enclosure.Length == "" { return 0 } size, _ := strconv.ParseInt(enclosure.Length, 10, 0) return size } type rssItem struct { GUID rssGUID `xml:"rss guid"` Title string `xml:"rss title"` Link string `xml:"rss link"` Description string `xml:"rss description"` PubDate string `xml:"rss pubDate"` Author rssAuthor `xml:"rss author"` Comments string `xml:"rss comments"` EnclosureLinks []rssEnclosure `xml:"rss enclosure"` Categories []rssCategory `xml:"rss category"` dublincore.DublinCoreItemElement FeedBurnerElement PodcastEntryElement media.Element AtomAuthor AtomLinks } func (r *rssItem) Transform() *model.Entry { entry := model.NewEntry() entry.URL = r.entryURL() entry.CommentsURL = r.entryCommentsURL() entry.Date = r.entryDate() entry.Author = r.entryAuthor() entry.Hash = r.entryHash() entry.Content = r.entryContent() entry.Title = r.entryTitle() entry.Enclosures = r.entryEnclosures() entry.Tags = r.entryCategories() if duration, err := normalizeDuration(r.Duration); err == nil { entry.ReadingTime = duration } return entry } func (r *rssItem) entryDate() time.Time { value := r.PubDate if r.DublinCoreDate != "" { value = r.DublinCoreDate } if value != "" { result, err := date.Parse(value) if err != nil { slog.Debug("Unable to parse date from RSS feed", slog.String("date", value), slog.String("guid", r.GUID.Data), slog.Any("error", err), ) return time.Now() } return result } return time.Now() } func (r *rssItem) entryAuthor() string { var author string switch { case r.PodcastOwner.String() != "": author = r.PodcastOwner.String() case r.GooglePlayAuthor != "": author = r.GooglePlayAuthor case r.ItunesAuthor != "": author = r.ItunesAuthor case r.DublinCoreCreator != "": author = r.DublinCoreCreator case r.AtomAuthor.String() != "": author = r.AtomAuthor.String() case strings.Contains(r.Author.Inner, "