Remove some duplicated code in RSS parser

This commit is contained in:
Frédéric Guillot 2024-03-15 18:04:24 -07:00
parent dd4fb660c1
commit 4834e934f2
4 changed files with 227 additions and 64 deletions

View file

@ -39,7 +39,7 @@ func (r *RSSAdapter) BuildFeed(feedURL string) *model.Feed {
// Try to find the feed URL from the Atom links.
for _, atomLink := range r.rss.Channel.AtomLinks.Links {
atomLinkHref := strings.TrimSpace(atomLink.URL)
atomLinkHref := strings.TrimSpace(atomLink.Href)
if atomLinkHref != "" && atomLink.Rel == "self" {
if absoluteFeedURL, err := urllib.AbsoluteURL(feedURL, atomLinkHref); err == nil {
feed.FeedURL = absoluteFeedURL
@ -170,8 +170,8 @@ func findEntryURL(rssItem *RSSItem) string {
}
for _, atomLink := range rssItem.AtomLinks.Links {
if atomLink.URL != "" && (strings.EqualFold(atomLink.Rel, "alternate") || atomLink.Rel == "") {
return strings.TrimSpace(atomLink.URL)
if atomLink.Href != "" && (strings.EqualFold(atomLink.Rel, "alternate") || atomLink.Rel == "") {
return strings.TrimSpace(atomLink.Href)
}
}
@ -233,8 +233,8 @@ func findEntryAuthor(rssItem *RSSItem) string {
author = rssItem.ItunesAuthor
case rssItem.DublinCoreCreator != "":
author = rssItem.DublinCoreCreator
case rssItem.AtomAuthor.String() != "":
author = rssItem.AtomAuthor.String()
case rssItem.AtomAuthor.PersonName() != "":
author = rssItem.AtomAuthor.PersonName()
case strings.Contains(rssItem.Author.Inner, "<![CDATA["):
author = rssItem.Author.Data
default:

View file

@ -3,41 +3,18 @@
package rss // import "miniflux.app/v2/internal/reader/rss"
import "strings"
import (
"miniflux.app/v2/internal/reader/atom"
)
type AtomAuthor struct {
Author AtomPerson `xml:"http://www.w3.org/2005/Atom author"`
Author atom.AtomPerson `xml:"http://www.w3.org/2005/Atom author"`
}
// String returns the textual form of the author by delegating to the String
// method of the Author field.
func (a *AtomAuthor) String() string {
	person := &a.Author
	return person.String()
}
// AtomPerson holds the name and email child elements of an Atom person construct.
type AtomPerson struct {
	Name  string `xml:"name"`
	Email string `xml:"email"`
}

// String returns the person's display name with surrounding whitespace removed.
//
// The name is preferred; the email address is used when the name is empty or
// contains only whitespace. An empty string is returned when both are blank.
func (a *AtomPerson) String() string {
	// Trim before testing for emptiness so that a whitespace-only name does
	// not hide a usable email address.
	if name := strings.TrimSpace(a.Name); name != "" {
		return name
	}
	return strings.TrimSpace(a.Email)
}
type AtomLink struct {
URL string `xml:"href,attr"`
Type string `xml:"type,attr"`
Rel string `xml:"rel,attr"`
Length string `xml:"length,attr"`
func (a *AtomAuthor) PersonName() string {
return a.Author.PersonName()
}
type AtomLinks struct {
Links []*AtomLink `xml:"http://www.w3.org/2005/Atom link"`
Links []*atom.AtomLink `xml:"http://www.w3.org/2005/Atom link"`
}

View file

@ -746,6 +746,106 @@ func TestParseEntryWithContentEncoded(t *testing.T) {
}
}
// TestParseEntryDescriptionWithEncodedHTMLTags checks that HTML tags written as
// XML entities inside <description> end up as plain HTML in the entry content.
//
// Example taken from https://www.rssboard.org/rss-encoding-examples
func TestParseEntryDescriptionWithEncodedHTMLTags(t *testing.T) {
	data := `<?xml version="1.0" encoding="utf-8"?>
<rss version="2.0" xmlns:content="http://purl.org/rss/1.0/modules/content/">
<channel>
<title>Example</title>
<link>http://example.org/</link>
<item>
<title>Item 1</title>
<link>http://example.org/item1</link>
<description>this is &lt;b&gt;bold&lt;/b&gt;</description>
</item>
</channel>
</rss>`

	feed, err := Parse("https://example.org/", bytes.NewReader([]byte(data)))
	if err != nil {
		t.Fatal(err)
	}

	// Guard the index below so a parser regression fails with a clear message
	// instead of an index-out-of-range panic.
	if len(feed.Entries) == 0 {
		t.Fatal("Incorrect number of entries, got: 0")
	}

	if feed.Entries[0].Content != `this is <b>bold</b>` {
		t.Errorf("Incorrect entry content, got: %q", feed.Entries[0].Content)
	}
}
// TestParseEntryWithDescriptionWithHTMLCDATA checks that HTML wrapped in a CDATA
// section inside <description> is returned unchanged as the entry content.
//
// Example taken from https://www.rssboard.org/rss-encoding-examples
func TestParseEntryWithDescriptionWithHTMLCDATA(t *testing.T) {
	data := `<?xml version="1.0" encoding="utf-8"?>
<rss version="2.0" xmlns:content="http://purl.org/rss/1.0/modules/content/">
<channel>
<title>Example</title>
<link>http://example.org/</link>
<item>
<title>Item 1</title>
<link>http://example.org/item1</link>
<description><![CDATA[this is <b>bold</b>]]></description>
</item>
</channel>
</rss>`

	feed, err := Parse("https://example.org/", bytes.NewReader([]byte(data)))
	if err != nil {
		t.Fatal(err)
	}

	// Guard the index below so a parser regression fails with a clear message
	// instead of an index-out-of-range panic.
	if len(feed.Entries) == 0 {
		t.Fatal("Incorrect number of entries, got: 0")
	}

	if feed.Entries[0].Content != `this is <b>bold</b>` {
		t.Errorf("Incorrect entry content, got: %q", feed.Entries[0].Content)
	}
}
// TestParseEntryDescriptionWithEncodingAngleBracketsInText checks that angle
// brackets meant as literal text (double-escaped as &amp;lt; / &amp;gt; in the
// feed) are still entity-escaped in the entry content.
//
// Example taken from https://www.rssboard.org/rss-encoding-examples
func TestParseEntryDescriptionWithEncodingAngleBracketsInText(t *testing.T) {
	data := `<?xml version="1.0" encoding="utf-8"?>
<rss version="2.0" xmlns:content="http://purl.org/rss/1.0/modules/content/">
<channel>
<title>Example</title>
<link>http://example.org/</link>
<item>
<title>Item 1</title>
<link>http://example.org/item1</link>
<description>5 &amp;lt; 8, ticker symbol &amp;lt;BIGCO&amp;gt;</description>
</item>
</channel>
</rss>`

	feed, err := Parse("https://example.org/", bytes.NewReader([]byte(data)))
	if err != nil {
		t.Fatal(err)
	}

	// Guard the index below so a parser regression fails with a clear message
	// instead of an index-out-of-range panic.
	if len(feed.Entries) == 0 {
		t.Fatal("Incorrect number of entries, got: 0")
	}

	if feed.Entries[0].Content != `5 &lt; 8, ticker symbol &lt;BIGCO&gt;` {
		t.Errorf("Incorrect entry content, got: %q", feed.Entries[0].Content)
	}
}
// TestParseEntryDescriptionWithEncodingAngleBracketsWithinCDATASection checks
// that entity-escaped angle brackets inside a CDATA section are kept verbatim
// in the entry content.
//
// Example taken from https://www.rssboard.org/rss-encoding-examples
func TestParseEntryDescriptionWithEncodingAngleBracketsWithinCDATASection(t *testing.T) {
	data := `<?xml version="1.0" encoding="utf-8"?>
<rss version="2.0" xmlns:content="http://purl.org/rss/1.0/modules/content/">
<channel>
<title>Example</title>
<link>http://example.org/</link>
<item>
<title>Item 1</title>
<link>http://example.org/item1</link>
<description><![CDATA[5 &lt; 8, ticker symbol &lt;BIGCO&gt;]]></description>
</item>
</channel>
</rss>`

	feed, err := Parse("https://example.org/", bytes.NewReader([]byte(data)))
	if err != nil {
		t.Fatal(err)
	}

	// Guard the index below so a parser regression fails with a clear message
	// instead of an index-out-of-range panic.
	if len(feed.Entries) == 0 {
		t.Fatal("Incorrect number of entries, got: 0")
	}

	if feed.Entries[0].Content != `5 &lt; 8, ticker symbol &lt;BIGCO&gt;` {
		t.Errorf("Incorrect entry content, got: %q", feed.Entries[0].Content)
	}
}
func TestParseEntryWithFeedBurnerLink(t *testing.T) {
data := `<?xml version="1.0" encoding="utf-8"?>
<rss version="2.0" xmlns:feedburner="http://rssnamespace.org/feedburner/ext/1.0">

View file

@ -16,29 +16,75 @@ import (
// Specs: https://www.rssboard.org/rss-specification
type RSS struct {
Version string `xml:"rss version,attr"`
// Version is the version of the RSS specification.
Version string `xml:"rss version,attr"`
// Channel is the main container for the RSS feed.
Channel RSSChannel `xml:"rss channel"`
}
type RSSChannel struct {
Title string `xml:"rss title"`
Link string `xml:"rss link"`
Description string `xml:"rss description"`
Language string `xml:"rss language"`
Copyright string `xml:"rss copyRight"`
ManagingEditor string `xml:"rss managingEditor"`
Webmaster string `xml:"rss webMaster"`
PubDate string `xml:"rss pubDate"`
LastBuildDate string `xml:"rss lastBuildDate"`
Categories []string `xml:"rss category"`
Generator string `xml:"rss generator"`
Docs string `xml:"rss docs"`
Cloud *RSSCloud `xml:"rss cloud"`
Image *RSSImage `xml:"rss image"`
TTL string `xml:"rss ttl"`
SkipHours []string `xml:"rss skipHours>hour"`
SkipDays []string `xml:"rss skipDays>day"`
Items []RSSItem `xml:"rss item"`
// Title is the name of the channel.
Title string `xml:"rss title"`
// Link is the URL to the HTML website corresponding to the channel.
Link string `xml:"rss link"`
// Description is a phrase or sentence describing the channel.
Description string `xml:"rss description"`
// Language is the language the channel is written in.
// A list of allowable values for this element, as provided by Netscape, is here: https://www.rssboard.org/rss-language-codes.
// You may also use values defined by the W3C: https://www.w3.org/TR/REC-html40/struct/dirlang.html#langcodes.
Language string `xml:"rss language"`
// Copyright is a string indicating the copyright.
Copyright string `xml:"rss copyRight"`
// ManagingEditor is the email address for the person responsible for editorial content.
ManagingEditor string `xml:"rss managingEditor"`
// Webmaster is the email address for the person responsible for technical issues relating to the channel.
Webmaster string `xml:"rss webMaster"`
// PubDate is the publication date for the content in the channel.
// All date-times in RSS conform to the Date and Time Specification of RFC 822, with the exception that the year may be expressed with two characters or four characters (four preferred).
PubDate string `xml:"rss pubDate"`
// LastBuildDate is the last time the content of the channel changed.
LastBuildDate string `xml:"rss lastBuildDate"`
// Categories is a collection of categories to which the channel belongs.
Categories []string `xml:"rss category"`
// Generator is a string indicating the program used to generate the channel.
Generator string `xml:"rss generator"`
// DocumentationURL is a URL that points to the documentation for the format used in the RSS file.
DocumentationURL string `xml:"rss docs"`
// Cloud is a web service that supports the rssCloud interface which can be implemented in HTTP-POST, XML-RPC or SOAP 1.1.
Cloud *RSSCloud `xml:"rss cloud"`
// Image specifies a GIF, JPEG or PNG image that can be displayed with the channel.
Image *RSSImage `xml:"rss image"`
// TTL is a number of minutes that indicates how long a channel can be cached before refreshing from the source.
TTL string `xml:"rss ttl"`
// SkipHours is a hint for aggregators telling them which hours they can skip.
// An XML element that contains up to 24 <hour> sub-elements whose value is a number between 0 and 23,
// representing a time in GMT, when aggregators,
// if they support the feature, may not read the channel on hours listed in the skipHours element.
SkipHours []string `xml:"rss skipHours>hour"`
// SkipDays is a hint for aggregators telling them which days they can skip.
// An XML element that contains up to seven <day> sub-elements whose value is Monday, Tuesday, Wednesday, Thursday, Friday, Saturday or Sunday.
SkipDays []string `xml:"rss skipDays>day"`
// Items is a collection of items.
Items []RSSItem `xml:"rss item"`
AtomLinks
itunes.ItunesChannelElement
googleplay.GooglePlayChannelElement
@ -64,16 +110,56 @@ type RSSImage struct {
}
type RSSItem struct {
Title string `xml:"rss title"`
Link string `xml:"rss link"`
Description string `xml:"rss description"`
Author RSSAuthor `xml:"rss author"`
Categories []string `xml:"rss category"`
CommentsURL string `xml:"rss comments"`
Enclosures []RSSEnclosure `xml:"rss enclosure"`
GUID RSSGUID `xml:"rss guid"`
PubDate string `xml:"rss pubDate"`
Source RSSSource `xml:"rss source"`
// Title is the title of the item.
Title string `xml:"rss title"`
// Link is the URL of the item.
Link string `xml:"rss link"`
// Description is the item synopsis.
Description string `xml:"rss description"`
// Author is the email address of the author of the item.
Author RSSAuthor `xml:"rss author"`
// <category> is an optional sub-element of <item>.
// It has one optional attribute, domain, a string that identifies a categorization taxonomy.
Categories []string `xml:"rss category"`
// <comments> is an optional sub-element of <item>.
// If present, it contains the URL of the comments page for the item.
CommentsURL string `xml:"rss comments"`
// <enclosure> is an optional sub-element of <item>.
// It has three required attributes. url says where the enclosure is located,
// length says how big it is in bytes, and type says what its type is, a standard MIME type.
Enclosures []RSSEnclosure `xml:"rss enclosure"`
// <guid> is an optional sub-element of <item>.
// It's a string that uniquely identifies the item.
// When present, an aggregator may choose to use this string to determine if an item is new.
//
// There are no rules for the syntax of a guid.
// Aggregators must view them as a string.
// It's up to the source of the feed to establish the uniqueness of the string.
//
// If the guid element has an attribute named isPermaLink with a value of true,
// the reader may assume that it is a permalink to the item, that is, a url that can be opened in a Web browser,
// that points to the full item described by the <item> element.
//
// isPermaLink is optional, its default value is true.
// If its value is false, the guid may not be assumed to be a url, or a url to anything in particular.
GUID RSSGUID `xml:"rss guid"`
// <pubDate> is the publication date of the item.
// Its value is a string in RFC 822 format.
PubDate string `xml:"rss pubDate"`
// <source> is an optional sub-element of <item>.
// Its value is the name of the RSS channel that the item came from, derived from its <title>.
// It has one required attribute, url, which contains the URL of the RSS channel.
Source RSSSource `xml:"rss source"`
dublincore.DublinCoreItemElement
FeedBurnerItemElement
media.MediaItemElement