Remove some duplicated code in RSS parser
This commit is contained in:
parent
dd4fb660c1
commit
4834e934f2
4 changed files with 227 additions and 64 deletions
|
@ -39,7 +39,7 @@ func (r *RSSAdapter) BuildFeed(feedURL string) *model.Feed {
|
|||
|
||||
// Try to find the feed URL from the Atom links.
|
||||
for _, atomLink := range r.rss.Channel.AtomLinks.Links {
|
||||
atomLinkHref := strings.TrimSpace(atomLink.URL)
|
||||
atomLinkHref := strings.TrimSpace(atomLink.Href)
|
||||
if atomLinkHref != "" && atomLink.Rel == "self" {
|
||||
if absoluteFeedURL, err := urllib.AbsoluteURL(feedURL, atomLinkHref); err == nil {
|
||||
feed.FeedURL = absoluteFeedURL
|
||||
|
@ -170,8 +170,8 @@ func findEntryURL(rssItem *RSSItem) string {
|
|||
}
|
||||
|
||||
for _, atomLink := range rssItem.AtomLinks.Links {
|
||||
if atomLink.URL != "" && (strings.EqualFold(atomLink.Rel, "alternate") || atomLink.Rel == "") {
|
||||
return strings.TrimSpace(atomLink.URL)
|
||||
if atomLink.Href != "" && (strings.EqualFold(atomLink.Rel, "alternate") || atomLink.Rel == "") {
|
||||
return strings.TrimSpace(atomLink.Href)
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -233,8 +233,8 @@ func findEntryAuthor(rssItem *RSSItem) string {
|
|||
author = rssItem.ItunesAuthor
|
||||
case rssItem.DublinCoreCreator != "":
|
||||
author = rssItem.DublinCoreCreator
|
||||
case rssItem.AtomAuthor.String() != "":
|
||||
author = rssItem.AtomAuthor.String()
|
||||
case rssItem.AtomAuthor.PersonName() != "":
|
||||
author = rssItem.AtomAuthor.PersonName()
|
||||
case strings.Contains(rssItem.Author.Inner, "<![CDATA["):
|
||||
author = rssItem.Author.Data
|
||||
default:
|
||||
|
|
|
@ -3,41 +3,18 @@
|
|||
|
||||
package rss // import "miniflux.app/v2/internal/reader/rss"
|
||||
|
||||
import "strings"
|
||||
import (
|
||||
"miniflux.app/v2/internal/reader/atom"
|
||||
)
|
||||
|
||||
type AtomAuthor struct {
|
||||
Author AtomPerson `xml:"http://www.w3.org/2005/Atom author"`
|
||||
Author atom.AtomPerson `xml:"http://www.w3.org/2005/Atom author"`
|
||||
}
|
||||
|
||||
func (a *AtomAuthor) String() string {
|
||||
return a.Author.String()
|
||||
}
|
||||
|
||||
type AtomPerson struct {
|
||||
Name string `xml:"name"`
|
||||
Email string `xml:"email"`
|
||||
}
|
||||
|
||||
func (a *AtomPerson) String() string {
|
||||
var name string
|
||||
|
||||
switch {
|
||||
case a.Name != "":
|
||||
name = a.Name
|
||||
case a.Email != "":
|
||||
name = a.Email
|
||||
}
|
||||
|
||||
return strings.TrimSpace(name)
|
||||
}
|
||||
|
||||
type AtomLink struct {
|
||||
URL string `xml:"href,attr"`
|
||||
Type string `xml:"type,attr"`
|
||||
Rel string `xml:"rel,attr"`
|
||||
Length string `xml:"length,attr"`
|
||||
func (a *AtomAuthor) PersonName() string {
|
||||
return a.Author.PersonName()
|
||||
}
|
||||
|
||||
type AtomLinks struct {
|
||||
Links []*AtomLink `xml:"http://www.w3.org/2005/Atom link"`
|
||||
Links []*atom.AtomLink `xml:"http://www.w3.org/2005/Atom link"`
|
||||
}
|
||||
|
|
|
@ -746,6 +746,106 @@ func TestParseEntryWithContentEncoded(t *testing.T) {
|
|||
}
|
||||
}
|
||||
|
||||
// https://www.rssboard.org/rss-encoding-examples
|
||||
func TestParseEntryDescriptionWithEncodedHTMLTags(t *testing.T) {
|
||||
data := `<?xml version="1.0" encoding="utf-8"?>
|
||||
<rss version="2.0" xmlns:content="http://purl.org/rss/1.0/modules/content/">
|
||||
<channel>
|
||||
<title>Example</title>
|
||||
<link>http://example.org/</link>
|
||||
<item>
|
||||
<title>Item 1</title>
|
||||
<link>http://example.org/item1</link>
|
||||
<description>this is &lt;b&gt;bold&lt;/b&gt;</description>
|
||||
</item>
|
||||
</channel>
|
||||
</rss>`
|
||||
|
||||
feed, err := Parse("https://example.org/", bytes.NewReader([]byte(data)))
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
if feed.Entries[0].Content != `this is <b>bold</b>` {
|
||||
t.Errorf("Incorrect entry content, got: %q", feed.Entries[0].Content)
|
||||
}
|
||||
}
|
||||
|
||||
// https://www.rssboard.org/rss-encoding-examples
|
||||
func TestParseEntryWithDescriptionWithHTMLCDATA(t *testing.T) {
|
||||
data := `<?xml version="1.0" encoding="utf-8"?>
|
||||
<rss version="2.0" xmlns:content="http://purl.org/rss/1.0/modules/content/">
|
||||
<channel>
|
||||
<title>Example</title>
|
||||
<link>http://example.org/</link>
|
||||
<item>
|
||||
<title>Item 1</title>
|
||||
<link>http://example.org/item1</link>
|
||||
<description><![CDATA[this is <b>bold</b>]]></description>
|
||||
</item>
|
||||
</channel>
|
||||
</rss>`
|
||||
|
||||
feed, err := Parse("https://example.org/", bytes.NewReader([]byte(data)))
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
if feed.Entries[0].Content != `this is <b>bold</b>` {
|
||||
t.Errorf("Incorrect entry content, got: %q", feed.Entries[0].Content)
|
||||
}
|
||||
}
|
||||
|
||||
// https://www.rssboard.org/rss-encoding-examples
|
||||
func TestParseEntryDescriptionWithEncodingAngleBracketsInText(t *testing.T) {
|
||||
data := `<?xml version="1.0" encoding="utf-8"?>
|
||||
<rss version="2.0" xmlns:content="http://purl.org/rss/1.0/modules/content/">
|
||||
<channel>
|
||||
<title>Example</title>
|
||||
<link>http://example.org/</link>
|
||||
<item>
|
||||
<title>Item 1</title>
|
||||
<link>http://example.org/item1</link>
|
||||
<description>5 &lt; 8, ticker symbol &lt;BIGCO&gt;</description>
|
||||
</item>
|
||||
</channel>
|
||||
</rss>`
|
||||
|
||||
feed, err := Parse("https://example.org/", bytes.NewReader([]byte(data)))
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
if feed.Entries[0].Content != `5 < 8, ticker symbol <BIGCO>` {
|
||||
t.Errorf("Incorrect entry content, got: %q", feed.Entries[0].Content)
|
||||
}
|
||||
}
|
||||
|
||||
// https://www.rssboard.org/rss-encoding-examples
|
||||
func TestParseEntryDescriptionWithEncodingAngleBracketsWithinCDATASection(t *testing.T) {
|
||||
data := `<?xml version="1.0" encoding="utf-8"?>
|
||||
<rss version="2.0" xmlns:content="http://purl.org/rss/1.0/modules/content/">
|
||||
<channel>
|
||||
<title>Example</title>
|
||||
<link>http://example.org/</link>
|
||||
<item>
|
||||
<title>Item 1</title>
|
||||
<link>http://example.org/item1</link>
|
||||
<description><![CDATA[5 < 8, ticker symbol <BIGCO>]]></description>
|
||||
</item>
|
||||
</channel>
|
||||
</rss>`
|
||||
|
||||
feed, err := Parse("https://example.org/", bytes.NewReader([]byte(data)))
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
if feed.Entries[0].Content != `5 < 8, ticker symbol <BIGCO>` {
|
||||
t.Errorf("Incorrect entry content, got: %q", feed.Entries[0].Content)
|
||||
}
|
||||
}
|
||||
|
||||
func TestParseEntryWithFeedBurnerLink(t *testing.T) {
|
||||
data := `<?xml version="1.0" encoding="utf-8"?>
|
||||
<rss version="2.0" xmlns:feedburner="http://rssnamespace.org/feedburner/ext/1.0">
|
||||
|
|
|
@ -16,29 +16,75 @@ import (
|
|||
|
||||
// Specs: https://www.rssboard.org/rss-specification
|
||||
type RSS struct {
|
||||
Version string `xml:"rss version,attr"`
|
||||
// Version is the version of the RSS specification.
|
||||
Version string `xml:"rss version,attr"`
|
||||
|
||||
// Channel is the main container for the RSS feed.
|
||||
Channel RSSChannel `xml:"rss channel"`
|
||||
}
|
||||
|
||||
type RSSChannel struct {
|
||||
Title string `xml:"rss title"`
|
||||
Link string `xml:"rss link"`
|
||||
Description string `xml:"rss description"`
|
||||
Language string `xml:"rss language"`
|
||||
Copyright string `xml:"rss copyRight"`
|
||||
ManagingEditor string `xml:"rss managingEditor"`
|
||||
Webmaster string `xml:"rss webMaster"`
|
||||
PubDate string `xml:"rss pubDate"`
|
||||
LastBuildDate string `xml:"rss lastBuildDate"`
|
||||
Categories []string `xml:"rss category"`
|
||||
Generator string `xml:"rss generator"`
|
||||
Docs string `xml:"rss docs"`
|
||||
Cloud *RSSCloud `xml:"rss cloud"`
|
||||
Image *RSSImage `xml:"rss image"`
|
||||
TTL string `xml:"rss ttl"`
|
||||
SkipHours []string `xml:"rss skipHours>hour"`
|
||||
SkipDays []string `xml:"rss skipDays>day"`
|
||||
Items []RSSItem `xml:"rss item"`
|
||||
// Title is the name of the channel.
|
||||
Title string `xml:"rss title"`
|
||||
|
||||
// Link is the URL to the HTML website corresponding to the channel.
|
||||
Link string `xml:"rss link"`
|
||||
|
||||
// Description is a phrase or sentence describing the channel.
|
||||
Description string `xml:"rss description"`
|
||||
|
||||
// Language is the language the channel is written in.
|
||||
// A list of allowable values for this element, as provided by Netscape, is here: https://www.rssboard.org/rss-language-codes.
|
||||
// You may also use values defined by the W3C: https://www.w3.org/TR/REC-html40/struct/dirlang.html#langcodes.
|
||||
Language string `xml:"rss language"`
|
||||
|
||||
// Copyright is a string indicating the copyright.
|
||||
Copyright string `xml:"rss copyRight"`
|
||||
|
||||
// ManagingEditor is the email address for the person responsible for editorial content.
|
||||
ManagingEditor string `xml:"rss managingEditor"`
|
||||
|
||||
// Webmaster is the email address for the person responsible for technical issues relating to the channel.
|
||||
Webmaster string `xml:"rss webMaster"`
|
||||
|
||||
// PubDate is the publication date for the content in the channel.
|
||||
// All date-times in RSS conform to the Date and Time Specification of RFC 822, with the exception that the year may be expressed with two characters or four characters (four preferred).
|
||||
PubDate string `xml:"rss pubDate"`
|
||||
|
||||
// LastBuildDate is the last time the content of the channel changed.
|
||||
LastBuildDate string `xml:"rss lastBuildDate"`
|
||||
|
||||
// Categories is a collection of categories to which the channel belongs.
|
||||
Categories []string `xml:"rss category"`
|
||||
|
||||
// Generator is a string indicating the program used to generate the channel.
|
||||
Generator string `xml:"rss generator"`
|
||||
|
||||
// DocumentationURL is a URL that points to the documentation for the format used in the RSS file.
|
||||
DocumentationURL string `xml:"rss docs"`
|
||||
|
||||
// Cloud is a web service that supports the rssCloud interface which can be implemented in HTTP-POST, XML-RPC or SOAP 1.1.
|
||||
Cloud *RSSCloud `xml:"rss cloud"`
|
||||
|
||||
// Image specifies a GIF, JPEG or PNG image that can be displayed with the channel.
|
||||
Image *RSSImage `xml:"rss image"`
|
||||
|
||||
// TTL is a number of minutes that indicates how long a channel can be cached before refreshing from the source.
|
||||
TTL string `xml:"rss ttl"`
|
||||
|
||||
// SkipHours is a hint for aggregators telling them which hours they can skip.
|
||||
// An XML element that contains up to 24 <hour> sub-elements whose value is a number between 0 and 23,
|
||||
// representing a time in GMT, when aggregators,
|
||||
// if they support the feature, may not read the channel on hours listed in the skipHours element.
|
||||
SkipHours []string `xml:"rss skipHours>hour"`
|
||||
|
||||
// SkipDays is a hint for aggregators telling them which days they can skip.
|
||||
// An XML element that contains up to seven <day> sub-elements whose value is Monday, Tuesday, Wednesday, Thursday, Friday, Saturday or Sunday.
|
||||
SkipDays []string `xml:"rss skipDays>day"`
|
||||
|
||||
// Items is a collection of items.
|
||||
Items []RSSItem `xml:"rss item"`
|
||||
|
||||
AtomLinks
|
||||
itunes.ItunesChannelElement
|
||||
googleplay.GooglePlayChannelElement
|
||||
|
@ -64,16 +110,56 @@ type RSSImage struct {
|
|||
}
|
||||
|
||||
type RSSItem struct {
|
||||
Title string `xml:"rss title"`
|
||||
Link string `xml:"rss link"`
|
||||
Description string `xml:"rss description"`
|
||||
Author RSSAuthor `xml:"rss author"`
|
||||
Categories []string `xml:"rss category"`
|
||||
CommentsURL string `xml:"rss comments"`
|
||||
Enclosures []RSSEnclosure `xml:"rss enclosure"`
|
||||
GUID RSSGUID `xml:"rss guid"`
|
||||
PubDate string `xml:"rss pubDate"`
|
||||
Source RSSSource `xml:"rss source"`
|
||||
// Title is the title of the item.
|
||||
Title string `xml:"rss title"`
|
||||
|
||||
// Link is the URL of the item.
|
||||
Link string `xml:"rss link"`
|
||||
|
||||
// Description is the item synopsis.
|
||||
Description string `xml:"rss description"`
|
||||
|
||||
// Author is the email address of the author of the item.
|
||||
Author RSSAuthor `xml:"rss author"`
|
||||
|
||||
// <category> is an optional sub-element of <item>.
|
||||
// It has one optional attribute, domain, a string that identifies a categorization taxonomy.
|
||||
Categories []string `xml:"rss category"`
|
||||
|
||||
// <comments> is an optional sub-element of <item>.
|
||||
// If present, it contains the URL of the comments page for the item.
|
||||
CommentsURL string `xml:"rss comments"`
|
||||
|
||||
// <enclosure> is an optional sub-element of <item>.
|
||||
// It has three required attributes. url says where the enclosure is located,
|
||||
// length says how big it is in bytes, and type says what its type is, a standard MIME type.
|
||||
Enclosures []RSSEnclosure `xml:"rss enclosure"`
|
||||
|
||||
// <guid> is an optional sub-element of <item>.
|
||||
// It's a string that uniquely identifies the item.
|
||||
// When present, an aggregator may choose to use this string to determine if an item is new.
|
||||
//
|
||||
// There are no rules for the syntax of a guid.
|
||||
// Aggregators must view them as a string.
|
||||
// It's up to the source of the feed to establish the uniqueness of the string.
|
||||
//
|
||||
// If the guid element has an attribute named isPermaLink with a value of true,
|
||||
// the reader may assume that it is a permalink to the item, that is, a url that can be opened in a Web browser,
|
||||
// that points to the full item described by the <item> element.
|
||||
//
|
||||
// isPermaLink is optional, its default value is true.
|
||||
// If its value is false, the guid may not be assumed to be a url, or a url to anything in particular.
|
||||
GUID RSSGUID `xml:"rss guid"`
|
||||
|
||||
// <pubDate> is the publication date of the item.
|
||||
// Its value is a string in RFC 822 format.
|
||||
PubDate string `xml:"rss pubDate"`
|
||||
|
||||
// <source> is an optional sub-element of <item>.
|
||||
// Its value is the name of the RSS channel that the item came from, derived from its <title>.
|
||||
// It has one required attribute, url, which contains the URL of the RSS channel.
|
||||
Source RSSSource `xml:"rss source"`
|
||||
|
||||
dublincore.DublinCoreItemElement
|
||||
FeedBurnerItemElement
|
||||
media.MediaItemElement
|
||||
|
|
Loading…
Reference in a new issue