Refactor RSS parser to use default namespace

This change avoids some limitations of the Go XML parser regarding XML namespaces.
This commit is contained in:
Frédéric Guillot 2024-03-11 20:43:14 -07:00
parent d3a85b049b
commit 9a637ce95e
6 changed files with 185 additions and 181 deletions

View file

@ -12,6 +12,7 @@ import (
var textLinkRegex = regexp.MustCompile(`(?mi)(\bhttps?:\/\/[-A-Z0-9+&@#\/%?=~_|!:,.;]*[-A-Z0-9+&@#\/%=~_|])`) var textLinkRegex = regexp.MustCompile(`(?mi)(\bhttps?:\/\/[-A-Z0-9+&@#\/%?=~_|!:,.;]*[-A-Z0-9+&@#\/%=~_|])`)
// Element represents XML media elements. // Element represents XML media elements.
// Specs: https://www.rssboard.org/media-rss
type Element struct { type Element struct {
MediaGroups []Group `xml:"http://search.yahoo.com/mrss/ group"` MediaGroups []Group `xml:"http://search.yahoo.com/mrss/ group"`
MediaContents []Content `xml:"http://search.yahoo.com/mrss/ content"` MediaContents []Content `xml:"http://search.yahoo.com/mrss/ content"`

View file

@ -0,0 +1,43 @@
// SPDX-FileCopyrightText: Copyright The Miniflux Authors. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
package rss // import "miniflux.app/v2/internal/reader/rss"
import "strings"
// AtomAuthor captures an author element declared in the Atom namespace.
type AtomAuthor struct {
// Author is decoded from the <author> element whose namespace is
// http://www.w3.org/2005/Atom.
Author AtomPerson `xml:"http://www.w3.org/2005/Atom author"`
}
// String returns the textual form of the wrapped Atom person, exactly as
// produced by AtomPerson.String.
func (a *AtomAuthor) String() string {
	person := &a.Author
	return person.String()
}
// AtomPerson describes a person by name and email address, decoded from the
// <name> and <email> child elements.
type AtomPerson struct {
	// Name is the content of the <name> child element.
	Name string `xml:"name"`
	// Email is the content of the <email> child element.
	Email string `xml:"email"`
}

// String returns the person's name, or the email address when no usable name
// is present. It returns "" when both values are blank.
//
// Each value is trimmed before it is tested: a name made only of whitespace
// (for example an indented but empty <name> element) must not shadow a valid
// email address.
func (a *AtomPerson) String() string {
	if name := strings.TrimSpace(a.Name); name != "" {
		return name
	}
	return strings.TrimSpace(a.Email)
}
// AtomLink holds the attributes read from a single link element.
type AtomLink struct {
// URL is the value of the "href" attribute.
URL string `xml:"href,attr"`
// Type is the value of the "type" attribute.
Type string `xml:"type,attr"`
// Rel is the value of the "rel" attribute.
Rel string `xml:"rel,attr"`
// Length is the value of the "length" attribute, kept as the raw string.
Length string `xml:"length,attr"`
}
// AtomLinks collects the link elements declared in the Atom namespace.
type AtomLinks struct {
// Links receives every <link> element whose namespace is
// http://www.w3.org/2005/Atom.
Links []*AtomLink `xml:"http://www.w3.org/2005/Atom link"`
}

View file

@ -14,7 +14,9 @@ import (
// Parse returns a normalized feed struct from a RSS feed. // Parse returns a normalized feed struct from a RSS feed.
func Parse(baseURL string, data io.ReadSeeker) (*model.Feed, error) { func Parse(baseURL string, data io.ReadSeeker) (*model.Feed, error) {
feed := new(rssFeed) feed := new(rssFeed)
if err := xml.NewXMLDecoder(data).Decode(feed); err != nil { decoder := xml.NewXMLDecoder(data)
decoder.DefaultSpace = "rss"
if err := decoder.Decode(feed); err != nil {
return nil, fmt.Errorf("rss: unable to parse feed: %w", err) return nil, fmt.Errorf("rss: unable to parse feed: %w", err)
} }
return feed.Transform(baseURL), nil return feed.Transform(baseURL), nil

View file

@ -300,7 +300,7 @@ func TestParseEntryWithMultipleAtomLinks(t *testing.T) {
<item> <item>
<title>Test</title> <title>Test</title>
<atom:link rel="payment" href="https://example.org/a" /> <atom:link rel="payment" href="https://example.org/a" />
<atom:link rel="http://foobar.tld" href="https://example.org/b" /> <atom:link rel="alternate" href="https://example.org/b" />
</item> </item>
</channel> </channel>
</rss>` </rss>`
@ -430,7 +430,7 @@ func TestParseEntryWithAuthorAndCDATA(t *testing.T) {
<title>Test</title> <title>Test</title>
<link>https://example.org/item</link> <link>https://example.org/item</link>
<author> <author>
by <![CDATA[Foo Bar]]> <![CDATA[by Foo Bar]]>
</author> </author>
</item> </item>
</channel> </channel>
@ -447,38 +447,6 @@ func TestParseEntryWithAuthorAndCDATA(t *testing.T) {
} }
} }
// TestParseEntryWithNonStandardAtomAuthor parses an item whose un-prefixed
// <author> element contains child elements (<name>, <title>, <company>, ...)
// and expects the entry author to be the content of <name>.
func TestParseEntryWithNonStandardAtomAuthor(t *testing.T) {
	data := `<?xml version="1.0" encoding="utf-8"?>
<rss xmlns:atom="http://www.w3.org/2005/Atom" version="2.0">
<channel>
<title>Example</title>
<link>https://example.org/</link>
<atom:link href="https://example.org/rss" type="application/rss+xml" rel="self"></atom:link>
<item>
<title>Test</title>
<link>https://example.org/item</link>
<author xmlns:author="http://www.w3.org/2005/Atom">
<name>Foo Bar</name>
<title>Vice President</title>
<department/>
<company>FooBar Inc.</company>
</author>
</item>
</channel>
</rss>`

	feed, err := Parse("https://example.org/", bytes.NewReader([]byte(data)))
	if err != nil {
		t.Fatal(err)
	}

	// Abort before indexing: without this guard a feed parsed with no entries
	// would make the test panic instead of reporting a failure.
	if len(feed.Entries) != 1 {
		t.Fatalf("Incorrect number of entries, got: %d", len(feed.Entries))
	}

	expected := "Foo Bar"
	result := feed.Entries[0].Author
	if result != expected {
		t.Errorf("Incorrect entry author, got %q instead of %q", result, expected)
	}
}
func TestParseEntryWithAtomAuthorEmail(t *testing.T) { func TestParseEntryWithAtomAuthorEmail(t *testing.T) {
data := `<?xml version="1.0" encoding="utf-8"?> data := `<?xml version="1.0" encoding="utf-8"?>
<rss xmlns:atom="http://www.w3.org/2005/Atom" version="2.0"> <rss xmlns:atom="http://www.w3.org/2005/Atom" version="2.0">
@ -508,7 +476,7 @@ func TestParseEntryWithAtomAuthorEmail(t *testing.T) {
} }
} }
func TestParseEntryWithAtomAuthor(t *testing.T) { func TestParseEntryWithAtomAuthorName(t *testing.T) {
data := `<?xml version="1.0" encoding="utf-8"?> data := `<?xml version="1.0" encoding="utf-8"?>
<rss xmlns:atom="http://www.w3.org/2005/Atom" version="2.0"> <rss xmlns:atom="http://www.w3.org/2005/Atom" version="2.0">
<channel> <channel>
@ -1435,6 +1403,37 @@ func TestEntryDescriptionFromGooglePlayDescription(t *testing.T) {
} }
} }
// TestParseEntryWithRSSDescriptionAndMediaDescription parses an item that
// carries both a plain <description> and a <media:description>, and expects
// the entry content to come from the plain <description> element.
func TestParseEntryWithRSSDescriptionAndMediaDescription(t *testing.T) {
	data := `<?xml version="1.0" encoding="UTF-8"?>
<rss version="2.0" xmlns:media="http://search.yahoo.com/mrss/">
<channel>
<title>Podcast Example</title>
<link>http://www.example.com/index.html</link>
<item>
<title>Entry Title</title>
<link>http://www.example.com/entries/1</link>
<description>Entry Description</description>
<media:description type="plain">Media Description</media:description>
</item>
</channel>
</rss>`

	feed, err := Parse("https://example.org/", bytes.NewReader([]byte(data)))
	if err != nil {
		t.Fatal(err)
	}

	// Fatalf rather than Errorf: Entries[0] is read below, so continuing with
	// an empty slice would panic and hide the real failure message.
	if len(feed.Entries) != 1 {
		t.Fatalf("Incorrect number of entries, got: %d", len(feed.Entries))
	}

	expected := "Entry Description"
	result := feed.Entries[0].Content
	if expected != result {
		t.Errorf(`Unexpected description, got %q instead of %q`, result, expected)
	}
}
func TestParseEntryWithCategoryAndInnerHTML(t *testing.T) { func TestParseEntryWithCategoryAndInnerHTML(t *testing.T) {
data := `<?xml version="1.0" encoding="utf-8"?> data := `<?xml version="1.0" encoding="utf-8"?>
<rss xmlns:atom="http://www.w3.org/2005/Atom" version="2.0"> <rss xmlns:atom="http://www.w3.org/2005/Atom" version="2.0">

View file

@ -15,21 +15,24 @@ var ErrInvalidDurationFormat = errors.New("rss: invalid duration format")
// PodcastFeedElement represents iTunes and GooglePlay feed XML elements. // PodcastFeedElement represents iTunes and GooglePlay feed XML elements.
// Specs: // Specs:
// - https://github.com/simplepie/simplepie-ng/wiki/Spec:-iTunes-Podcast-RSS // - https://github.com/simplepie/simplepie-ng/wiki/Spec:-iTunes-Podcast-RSS
// - https://developers.google.com/search/reference/podcast/rss-feed // - https://support.google.com/podcast-publishers/answer/9889544
type PodcastFeedElement struct { type PodcastFeedElement struct {
ItunesAuthor string `xml:"http://www.itunes.com/dtds/podcast-1.0.dtd channel>author"` ItunesAuthor string `xml:"http://www.itunes.com/dtds/podcast-1.0.dtd author"`
Subtitle string `xml:"http://www.itunes.com/dtds/podcast-1.0.dtd channel>subtitle"` Subtitle string `xml:"http://www.itunes.com/dtds/podcast-1.0.dtd subtitle"`
Summary string `xml:"http://www.itunes.com/dtds/podcast-1.0.dtd channel>summary"` Summary string `xml:"http://www.itunes.com/dtds/podcast-1.0.dtd summary"`
PodcastOwner PodcastOwner `xml:"http://www.itunes.com/dtds/podcast-1.0.dtd channel>owner"` PodcastOwner PodcastOwner `xml:"http://www.itunes.com/dtds/podcast-1.0.dtd owner"`
GooglePlayAuthor string `xml:"http://www.google.com/schemas/play-podcasts/1.0 channel>author"` GooglePlayAuthor string `xml:"http://www.google.com/schemas/play-podcasts/1.0 author"`
} }
// PodcastEntryElement represents iTunes and GooglePlay entry XML elements. // PodcastEntryElement represents iTunes and GooglePlay entry XML elements.
type PodcastEntryElement struct { type PodcastEntryElement struct {
Subtitle string `xml:"http://www.itunes.com/dtds/podcast-1.0.dtd subtitle"` ItunesAuthor string `xml:"http://www.itunes.com/dtds/podcast-1.0.dtd author"`
Summary string `xml:"http://www.itunes.com/dtds/podcast-1.0.dtd summary"` Subtitle string `xml:"http://www.itunes.com/dtds/podcast-1.0.dtd subtitle"`
Duration string `xml:"http://www.itunes.com/dtds/podcast-1.0.dtd duration"` Summary string `xml:"http://www.itunes.com/dtds/podcast-1.0.dtd summary"`
GooglePlayDescription string `xml:"http://www.google.com/schemas/play-podcasts/1.0 description"` Duration string `xml:"http://www.itunes.com/dtds/podcast-1.0.dtd duration"`
PodcastOwner PodcastOwner `xml:"http://www.itunes.com/dtds/podcast-1.0.dtd owner"`
GooglePlayAuthor string `xml:"http://www.google.com/schemas/play-podcasts/1.0 author"`
GooglePlayDescription string `xml:"http://www.google.com/schemas/play-podcasts/1.0 description"`
} }
// PodcastOwner represents contact information for the podcast owner. // PodcastOwner represents contact information for the podcast owner.
@ -38,6 +41,19 @@ type PodcastOwner struct {
Email string `xml:"http://www.itunes.com/dtds/podcast-1.0.dtd email"` Email string `xml:"http://www.itunes.com/dtds/podcast-1.0.dtd email"`
} }
// String returns the owner's name, or the email address when no usable name
// is present. It returns "" when both values are blank.
//
// Each value is trimmed before it is tested: a name made only of whitespace
// must not shadow a valid email address.
func (p *PodcastOwner) String() string {
	if name := strings.TrimSpace(p.Name); name != "" {
		return name
	}
	return strings.TrimSpace(p.Email)
}
// Image represents podcast artwork. // Image represents podcast artwork.
type Image struct { type Image struct {
URL string `xml:"href,attr"` URL string `xml:"href,attr"`
@ -52,10 +68,8 @@ func (e *PodcastFeedElement) PodcastAuthor() string {
author = e.ItunesAuthor author = e.ItunesAuthor
case e.GooglePlayAuthor != "": case e.GooglePlayAuthor != "":
author = e.GooglePlayAuthor author = e.GooglePlayAuthor
case e.PodcastOwner.Name != "": case e.PodcastOwner.String() != "":
author = e.PodcastOwner.Name author = e.PodcastOwner.String()
case e.PodcastOwner.Email != "":
author = e.PodcastOwner.Email
} }
return strings.TrimSpace(author) return strings.TrimSpace(author)

View file

@ -21,20 +21,25 @@ import (
"miniflux.app/v2/internal/urllib" "miniflux.app/v2/internal/urllib"
) )
// Specs: https://cyber.harvard.edu/rss/rss.html // Specs: https://www.rssboard.org/rss-specification
type rssFeed struct { type rssFeed struct {
XMLName xml.Name `xml:"rss"` XMLName xml.Name `xml:"rss"`
Version string `xml:"version,attr"` Version string `xml:"rss version,attr"`
Title string `xml:"channel>title"` Channel rssChannel `xml:"rss channel"`
Links []rssLink `xml:"channel>link"` }
ImageURL string `xml:"channel>image>url"`
Language string `xml:"channel>language"` type rssChannel struct {
Description string `xml:"channel>description"` Title string `xml:"rss title"`
PubDate string `xml:"channel>pubDate"` Link string `xml:"rss link"`
ManagingEditor string `xml:"channel>managingEditor"` ImageURL string `xml:"rss image>url"`
Webmaster string `xml:"channel>webMaster"` Language string `xml:"rss language"`
TimeToLive rssTTL `xml:"channel>ttl"` Description string `xml:"rss description"`
Items []rssItem `xml:"channel>item"` PubDate string `xml:"rss pubDate"`
ManagingEditor string `xml:"rss managingEditor"`
Webmaster string `xml:"rss webMaster"`
TimeToLive rssTTL `xml:"rss ttl"`
Items []rssItem `xml:"rss item"`
AtomLinks
PodcastFeedElement PodcastFeedElement
} }
@ -72,15 +77,15 @@ func (r *rssFeed) Transform(baseURL string) *model.Feed {
feed.FeedURL = feedURL feed.FeedURL = feedURL
} }
feed.Title = html.UnescapeString(strings.TrimSpace(r.Title)) feed.Title = html.UnescapeString(strings.TrimSpace(r.Channel.Title))
if feed.Title == "" { if feed.Title == "" {
feed.Title = feed.SiteURL feed.Title = feed.SiteURL
} }
feed.IconURL = strings.TrimSpace(r.ImageURL) feed.IconURL = strings.TrimSpace(r.Channel.ImageURL)
feed.TTL = r.TimeToLive.Value() feed.TTL = r.Channel.TimeToLive.Value()
for _, item := range r.Items { for _, item := range r.Channel.Items {
entry := item.Transform() entry := item.Transform()
if entry.Author == "" { if entry.Author == "" {
entry.Author = r.feedAuthor() entry.Author = r.feedAuthor()
@ -110,32 +115,29 @@ func (r *rssFeed) Transform(baseURL string) *model.Feed {
} }
func (r *rssFeed) siteURL() string { func (r *rssFeed) siteURL() string {
for _, element := range r.Links { return strings.TrimSpace(r.Channel.Link)
if element.XMLName.Space == "" {
return strings.TrimSpace(element.Data)
}
}
return ""
} }
func (r *rssFeed) feedURL() string { func (r *rssFeed) feedURL() string {
for _, element := range r.Links { for _, atomLink := range r.Channel.AtomLinks.Links {
if element.XMLName.Space == "http://www.w3.org/2005/Atom" { if atomLink.Rel == "self" {
return strings.TrimSpace(element.Href) return strings.TrimSpace(atomLink.URL)
} }
} }
return "" return ""
} }
func (r rssFeed) feedAuthor() string { func (r rssFeed) feedAuthor() string {
author := r.PodcastAuthor() author := r.Channel.PodcastAuthor()
switch { switch {
case r.ManagingEditor != "": case r.Channel.ManagingEditor != "":
author = r.ManagingEditor author = r.Channel.ManagingEditor
case r.Webmaster != "": case r.Channel.Webmaster != "":
author = r.Webmaster author = r.Channel.Webmaster
case r.Channel.GooglePlayAuthor != "":
author = r.Channel.GooglePlayAuthor
case r.Channel.PodcastOwner.String() != "":
author = r.Channel.PodcastOwner.String()
} }
return sanitizer.StripTags(strings.TrimSpace(author)) return sanitizer.StripTags(strings.TrimSpace(author))
} }
@ -146,27 +148,7 @@ type rssGUID struct {
IsPermaLink string `xml:"isPermaLink,attr"` IsPermaLink string `xml:"isPermaLink,attr"`
} }
// rssLink holds one link element together with its qualified name, so callers
// can tell links from different namespaces apart.
type rssLink struct {
// XMLName records the element name and namespace of the decoded element.
XMLName xml.Name
// Data is the character data of the element.
Data string `xml:",chardata"`
// Href is the value of the "href" attribute.
Href string `xml:"href,attr"`
// Rel is the value of the "rel" attribute.
Rel string `xml:"rel,attr"`
}
// rssCommentLink holds one comments element together with its qualified name.
type rssCommentLink struct {
// XMLName records the element name and namespace of the decoded element.
XMLName xml.Name
// Data is the character data of the element.
Data string `xml:",chardata"`
}
type rssAuthor struct { type rssAuthor struct {
XMLName xml.Name
Data string `xml:",chardata"`
Name string `xml:"name"`
Email string `xml:"email"`
Inner string `xml:",innerxml"`
}
type rssTitle struct {
XMLName xml.Name XMLName xml.Name
Data string `xml:",chardata"` Data string `xml:",chardata"`
Inner string `xml:",innerxml"` Inner string `xml:",innerxml"`
@ -193,19 +175,21 @@ func (enclosure *rssEnclosure) Size() int64 {
} }
type rssItem struct { type rssItem struct {
GUID rssGUID `xml:"guid"` GUID rssGUID `xml:"rss guid"`
Title []rssTitle `xml:"title"` Title string `xml:"rss title"`
Links []rssLink `xml:"link"` Link string `xml:"rss link"`
Description string `xml:"description"` Description string `xml:"rss description"`
PubDate string `xml:"pubDate"` PubDate string `xml:"rss pubDate"`
Authors []rssAuthor `xml:"author"` Author rssAuthor `xml:"rss author"`
CommentLinks []rssCommentLink `xml:"comments"` Comments string `xml:"rss comments"`
EnclosureLinks []rssEnclosure `xml:"enclosure"` EnclosureLinks []rssEnclosure `xml:"rss enclosure"`
Categories []rssCategory `xml:"category"` Categories []rssCategory `xml:"rss category"`
dublincore.DublinCoreItemElement dublincore.DublinCoreItemElement
FeedBurnerElement FeedBurnerElement
PodcastEntryElement PodcastEntryElement
media.Element media.Element
AtomAuthor
AtomLinks
} }
func (r *rssItem) Transform() *model.Entry { func (r *rssItem) Transform() *model.Entry {
@ -250,34 +234,26 @@ func (r *rssItem) entryDate() time.Time {
} }
func (r *rssItem) entryAuthor() string { func (r *rssItem) entryAuthor() string {
author := "" var author string
for _, rssAuthor := range r.Authors { switch {
switch rssAuthor.XMLName.Space { case r.PodcastOwner.String() != "":
case "http://www.itunes.com/dtds/podcast-1.0.dtd", "http://www.google.com/schemas/play-podcasts/1.0": author = r.PodcastOwner.String()
author = rssAuthor.Data case r.GooglePlayAuthor != "":
case "http://www.w3.org/2005/Atom": author = r.GooglePlayAuthor
if rssAuthor.Name != "" { case r.ItunesAuthor != "":
author = rssAuthor.Name author = r.ItunesAuthor
} else if rssAuthor.Email != "" { case r.DublinCoreCreator != "":
author = rssAuthor.Email author = r.DublinCoreCreator
} case r.AtomAuthor.String() != "":
default: author = r.AtomAuthor.String()
if rssAuthor.Name != "" { case strings.Contains(r.Author.Inner, "<![CDATA["):
author = rssAuthor.Name author = r.Author.Data
} else if strings.Contains(rssAuthor.Inner, "<![CDATA[") { default:
author = rssAuthor.Data author = r.Author.Inner
} else {
author = rssAuthor.Inner
}
}
} }
if author == "" { return strings.TrimSpace(sanitizer.StripTags(author))
author = r.GetSanitizedCreator()
}
return sanitizer.StripTags(strings.TrimSpace(author))
} }
func (r *rssItem) entryHash() string { func (r *rssItem) entryHash() string {
@ -291,21 +267,10 @@ func (r *rssItem) entryHash() string {
} }
func (r *rssItem) entryTitle() string { func (r *rssItem) entryTitle() string {
var title string title := r.Title
for _, rssTitle := range r.Title { if r.DublinCoreTitle != "" {
switch rssTitle.XMLName.Space { title = r.DublinCoreTitle
case "http://search.yahoo.com/mrss/":
// Ignore title in media namespace
case "http://purl.org/dc/elements/1.1/":
title = rssTitle.Data
default:
title = rssTitle.Data
}
if title != "" {
break
}
} }
return html.UnescapeString(strings.TrimSpace(title)) return html.UnescapeString(strings.TrimSpace(title))
@ -321,17 +286,15 @@ func (r *rssItem) entryContent() string {
} }
func (r *rssItem) entryURL() string { func (r *rssItem) entryURL() string {
if r.FeedBurnerLink != "" { for _, link := range []string{r.FeedBurnerLink, r.Link} {
return r.FeedBurnerLink if link != "" {
return strings.TrimSpace(link)
}
} }
for _, link := range r.Links { for _, atomLink := range r.AtomLinks.Links {
if link.XMLName.Space == "http://www.w3.org/2005/Atom" && link.Href != "" && isValidLinkRelation(link.Rel) { if atomLink.URL != "" && (strings.EqualFold(atomLink.Rel, "alternate") || atomLink.Rel == "") {
return strings.TrimSpace(link.Href) return strings.TrimSpace(atomLink.URL)
}
if link.Data != "" {
return strings.TrimSpace(link.Data)
} }
} }
@ -425,28 +388,10 @@ func (r *rssItem) entryCategories() []string {
} }
func (r *rssItem) entryCommentsURL() string { func (r *rssItem) entryCommentsURL() string {
for _, commentLink := range r.CommentLinks { commentsURL := strings.TrimSpace(r.Comments)
if commentLink.XMLName.Space == "" { if commentsURL != "" && urllib.IsAbsoluteURL(commentsURL) {
commentsURL := strings.TrimSpace(commentLink.Data) return commentsURL
// The comments URL is supposed to be absolute (some feeds publishes incorrect comments URL)
// See https://cyber.harvard.edu/rss/rss.html#ltcommentsgtSubelementOfLtitemgt
if urllib.IsAbsoluteURL(commentsURL) {
return commentsURL
}
}
} }
return "" return ""
} }
// isValidLinkRelation reports whether rel is an accepted link relation: the
// empty string, one of "alternate", "enclosure", "related", "self" or "via"
// (matched exactly, case-sensitively), or any value starting with "http".
func isValidLinkRelation(rel string) bool {
	accepted := [...]string{"", "alternate", "enclosure", "related", "self", "via"}
	for _, candidate := range accepted {
		if rel == candidate {
			return true
		}
	}
	return strings.HasPrefix(rel, "http")
}