Refactor RSS parser to use default namespace

This change avoids some limitations of the Go XML parser regarding XML namespaces.
2024-03-11 20:43:14 -07:00 · 2024-03-11 20:43:14 -07:00 · 9a637ce95e
commit 9a637ce95e
parent d3a85b049b
6 changed files with 185 additions and 181 deletions
--- a/internal/reader/media/media.go
+++ b/internal/reader/media/media.go
@ -12,6 +12,7 @@ import (
 var textLinkRegex = regexp.MustCompile(`(?mi)(\bhttps?:\/\/[-A-Z0-9+&@#\/%?=~_|!:,.;]*[-A-Z0-9+&@#\/%=~_|])`)

 // Element represents XML media elements.
+// Specs: https://www.rssboard.org/media-rss
 type Element struct {
 	MediaGroups       []Group         `xml:"http://search.yahoo.com/mrss/ group"`
 	MediaContents     []Content       `xml:"http://search.yahoo.com/mrss/ content"`
--- a/internal/reader/rss/atom.go
+++ b/internal/reader/rss/atom.go
@ -0,0 +1,43 @@
+// SPDX-FileCopyrightText: Copyright The Miniflux Authors. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+package rss // import "miniflux.app/v2/internal/reader/rss"
+
+import "strings"
+
+type AtomAuthor struct {
+	Author AtomPerson `xml:"http://www.w3.org/2005/Atom author"`
+}
+
+func (a *AtomAuthor) String() string {
+	return a.Author.String()
+}
+
+type AtomPerson struct {
+	Name  string `xml:"name"`
+	Email string `xml:"email"`
+}
+
+func (a *AtomPerson) String() string {
+	var name string
+
+	switch {
+	case a.Name != "":
+		name = a.Name
+	case a.Email != "":
+		name = a.Email
+	}
+
+	return strings.TrimSpace(name)
+}
+
+type AtomLink struct {
+	URL    string `xml:"href,attr"`
+	Type   string `xml:"type,attr"`
+	Rel    string `xml:"rel,attr"`
+	Length string `xml:"length,attr"`
+}
+
+type AtomLinks struct {
+	Links []*AtomLink `xml:"http://www.w3.org/2005/Atom link"`
+}
--- a/internal/reader/rss/parser.go
+++ b/internal/reader/rss/parser.go
@ -14,7 +14,9 @@ import (
 // Parse returns a normalized feed struct from a RSS feed.
 func Parse(baseURL string, data io.ReadSeeker) (*model.Feed, error) {
 	feed := new(rssFeed)
-	if err := xml.NewXMLDecoder(data).Decode(feed); err != nil {
+	decoder := xml.NewXMLDecoder(data)
+	decoder.DefaultSpace = "rss"
+	if err := decoder.Decode(feed); err != nil {
 		return nil, fmt.Errorf("rss: unable to parse feed: %w", err)
 	}
 	return feed.Transform(baseURL), nil
--- a/internal/reader/rss/parser_test.go
+++ b/internal/reader/rss/parser_test.go
@ -300,7 +300,7 @@ func TestParseEntryWithMultipleAtomLinks(t *testing.T) {
 			<item>
 				<title>Test</title>
 				<atom:link rel="payment" href="https://example.org/a" />
-				<atom:link rel="http://foobar.tld" href="https://example.org/b" />
+				<atom:link rel="alternate" href="https://example.org/b" />
 			</item>
 		</channel>
 		</rss>`
@ -430,7 +430,7 @@ func TestParseEntryWithAuthorAndCDATA(t *testing.T) {
 				<title>Test</title>
 				<link>https://example.org/item</link>
 				<author>
-					by <![CDATA[Foo Bar]]>
+					<![CDATA[by Foo Bar]]>
 				</author>
 			</item>
 		</channel>
@ -447,38 +447,6 @@ func TestParseEntryWithAuthorAndCDATA(t *testing.T) {
 	}
 }

-func TestParseEntryWithNonStandardAtomAuthor(t *testing.T) {
-	data := `<?xml version="1.0" encoding="utf-8"?>
-		<rss xmlns:atom="http://www.w3.org/2005/Atom" version="2.0">
-		<channel>
-			<title>Example</title>
-			<link>https://example.org/</link>
-			<atom:link href="https://example.org/rss" type="application/rss+xml" rel="self"></atom:link>
-			<item>
-				<title>Test</title>
-				<link>https://example.org/item</link>
-				<author xmlns:author="http://www.w3.org/2005/Atom">
-					<name>Foo Bar</name>
-					<title>Vice President</title>
-					<department/>
-					<company>FooBar Inc.</company>
-				</author>
-			</item>
-		</channel>
-		</rss>`
-
-	feed, err := Parse("https://example.org/", bytes.NewReader([]byte(data)))
-	if err != nil {
-		t.Fatal(err)
-	}
-
-	expected := "Foo Bar"
-	result := feed.Entries[0].Author
-	if result != expected {
-		t.Errorf("Incorrect entry author, got %q instead of %q", result, expected)
-	}
-}
-
 func TestParseEntryWithAtomAuthorEmail(t *testing.T) {
 	data := `<?xml version="1.0" encoding="utf-8"?>
 		<rss xmlns:atom="http://www.w3.org/2005/Atom" version="2.0">
@ -508,7 +476,7 @@ func TestParseEntryWithAtomAuthorEmail(t *testing.T) {
 	}
 }

-func TestParseEntryWithAtomAuthor(t *testing.T) {
+func TestParseEntryWithAtomAuthorName(t *testing.T) {
 	data := `<?xml version="1.0" encoding="utf-8"?>
 		<rss xmlns:atom="http://www.w3.org/2005/Atom" version="2.0">
 		<channel>
@ -1435,6 +1403,37 @@ func TestEntryDescriptionFromGooglePlayDescription(t *testing.T) {
 	}
 }

+func TestParseEntryWithRSSDescriptionAndMediaDescription(t *testing.T) {
+	data := `<?xml version="1.0" encoding="UTF-8"?>
+	<rss version="2.0" xmlns:media="http://search.yahoo.com/mrss/">
+		<channel>
+			<title>Podcast Example</title>
+			<link>http://www.example.com/index.html</link>
+			<item>
+				<title>Entry Title</title>
+				<link>http://www.example.com/entries/1</link>
+				<description>Entry Description</description>
+				<media:description type="plain">Media Description</media:description>
+			</item>
+		</channel>
+	</rss>`
+
+	feed, err := Parse("https://example.org/", bytes.NewReader([]byte(data)))
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	if len(feed.Entries) != 1 {
+		t.Errorf("Incorrect number of entries, got: %d", len(feed.Entries))
+	}
+
+	expected := "Entry Description"
+	result := feed.Entries[0].Content
+	if expected != result {
+		t.Errorf(`Unexpected description, got %q instead of %q`, result, expected)
+	}
+}
+
 func TestParseEntryWithCategoryAndInnerHTML(t *testing.T) {
 	data := `<?xml version="1.0" encoding="utf-8"?>
 		<rss xmlns:atom="http://www.w3.org/2005/Atom" version="2.0">
--- a/internal/reader/rss/podcast.go
+++ b/internal/reader/rss/podcast.go
@ -15,21 +15,24 @@ var ErrInvalidDurationFormat = errors.New("rss: invalid duration format")
 // PodcastFeedElement represents iTunes and GooglePlay feed XML elements.
 // Specs:
 // - https://github.com/simplepie/simplepie-ng/wiki/Spec:-iTunes-Podcast-RSS
-// - https://developers.google.com/search/reference/podcast/rss-feed
+// - https://support.google.com/podcast-publishers/answer/9889544
 type PodcastFeedElement struct {
-	ItunesAuthor     string       `xml:"http://www.itunes.com/dtds/podcast-1.0.dtd channel>author"`
-	Subtitle         string       `xml:"http://www.itunes.com/dtds/podcast-1.0.dtd channel>subtitle"`
-	Summary          string       `xml:"http://www.itunes.com/dtds/podcast-1.0.dtd channel>summary"`
-	PodcastOwner     PodcastOwner `xml:"http://www.itunes.com/dtds/podcast-1.0.dtd channel>owner"`
-	GooglePlayAuthor string       `xml:"http://www.google.com/schemas/play-podcasts/1.0 channel>author"`
+	ItunesAuthor     string       `xml:"http://www.itunes.com/dtds/podcast-1.0.dtd author"`
+	Subtitle         string       `xml:"http://www.itunes.com/dtds/podcast-1.0.dtd subtitle"`
+	Summary          string       `xml:"http://www.itunes.com/dtds/podcast-1.0.dtd summary"`
+	PodcastOwner     PodcastOwner `xml:"http://www.itunes.com/dtds/podcast-1.0.dtd owner"`
+	GooglePlayAuthor string       `xml:"http://www.google.com/schemas/play-podcasts/1.0 author"`
 }

 // PodcastEntryElement represents iTunes and GooglePlay entry XML elements.
 type PodcastEntryElement struct {
-	Subtitle              string `xml:"http://www.itunes.com/dtds/podcast-1.0.dtd subtitle"`
-	Summary               string `xml:"http://www.itunes.com/dtds/podcast-1.0.dtd summary"`
-	Duration              string `xml:"http://www.itunes.com/dtds/podcast-1.0.dtd duration"`
-	GooglePlayDescription string `xml:"http://www.google.com/schemas/play-podcasts/1.0 description"`
+	ItunesAuthor          string       `xml:"http://www.itunes.com/dtds/podcast-1.0.dtd author"`
+	Subtitle              string       `xml:"http://www.itunes.com/dtds/podcast-1.0.dtd subtitle"`
+	Summary               string       `xml:"http://www.itunes.com/dtds/podcast-1.0.dtd summary"`
+	Duration              string       `xml:"http://www.itunes.com/dtds/podcast-1.0.dtd duration"`
+	PodcastOwner          PodcastOwner `xml:"http://www.itunes.com/dtds/podcast-1.0.dtd owner"`
+	GooglePlayAuthor      string       `xml:"http://www.google.com/schemas/play-podcasts/1.0 author"`
+	GooglePlayDescription string       `xml:"http://www.google.com/schemas/play-podcasts/1.0 description"`
 }

 // PodcastOwner represents contact information for the podcast owner.
@ -38,6 +41,19 @@ type PodcastOwner struct {
 	Email string `xml:"http://www.itunes.com/dtds/podcast-1.0.dtd email"`
 }

+func (p *PodcastOwner) String() string {
+	var name string
+
+	switch {
+	case p.Name != "":
+		name = p.Name
+	case p.Email != "":
+		name = p.Email
+	}
+
+	return strings.TrimSpace(name)
+}
+
 // Image represents podcast artwork.
 type Image struct {
 	URL string `xml:"href,attr"`
@ -52,10 +68,8 @@ func (e *PodcastFeedElement) PodcastAuthor() string {
 		author = e.ItunesAuthor
 	case e.GooglePlayAuthor != "":
 		author = e.GooglePlayAuthor
-	case e.PodcastOwner.Name != "":
-		author = e.PodcastOwner.Name
-	case e.PodcastOwner.Email != "":
-		author = e.PodcastOwner.Email
+	case e.PodcastOwner.String() != "":
+		author = e.PodcastOwner.String()
 	}

 	return strings.TrimSpace(author)
--- a/internal/reader/rss/rss.go
+++ b/internal/reader/rss/rss.go
@ -21,20 +21,25 @@ import (
 	"miniflux.app/v2/internal/urllib"
 )

-// Specs: https://cyber.harvard.edu/rss/rss.html
+// Specs: https://www.rssboard.org/rss-specification
 type rssFeed struct {
-	XMLName        xml.Name  `xml:"rss"`
-	Version        string    `xml:"version,attr"`
-	Title          string    `xml:"channel>title"`
-	Links          []rssLink `xml:"channel>link"`
-	ImageURL       string    `xml:"channel>image>url"`
-	Language       string    `xml:"channel>language"`
-	Description    string    `xml:"channel>description"`
-	PubDate        string    `xml:"channel>pubDate"`
-	ManagingEditor string    `xml:"channel>managingEditor"`
-	Webmaster      string    `xml:"channel>webMaster"`
-	TimeToLive     rssTTL    `xml:"channel>ttl"`
-	Items          []rssItem `xml:"channel>item"`
+	XMLName xml.Name   `xml:"rss"`
+	Version string     `xml:"rss version,attr"`
+	Channel rssChannel `xml:"rss channel"`
+}
+
+type rssChannel struct {
+	Title          string    `xml:"rss title"`
+	Link           string    `xml:"rss link"`
+	ImageURL       string    `xml:"rss image>url"`
+	Language       string    `xml:"rss language"`
+	Description    string    `xml:"rss description"`
+	PubDate        string    `xml:"rss pubDate"`
+	ManagingEditor string    `xml:"rss managingEditor"`
+	Webmaster      string    `xml:"rss webMaster"`
+	TimeToLive     rssTTL    `xml:"rss ttl"`
+	Items          []rssItem `xml:"rss item"`
+	AtomLinks
 	PodcastFeedElement
 }

@ -72,15 +77,15 @@ func (r *rssFeed) Transform(baseURL string) *model.Feed {
 		feed.FeedURL = feedURL
 	}

-	feed.Title = html.UnescapeString(strings.TrimSpace(r.Title))
+	feed.Title = html.UnescapeString(strings.TrimSpace(r.Channel.Title))
 	if feed.Title == "" {
 		feed.Title = feed.SiteURL
 	}

-	feed.IconURL = strings.TrimSpace(r.ImageURL)
-	feed.TTL = r.TimeToLive.Value()
+	feed.IconURL = strings.TrimSpace(r.Channel.ImageURL)
+	feed.TTL = r.Channel.TimeToLive.Value()

-	for _, item := range r.Items {
+	for _, item := range r.Channel.Items {
 		entry := item.Transform()
 		if entry.Author == "" {
 			entry.Author = r.feedAuthor()
@ -110,32 +115,29 @@ func (r *rssFeed) Transform(baseURL string) *model.Feed {
 }

 func (r *rssFeed) siteURL() string {
-	for _, element := range r.Links {
-		if element.XMLName.Space == "" {
-			return strings.TrimSpace(element.Data)
-		}
-	}
-
-	return ""
+	return strings.TrimSpace(r.Channel.Link)
 }

 func (r *rssFeed) feedURL() string {
-	for _, element := range r.Links {
-		if element.XMLName.Space == "http://www.w3.org/2005/Atom" {
-			return strings.TrimSpace(element.Href)
+	for _, atomLink := range r.Channel.AtomLinks.Links {
+		if atomLink.Rel == "self" {
+			return strings.TrimSpace(atomLink.URL)
 		}
 	}
-
 	return ""
 }

 func (r rssFeed) feedAuthor() string {
-	author := r.PodcastAuthor()
+	author := r.Channel.PodcastAuthor()
 	switch {
-	case r.ManagingEditor != "":
-		author = r.ManagingEditor
-	case r.Webmaster != "":
-		author = r.Webmaster
+	case r.Channel.ManagingEditor != "":
+		author = r.Channel.ManagingEditor
+	case r.Channel.Webmaster != "":
+		author = r.Channel.Webmaster
+	case r.Channel.GooglePlayAuthor != "":
+		author = r.Channel.GooglePlayAuthor
+	case r.Channel.PodcastOwner.String() != "":
+		author = r.Channel.PodcastOwner.String()
 	}
 	return sanitizer.StripTags(strings.TrimSpace(author))
 }
@ -146,27 +148,7 @@ type rssGUID struct {
 	IsPermaLink string `xml:"isPermaLink,attr"`
 }

-type rssLink struct {
-	XMLName xml.Name
-	Data    string `xml:",chardata"`
-	Href    string `xml:"href,attr"`
-	Rel     string `xml:"rel,attr"`
-}
-
-type rssCommentLink struct {
-	XMLName xml.Name
-	Data    string `xml:",chardata"`
-}
-
 type rssAuthor struct {
-	XMLName xml.Name
-	Data    string `xml:",chardata"`
-	Name    string `xml:"name"`
-	Email   string `xml:"email"`
-	Inner   string `xml:",innerxml"`
-}
-
-type rssTitle struct {
 	XMLName xml.Name
 	Data    string `xml:",chardata"`
 	Inner   string `xml:",innerxml"`
@ -193,19 +175,21 @@ func (enclosure *rssEnclosure) Size() int64 {
 }

 type rssItem struct {
-	GUID           rssGUID          `xml:"guid"`
-	Title          []rssTitle       `xml:"title"`
-	Links          []rssLink        `xml:"link"`
-	Description    string           `xml:"description"`
-	PubDate        string           `xml:"pubDate"`
-	Authors        []rssAuthor      `xml:"author"`
-	CommentLinks   []rssCommentLink `xml:"comments"`
-	EnclosureLinks []rssEnclosure   `xml:"enclosure"`
-	Categories     []rssCategory    `xml:"category"`
+	GUID           rssGUID        `xml:"rss guid"`
+	Title          string         `xml:"rss title"`
+	Link           string         `xml:"rss link"`
+	Description    string         `xml:"rss description"`
+	PubDate        string         `xml:"rss pubDate"`
+	Author         rssAuthor      `xml:"rss author"`
+	Comments       string         `xml:"rss comments"`
+	EnclosureLinks []rssEnclosure `xml:"rss enclosure"`
+	Categories     []rssCategory  `xml:"rss category"`
 	dublincore.DublinCoreItemElement
 	FeedBurnerElement
 	PodcastEntryElement
 	media.Element
+	AtomAuthor
+	AtomLinks
 }

 func (r *rssItem) Transform() *model.Entry {
@ -250,34 +234,26 @@ func (r *rssItem) entryDate() time.Time {
 }

 func (r *rssItem) entryAuthor() string {
-	author := ""
+	var author string

-	for _, rssAuthor := range r.Authors {
-		switch rssAuthor.XMLName.Space {
-		case "http://www.itunes.com/dtds/podcast-1.0.dtd", "http://www.google.com/schemas/play-podcasts/1.0":
-			author = rssAuthor.Data
-		case "http://www.w3.org/2005/Atom":
-			if rssAuthor.Name != "" {
-				author = rssAuthor.Name
-			} else if rssAuthor.Email != "" {
-				author = rssAuthor.Email
-			}
-		default:
-			if rssAuthor.Name != "" {
-				author = rssAuthor.Name
-			} else if strings.Contains(rssAuthor.Inner, "<![CDATA[") {
-				author = rssAuthor.Data
-			} else {
-				author = rssAuthor.Inner
-			}
-		}
+	switch {
+	case r.PodcastOwner.String() != "":
+		author = r.PodcastOwner.String()
+	case r.GooglePlayAuthor != "":
+		author = r.GooglePlayAuthor
+	case r.ItunesAuthor != "":
+		author = r.ItunesAuthor
+	case r.DublinCoreCreator != "":
+		author = r.DublinCoreCreator
+	case r.AtomAuthor.String() != "":
+		author = r.AtomAuthor.String()
+	case strings.Contains(r.Author.Inner, "<![CDATA["):
+		author = r.Author.Data
+	default:
+		author = r.Author.Inner
 	}

-	if author == "" {
-		author = r.GetSanitizedCreator()
-	}
-
-	return sanitizer.StripTags(strings.TrimSpace(author))
+	return strings.TrimSpace(sanitizer.StripTags(author))
 }

 func (r *rssItem) entryHash() string {
@ -291,21 +267,10 @@ func (r *rssItem) entryHash() string {
 }

 func (r *rssItem) entryTitle() string {
-	var title string
+	title := r.Title

-	for _, rssTitle := range r.Title {
-		switch rssTitle.XMLName.Space {
-		case "http://search.yahoo.com/mrss/":
-			// Ignore title in media namespace
-		case "http://purl.org/dc/elements/1.1/":
-			title = rssTitle.Data
-		default:
-			title = rssTitle.Data
-		}
-
-		if title != "" {
-			break
-		}
+	if r.DublinCoreTitle != "" {
+		title = r.DublinCoreTitle
 	}

 	return html.UnescapeString(strings.TrimSpace(title))
@ -321,17 +286,15 @@ func (r *rssItem) entryContent() string {
 }

 func (r *rssItem) entryURL() string {
-	if r.FeedBurnerLink != "" {
-		return r.FeedBurnerLink
+	for _, link := range []string{r.FeedBurnerLink, r.Link} {
+		if link != "" {
+			return strings.TrimSpace(link)
+		}
 	}

-	for _, link := range r.Links {
-		if link.XMLName.Space == "http://www.w3.org/2005/Atom" && link.Href != "" && isValidLinkRelation(link.Rel) {
-			return strings.TrimSpace(link.Href)
-		}
-
-		if link.Data != "" {
-			return strings.TrimSpace(link.Data)
+	for _, atomLink := range r.AtomLinks.Links {
+		if atomLink.URL != "" && (strings.EqualFold(atomLink.Rel, "alternate") || atomLink.Rel == "") {
+			return strings.TrimSpace(atomLink.URL)
 		}
 	}

@ -425,28 +388,10 @@ func (r *rssItem) entryCategories() []string {
 }

 func (r *rssItem) entryCommentsURL() string {
-	for _, commentLink := range r.CommentLinks {
-		if commentLink.XMLName.Space == "" {
-			commentsURL := strings.TrimSpace(commentLink.Data)
-			// The comments URL is supposed to be absolute (some feeds publishes incorrect comments URL)
-			// See https://cyber.harvard.edu/rss/rss.html#ltcommentsgtSubelementOfLtitemgt
-			if urllib.IsAbsoluteURL(commentsURL) {
-				return commentsURL
-			}
-		}
+	commentsURL := strings.TrimSpace(r.Comments)
+	if commentsURL != "" && urllib.IsAbsoluteURL(commentsURL) {
+		return commentsURL
 	}

 	return ""
 }
-
-func isValidLinkRelation(rel string) bool {
-	switch rel {
-	case "", "alternate", "enclosure", "related", "self", "via":
-		return true
-	default:
-		if strings.HasPrefix(rel, "http") {
-			return true
-		}
-		return false
-	}
-}