diff --git a/reader/rss/dublincore.go b/reader/rss/dublincore.go new file mode 100644 index 00000000..c461ece8 --- /dev/null +++ b/reader/rss/dublincore.go @@ -0,0 +1,12 @@ +// Copyright 2019 Frédéric Guillot. All rights reserved. +// Use of this source code is governed by the Apache 2.0 +// license that can be found in the LICENSE file. + +package rss // import "miniflux.app/reader/rss" + +// DublinCoreElement represents Dublin Core XML elements. +type DublinCoreElement struct { + DublinCoreDate string `xml:"http://purl.org/dc/elements/1.1/ date"` + DublinCoreCreator string `xml:"http://purl.org/dc/elements/1.1/ creator"` + DublinCoreContent string `xml:"http://purl.org/rss/1.0/modules/content/ encoded"` +} diff --git a/reader/rss/feedburner.go b/reader/rss/feedburner.go new file mode 100644 index 00000000..baa4e19e --- /dev/null +++ b/reader/rss/feedburner.go @@ -0,0 +1,11 @@ +// Copyright 2019 Frédéric Guillot. All rights reserved. +// Use of this source code is governed by the Apache 2.0 +// license that can be found in the LICENSE file. + +package rss // import "miniflux.app/reader/rss" + +// FeedBurnerElement represents FeedBurner XML elements. +type FeedBurnerElement struct { + FeedBurnerLink string `xml:"http://rssnamespace.org/feedburner/ext/1.0 origLink"` + FeedBurnerEnclosureLink string `xml:"http://rssnamespace.org/feedburner/ext/1.0 origEnclosureLink"` +} diff --git a/reader/rss/parser_test.go b/reader/rss/parser_test.go index 56ae1594..dd1d261a 100644 --- a/reader/rss/parser_test.go +++ b/reader/rss/parser_test.go @@ -230,6 +230,59 @@ func TestParseFeedURLWithAtomLink(t *testing.T) { } } +func TestParseFeedWithWebmaster(t *testing.T) { + data := ` + + + Example + https://example.org/ + webmaster@example.com + + Test + https://example.org/item + + + ` + + feed, err := Parse(bytes.NewBufferString(data)) + if err != nil { + t.Fatal(err) + } + + expected := "webmaster@example.com" + result := feed.Entries[0].Author + if result != expected { + t.Errorf("Incorrect entry author, got %q instead of %q", result, expected) + } +} + +func TestParseFeedWithManagingEditor(t *testing.T) { + data := ` + + + Example + https://example.org/ + webmaster@example.com + editor@example.com + + Test + https://example.org/item + + + ` + + feed, err := Parse(bytes.NewBufferString(data)) + if err != nil { + t.Fatal(err) + } + + expected := "editor@example.com" + result := feed.Entries[0].Author + if result != expected { + t.Errorf("Incorrect entry author, got %q instead of %q", result, expected) + } +} + func TestParseEntryWithAuthorAndInnerHTML(t *testing.T) { data := ` @@ -250,12 +303,14 @@ func TestParseEntryWithAuthorAndInnerHTML(t *testing.T) { t.Fatal(err) } - if feed.Entries[0].Author != "by Foo Bar" { - t.Errorf("Incorrect entry author, got: %s", feed.Entries[0].Author) + expected := "by Foo Bar" + result := feed.Entries[0].Author + if result != expected { + t.Errorf("Incorrect entry author, got %q instead of %q", result, expected) } } -func TestParseEntryWithAtomAuthor(t *testing.T) { +func TestParseEntryWithNonStandardAtomAuthor(t *testing.T) { data := ` @@ -280,8 +335,68 @@ func TestParseEntryWithAtomAuthor(t *testing.T) { t.Fatal(err) } - if feed.Entries[0].Author != "Foo Bar" { - t.Errorf("Incorrect entry author, got: %s", feed.Entries[0].Author) + expected := "Foo Bar" + result := feed.Entries[0].Author + if result != expected { + t.Errorf("Incorrect entry author, got %q instead of %q", result, expected) + } +} + +func TestParseEntryWithAtomAuthorEmail(t *testing.T) { + data := ` + + + Example + https://example.org/ + + + Test + https://example.org/item + + author@example.org + + + + ` + + feed, err := Parse(bytes.NewBufferString(data)) + if err != nil { + t.Fatal(err) + } + + expected := "author@example.org" + result := feed.Entries[0].Author + if result != expected { + t.Errorf("Incorrect entry author, got %q instead of %q", result, expected) + } +} + +func TestParseEntryWithAtomAuthor(t *testing.T) { + data := ` + + + Example + https://example.org/ + + + Test + https://example.org/item + + Foo Bar + + + + ` + + feed, err := Parse(bytes.NewBufferString(data)) + if err != nil { + t.Fatal(err) + } + + expected := "Foo Bar" + result := feed.Entries[0].Author + if result != expected { + t.Errorf("Incorrect entry author, got: %q instead of %q", result, expected) } } @@ -304,8 +419,10 @@ func TestParseEntryWithDublinCoreAuthor(t *testing.T) { t.Fatal(err) } - if feed.Entries[0].Author != "Me (me@example.com)" { - t.Errorf("Incorrect entry author, got: %s", feed.Entries[0].Author) + expected := "Me (me@example.com)" + result := feed.Entries[0].Author + if result != expected { + t.Errorf("Incorrect entry author, got %q instead of %q", result, expected) } } @@ -328,8 +445,10 @@ func TestParseEntryWithItunesAuthor(t *testing.T) { t.Fatal(err) } - if feed.Entries[0].Author != "Someone" { - t.Errorf("Incorrect entry author, got: %s", feed.Entries[0].Author) + expected := "Someone" + result := feed.Entries[0].Author + if result != expected { + t.Errorf("Incorrect entry author, got %q instead of %q", result, expected) } } @@ -352,8 +471,119 @@ func TestParseFeedWithItunesAuthor(t *testing.T) { t.Fatal(err) } - if feed.Entries[0].Author != "Someone" { - t.Errorf("Incorrect entry author, got: %s", feed.Entries[0].Author) + expected := "Someone" + result := feed.Entries[0].Author + if result != expected { + t.Errorf("Incorrect entry author, got %q instead of %q", result, expected) + } +} + +func TestParseFeedWithItunesOwner(t *testing.T) { + data := ` + + + Example + https://example.org/ + + John Doe + john.doe@example.com + + + Test + https://example.org/item + + + ` + + feed, err := Parse(bytes.NewBufferString(data)) + if err != nil { + t.Fatal(err) + } + + expected := "John Doe" + result := feed.Entries[0].Author + if result != expected { + t.Errorf("Incorrect entry author, got %q instead of %q", result, expected) + } +} + +func TestParseFeedWithItunesOwnerEmail(t *testing.T) { + data := ` + + + Example + https://example.org/ + + john.doe@example.com + + + Test + https://example.org/item + + + ` + + feed, err := Parse(bytes.NewBufferString(data)) + if err != nil { + t.Fatal(err) + } + + expected := "john.doe@example.com" + result := feed.Entries[0].Author + if result != expected { + t.Errorf("Incorrect entry author, got %q instead of %q", result, expected) + } +} + +func TestParseEntryWithGooglePlayAuthor(t *testing.T) { + data := ` + + + Example + https://example.org/ + + Test + https://example.org/item + Someone + + + ` + + feed, err := Parse(bytes.NewBufferString(data)) + if err != nil { + t.Fatal(err) + } + + expected := "Someone" + result := feed.Entries[0].Author + if result != expected { + t.Errorf("Incorrect entry author, got %q instead of %q", result, expected) + } +} + +func TestParseFeedWithGooglePlayAuthor(t *testing.T) { + data := ` + + + Example + https://example.org/ + Someone + + Test + https://example.org/item + + + ` + + feed, err := Parse(bytes.NewBufferString(data)) + if err != nil { + t.Fatal(err) + } + + expected := "Someone" + result := feed.Entries[0].Author + if result != expected { + t.Errorf("Incorrect entry author, got %q instead of %q", result, expected) } } @@ -794,6 +1024,7 @@ func TestParseEntryWithMediaPeerLink(t *testing.T) { if len(feed.Entries) != 1 { t.Errorf("Incorrect number of entries, got: %d", len(feed.Entries)) } + if len(feed.Entries[0].Enclosures) != 1 { t.Fatalf("Incorrect number of enclosures, got: %d", len(feed.Entries[0].Enclosures)) } @@ -820,3 +1051,100 @@ func TestParseEntryWithMediaPeerLink(t *testing.T) { } } } + +func TestEntryDescriptionFromItunesSummary(t *testing.T) { + data := ` + + + Podcast Example + http://www.example.com/index.html + + Podcast Episode + http://example.com/episode.m4a + Tue, 08 Mar 2016 12:00:00 GMT + Episode Subtitle + Episode Summary + + + ` + + feed, err := Parse(bytes.NewBufferString(data)) + if err != nil { + t.Fatal(err) + } + + if len(feed.Entries) != 1 { + t.Errorf("Incorrect number of entries, got: %d", len(feed.Entries)) + } + + expected := "Episode Summary" + result := feed.Entries[0].Content + if expected != result { + t.Errorf(`Unexpected podcast content, got %q instead of %q`, result, expected) + } +} + +func TestEntryDescriptionFromItunesSubtitle(t *testing.T) { + data := ` + + + Podcast Example + http://www.example.com/index.html + + Podcast Episode + http://example.com/episode.m4a + Tue, 08 Mar 2016 12:00:00 GMT + Episode Subtitle + + + ` + + feed, err := Parse(bytes.NewBufferString(data)) + if err != nil { + t.Fatal(err) + } + + if len(feed.Entries) != 1 { + t.Errorf("Incorrect number of entries, got: %d", len(feed.Entries)) + } + + expected := "Episode Subtitle" + result := feed.Entries[0].Content + if expected != result { + t.Errorf(`Unexpected podcast content, got %q instead of %q`, result, expected) + } +} + +func TestEntryDescriptionFromGooglePlayDescription(t *testing.T) { + data := ` + + + Podcast Example + http://www.example.com/index.html + + Podcast Episode + http://example.com/episode.m4a + Tue, 08 Mar 2016 12:00:00 GMT + Episode Subtitle + Episode Description + + + ` + + feed, err := Parse(bytes.NewBufferString(data)) + if err != nil { + t.Fatal(err) + } + + if len(feed.Entries) != 1 { + t.Errorf("Incorrect number of entries, got: %d", len(feed.Entries)) + } + + expected := "Episode Description" + result := feed.Entries[0].Content + if expected != result { + t.Errorf(`Unexpected podcast content, got %q instead of %q`, result, expected) + } +} diff --git a/reader/rss/podcast.go b/reader/rss/podcast.go new file mode 100644 index 00000000..61501ee9 --- /dev/null +++ b/reader/rss/podcast.go @@ -0,0 +1,70 @@ +// Copyright 2019 Frédéric Guillot. All rights reserved. +// Use of this source code is governed by the Apache 2.0 +// license that can be found in the LICENSE file. + +package rss // import "miniflux.app/reader/rss" + +import "strings" + +// PodcastFeedElement represents iTunes and GooglePlay feed XML elements. +// Specs: +// - https://github.com/simplepie/simplepie-ng/wiki/Spec:-iTunes-Podcast-RSS +// - https://developers.google.com/search/reference/podcast/rss-feed +type PodcastFeedElement struct { + ItunesAuthor string `xml:"http://www.itunes.com/dtds/podcast-1.0.dtd channel>author"` + Subtitle string `xml:"http://www.itunes.com/dtds/podcast-1.0.dtd channel>subtitle"` + Summary string `xml:"http://www.itunes.com/dtds/podcast-1.0.dtd channel>summary"` + PodcastOwner PodcastOwner `xml:"http://www.itunes.com/dtds/podcast-1.0.dtd channel>owner"` + GooglePlayAuthor string `xml:"http://www.google.com/schemas/play-podcasts/1.0 channel>author"` +} + +// PodcastEntryElement represents iTunes and GooglePlay entry XML elements. +type PodcastEntryElement struct { + Subtitle string `xml:"http://www.itunes.com/dtds/podcast-1.0.dtd subtitle"` + Summary string `xml:"http://www.itunes.com/dtds/podcast-1.0.dtd summary"` + GooglePlayDescription string `xml:"http://www.google.com/schemas/play-podcasts/1.0 description"` +} + +// PodcastOwner represents contact information for the podcast owner. +type PodcastOwner struct { + Name string `xml:"http://www.itunes.com/dtds/podcast-1.0.dtd name"` + Email string `xml:"http://www.itunes.com/dtds/podcast-1.0.dtd email"` +} + +// Image represents podcast artwork. +type Image struct { + URL string `xml:"href,attr"` +} + +// PodcastAuthor returns the author of the podcast. +func (e *PodcastFeedElement) PodcastAuthor() string { + author := "" + + switch { + case e.ItunesAuthor != "": + author = e.ItunesAuthor + case e.GooglePlayAuthor != "": + author = e.GooglePlayAuthor + case e.PodcastOwner.Name != "": + author = e.PodcastOwner.Name + case e.PodcastOwner.Email != "": + author = e.PodcastOwner.Email + } + + return strings.TrimSpace(author) +} + +// PodcastDescription returns the description of the podcast. +func (e *PodcastEntryElement) PodcastDescription() string { + description := "" + + switch { + case e.GooglePlayDescription != "": + description = e.GooglePlayDescription + case e.Summary != "": + description = e.Summary + case e.Subtitle != "": + description = e.Subtitle + } + return strings.TrimSpace(description) +} diff --git a/reader/rss/rss.go b/reader/rss/rss.go index fb0c5bfa..fd120cb2 100644 --- a/reader/rss/rss.go +++ b/reader/rss/rss.go @@ -20,92 +20,25 @@ import ( "miniflux.app/url" ) +// Specs: https://cyber.harvard.edu/rss/rss.html type rssFeed struct { - XMLName xml.Name `xml:"rss"` - Version string `xml:"version,attr"` - Title string `xml:"channel>title"` - Links []rssLink `xml:"channel>link"` - Language string `xml:"channel>language"` - Description string `xml:"channel>description"` - PubDate string `xml:"channel>pubDate"` - ItunesAuthor string `xml:"http://www.itunes.com/dtds/podcast-1.0.dtd channel>author"` - Items []rssItem `xml:"channel>item"` -} - -type rssLink struct { - XMLName xml.Name - Data string `xml:",chardata"` - Href string `xml:"href,attr"` - Rel string `xml:"rel,attr"` -} - -type rssCommentLink struct { - XMLName xml.Name - Data string `xml:",chardata"` -} - -type rssAuthor struct { - XMLName xml.Name - Data string `xml:",chardata"` - Name string `xml:"name"` - Inner string `xml:",innerxml"` -} - -type rssEnclosure struct { - URL string `xml:"url,attr"` - Type string `xml:"type,attr"` - Length string `xml:"length,attr"` -} - -func (enclosure *rssEnclosure) Size() int64 { - if enclosure.Length == "" { - return 0 - } - size, _ := strconv.ParseInt(enclosure.Length, 10, 0) - return size -} - -type rssItem struct { - GUID string `xml:"guid"` - Title string `xml:"title"` - Links []rssLink `xml:"link"` - OriginalLink string `xml:"http://rssnamespace.org/feedburner/ext/1.0 origLink"` - CommentLinks []rssCommentLink `xml:"comments"` - Description string `xml:"description"` - EncodedContent string `xml:"http://purl.org/rss/1.0/modules/content/ encoded"` - PubDate string `xml:"pubDate"` - Date string `xml:"http://purl.org/dc/elements/1.1/ date"` - Authors []rssAuthor `xml:"author"` - Creator string `xml:"http://purl.org/dc/elements/1.1/ creator"` - EnclosureLinks []rssEnclosure `xml:"enclosure"` - OrigEnclosureLink string `xml:"http://rssnamespace.org/feedburner/ext/1.0 origEnclosureLink"` - media.Element -} - -func (r *rssFeed) SiteURL() string { - for _, element := range r.Links { - if element.XMLName.Space == "" { - return strings.TrimSpace(element.Data) - } - } - - return "" -} - -func (r *rssFeed) FeedURL() string { - for _, element := range r.Links { - if element.XMLName.Space == "http://www.w3.org/2005/Atom" { - return strings.TrimSpace(element.Href) - } - } - - return "" + XMLName xml.Name `xml:"rss"` + Version string `xml:"version,attr"` + Title string `xml:"channel>title"` + Links []rssLink `xml:"channel>link"` + Language string `xml:"channel>language"` + Description string `xml:"channel>description"` + PubDate string `xml:"channel>pubDate"` + ManagingEditor string `xml:"channel>managingEditor"` + Webmaster string `xml:"channel>webMaster"` + Items []rssItem `xml:"channel>item"` + PodcastFeedElement } func (r *rssFeed) Transform() *model.Feed { feed := new(model.Feed) - feed.SiteURL = r.SiteURL() - feed.FeedURL = r.FeedURL() + feed.SiteURL = r.siteURL() + feed.FeedURL = r.feedURL() feed.Title = strings.TrimSpace(r.Title) if feed.Title == "" { @@ -114,11 +47,10 @@ func (r *rssFeed) Transform() *model.Feed { for _, item := range r.Items { entry := item.Transform() - - if entry.Author == "" && r.ItunesAuthor != "" { - entry.Author = r.ItunesAuthor + if entry.Author == "" { + entry.Author = r.feedAuthor() } - entry.Author = strings.TrimSpace(sanitizer.StripTags(entry.Author)) + entry.Author = sanitizer.StripTags(entry.Author) if entry.URL == "" { entry.URL = feed.SiteURL @@ -139,10 +71,103 @@ func (r *rssFeed) Transform() *model.Feed { return feed } -func (r *rssItem) PublishedDate() time.Time { +func (r *rssFeed) siteURL() string { + for _, element := range r.Links { + if element.XMLName.Space == "" { + return strings.TrimSpace(element.Data) + } + } + + return "" +} + +func (r *rssFeed) feedURL() string { + for _, element := range r.Links { + if element.XMLName.Space == "http://www.w3.org/2005/Atom" { + return strings.TrimSpace(element.Href) + } + } + + return "" +} + +func (r rssFeed) feedAuthor() string { + author := r.PodcastAuthor() + switch { + case r.ManagingEditor != "": + author = r.ManagingEditor + case r.Webmaster != "": + author = r.Webmaster + } + return strings.TrimSpace(author) +} + +type rssLink struct { + XMLName xml.Name + Data string `xml:",chardata"` + Href string `xml:"href,attr"` + Rel string `xml:"rel,attr"` +} + +type rssCommentLink struct { + XMLName xml.Name + Data string `xml:",chardata"` +} + +type rssAuthor struct { + XMLName xml.Name + Data string `xml:",chardata"` + Name string `xml:"name"` + Email string `xml:"email"` + Inner string `xml:",innerxml"` +} + +type rssEnclosure struct { + URL string `xml:"url,attr"` + Type string `xml:"type,attr"` + Length string `xml:"length,attr"` +} + +func (enclosure *rssEnclosure) Size() int64 { + if enclosure.Length == "" { + return 0 + } + size, _ := strconv.ParseInt(enclosure.Length, 10, 0) + return size +} + +type rssItem struct { + GUID string `xml:"guid"` + Title string `xml:"title"` + Links []rssLink `xml:"link"` + Description string `xml:"description"` + PubDate string `xml:"pubDate"` + Authors []rssAuthor `xml:"author"` + CommentLinks []rssCommentLink `xml:"comments"` + EnclosureLinks []rssEnclosure `xml:"enclosure"` + DublinCoreElement + FeedBurnerElement + PodcastEntryElement + media.Element +} + +func (r *rssItem) Transform() *model.Entry { + entry := new(model.Entry) + entry.URL = r.entryURL() + entry.CommentsURL = r.entryCommentsURL() + entry.Date = r.entryDate() + entry.Author = r.entryAuthor() + entry.Hash = r.entryHash() + entry.Content = r.entryContent() + entry.Title = r.entryTitle() + entry.Enclosures = r.entryEnclosures() + return entry +} + +func (r *rssItem) entryDate() time.Time { value := r.PubDate - if r.Date != "" { - value = r.Date + if r.DublinCoreDate != "" { + value = r.DublinCoreDate } if value != "" { @@ -158,22 +183,37 @@ func (r *rssItem) PublishedDate() time.Time { return time.Now() } -func (r *rssItem) Author() string { - for _, element := range r.Authors { - if element.Name != "" { - return element.Name - } +func (r *rssItem) entryAuthor() string { + author := "" - if element.Inner != "" { - return element.Inner + for _, rssAuthor := range r.Authors { + switch rssAuthor.XMLName.Space { + case "http://www.itunes.com/dtds/podcast-1.0.dtd", "http://www.google.com/schemas/play-podcasts/1.0": + author = rssAuthor.Data + case "http://www.w3.org/2005/Atom": + if rssAuthor.Name != "" { + author = rssAuthor.Name + } else if rssAuthor.Email != "" { + author = rssAuthor.Email + } + default: + if rssAuthor.Name != "" { + author = rssAuthor.Name + } else { + author = rssAuthor.Inner + } } } - return r.Creator + if author == "" { + author = r.DublinCoreCreator + } + + return strings.TrimSpace(author) } -func (r *rssItem) Hash() string { - for _, value := range []string{r.GUID, r.URL()} { +func (r *rssItem) entryHash() string { + for _, value := range []string{r.GUID, r.entryURL()} { if value != "" { return crypto.Hash(value) } @@ -182,17 +222,22 @@ func (r *rssItem) Hash() string { return "" } -func (r *rssItem) Content() string { - if r.EncodedContent != "" { - return r.EncodedContent - } - - return r.Description +func (r *rssItem) entryTitle() string { + return strings.TrimSpace(sanitizer.StripTags(r.Title)) } -func (r *rssItem) URL() string { - if r.OriginalLink != "" { - return r.OriginalLink +func (r *rssItem) entryContent() string { + for _, value := range []string{r.DublinCoreContent, r.Description, r.PodcastDescription()} { + if value != "" { + return value + } + } + return "" +} + +func (r *rssItem) entryURL() string { + if r.FeedBurnerLink != "" { + return r.FeedBurnerLink } for _, link := range r.Links { @@ -208,7 +253,7 @@ func (r *rssItem) URL() string { return "" } -func (r *rssItem) Enclosures() model.EnclosureList { +func (r *rssItem) entryEnclosures() model.EnclosureList { enclosures := make(model.EnclosureList, 0) duplicates := make(map[string]bool, 0) @@ -226,10 +271,10 @@ func (r *rssItem) Enclosures() model.EnclosureList { for _, enclosure := range r.EnclosureLinks { enclosureURL := enclosure.URL - if r.OrigEnclosureLink != "" { - filename := path.Base(r.OrigEnclosureLink) + if r.FeedBurnerEnclosureLink != "" { + filename := path.Base(r.FeedBurnerEnclosureLink) if strings.Contains(enclosureURL, filename) { - enclosureURL = r.OrigEnclosureLink + enclosureURL = r.FeedBurnerEnclosureLink } } @@ -269,7 +314,7 @@ func (r *rssItem) Enclosures() model.EnclosureList { return enclosures } -func (r *rssItem) CommentsURL() string { +func (r *rssItem) entryCommentsURL() string { for _, commentLink := range r.CommentLinks { if commentLink.XMLName.Space == "" { return strings.TrimSpace(commentLink.Data) @@ -279,19 +324,6 @@ func (r *rssItem) CommentsURL() string { return "" } -func (r *rssItem) Transform() *model.Entry { - entry := new(model.Entry) - entry.URL = r.URL() - entry.CommentsURL = r.CommentsURL() - entry.Date = r.PublishedDate() - entry.Author = r.Author() - entry.Hash = r.Hash() - entry.Content = r.Content() - entry.Title = strings.TrimSpace(r.Title) - entry.Enclosures = r.Enclosures() - return entry -} - func isValidLinkRelation(rel string) bool { switch rel { case "", "alternate", "enclosure", "related", "self", "via":