Refactor RSS parser to use default namespace
This change avoids some limitations of the Go XML parser regarding XML namespaces.
This commit is contained in:
parent
d3a85b049b
commit
9a637ce95e
6 changed files with 185 additions and 181 deletions
|
@ -12,6 +12,7 @@ import (
|
||||||
var textLinkRegex = regexp.MustCompile(`(?mi)(\bhttps?:\/\/[-A-Z0-9+&@#\/%?=~_|!:,.;]*[-A-Z0-9+&@#\/%=~_|])`)
|
var textLinkRegex = regexp.MustCompile(`(?mi)(\bhttps?:\/\/[-A-Z0-9+&@#\/%?=~_|!:,.;]*[-A-Z0-9+&@#\/%=~_|])`)
|
||||||
|
|
||||||
// Element represents XML media elements.
|
// Element represents XML media elements.
|
||||||
|
// Specs: https://www.rssboard.org/media-rss
|
||||||
type Element struct {
|
type Element struct {
|
||||||
MediaGroups []Group `xml:"http://search.yahoo.com/mrss/ group"`
|
MediaGroups []Group `xml:"http://search.yahoo.com/mrss/ group"`
|
||||||
MediaContents []Content `xml:"http://search.yahoo.com/mrss/ content"`
|
MediaContents []Content `xml:"http://search.yahoo.com/mrss/ content"`
|
||||||
|
|
43
internal/reader/rss/atom.go
Normal file
43
internal/reader/rss/atom.go
Normal file
|
@ -0,0 +1,43 @@
|
||||||
|
// SPDX-FileCopyrightText: Copyright The Miniflux Authors. All rights reserved.
|
||||||
|
// SPDX-License-Identifier: Apache-2.0
|
||||||
|
|
||||||
|
package rss // import "miniflux.app/v2/internal/reader/rss"
|
||||||
|
|
||||||
|
import "strings"
|
||||||
|
|
||||||
|
type AtomAuthor struct {
|
||||||
|
Author AtomPerson `xml:"http://www.w3.org/2005/Atom author"`
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *AtomAuthor) String() string {
|
||||||
|
return a.Author.String()
|
||||||
|
}
|
||||||
|
|
||||||
|
type AtomPerson struct {
|
||||||
|
Name string `xml:"name"`
|
||||||
|
Email string `xml:"email"`
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *AtomPerson) String() string {
|
||||||
|
var name string
|
||||||
|
|
||||||
|
switch {
|
||||||
|
case a.Name != "":
|
||||||
|
name = a.Name
|
||||||
|
case a.Email != "":
|
||||||
|
name = a.Email
|
||||||
|
}
|
||||||
|
|
||||||
|
return strings.TrimSpace(name)
|
||||||
|
}
|
||||||
|
|
||||||
|
type AtomLink struct {
|
||||||
|
URL string `xml:"href,attr"`
|
||||||
|
Type string `xml:"type,attr"`
|
||||||
|
Rel string `xml:"rel,attr"`
|
||||||
|
Length string `xml:"length,attr"`
|
||||||
|
}
|
||||||
|
|
||||||
|
type AtomLinks struct {
|
||||||
|
Links []*AtomLink `xml:"http://www.w3.org/2005/Atom link"`
|
||||||
|
}
|
|
@ -14,7 +14,9 @@ import (
|
||||||
// Parse returns a normalized feed struct from a RSS feed.
|
// Parse returns a normalized feed struct from a RSS feed.
|
||||||
func Parse(baseURL string, data io.ReadSeeker) (*model.Feed, error) {
|
func Parse(baseURL string, data io.ReadSeeker) (*model.Feed, error) {
|
||||||
feed := new(rssFeed)
|
feed := new(rssFeed)
|
||||||
if err := xml.NewXMLDecoder(data).Decode(feed); err != nil {
|
decoder := xml.NewXMLDecoder(data)
|
||||||
|
decoder.DefaultSpace = "rss"
|
||||||
|
if err := decoder.Decode(feed); err != nil {
|
||||||
return nil, fmt.Errorf("rss: unable to parse feed: %w", err)
|
return nil, fmt.Errorf("rss: unable to parse feed: %w", err)
|
||||||
}
|
}
|
||||||
return feed.Transform(baseURL), nil
|
return feed.Transform(baseURL), nil
|
||||||
|
|
|
@ -300,7 +300,7 @@ func TestParseEntryWithMultipleAtomLinks(t *testing.T) {
|
||||||
<item>
|
<item>
|
||||||
<title>Test</title>
|
<title>Test</title>
|
||||||
<atom:link rel="payment" href="https://example.org/a" />
|
<atom:link rel="payment" href="https://example.org/a" />
|
||||||
<atom:link rel="http://foobar.tld" href="https://example.org/b" />
|
<atom:link rel="alternate" href="https://example.org/b" />
|
||||||
</item>
|
</item>
|
||||||
</channel>
|
</channel>
|
||||||
</rss>`
|
</rss>`
|
||||||
|
@ -430,7 +430,7 @@ func TestParseEntryWithAuthorAndCDATA(t *testing.T) {
|
||||||
<title>Test</title>
|
<title>Test</title>
|
||||||
<link>https://example.org/item</link>
|
<link>https://example.org/item</link>
|
||||||
<author>
|
<author>
|
||||||
by <![CDATA[Foo Bar]]>
|
<![CDATA[by Foo Bar]]>
|
||||||
</author>
|
</author>
|
||||||
</item>
|
</item>
|
||||||
</channel>
|
</channel>
|
||||||
|
@ -447,38 +447,6 @@ func TestParseEntryWithAuthorAndCDATA(t *testing.T) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func TestParseEntryWithNonStandardAtomAuthor(t *testing.T) {
|
|
||||||
data := `<?xml version="1.0" encoding="utf-8"?>
|
|
||||||
<rss xmlns:atom="http://www.w3.org/2005/Atom" version="2.0">
|
|
||||||
<channel>
|
|
||||||
<title>Example</title>
|
|
||||||
<link>https://example.org/</link>
|
|
||||||
<atom:link href="https://example.org/rss" type="application/rss+xml" rel="self"></atom:link>
|
|
||||||
<item>
|
|
||||||
<title>Test</title>
|
|
||||||
<link>https://example.org/item</link>
|
|
||||||
<author xmlns:author="http://www.w3.org/2005/Atom">
|
|
||||||
<name>Foo Bar</name>
|
|
||||||
<title>Vice President</title>
|
|
||||||
<department/>
|
|
||||||
<company>FooBar Inc.</company>
|
|
||||||
</author>
|
|
||||||
</item>
|
|
||||||
</channel>
|
|
||||||
</rss>`
|
|
||||||
|
|
||||||
feed, err := Parse("https://example.org/", bytes.NewReader([]byte(data)))
|
|
||||||
if err != nil {
|
|
||||||
t.Fatal(err)
|
|
||||||
}
|
|
||||||
|
|
||||||
expected := "Foo Bar"
|
|
||||||
result := feed.Entries[0].Author
|
|
||||||
if result != expected {
|
|
||||||
t.Errorf("Incorrect entry author, got %q instead of %q", result, expected)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
func TestParseEntryWithAtomAuthorEmail(t *testing.T) {
|
func TestParseEntryWithAtomAuthorEmail(t *testing.T) {
|
||||||
data := `<?xml version="1.0" encoding="utf-8"?>
|
data := `<?xml version="1.0" encoding="utf-8"?>
|
||||||
<rss xmlns:atom="http://www.w3.org/2005/Atom" version="2.0">
|
<rss xmlns:atom="http://www.w3.org/2005/Atom" version="2.0">
|
||||||
|
@ -508,7 +476,7 @@ func TestParseEntryWithAtomAuthorEmail(t *testing.T) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func TestParseEntryWithAtomAuthor(t *testing.T) {
|
func TestParseEntryWithAtomAuthorName(t *testing.T) {
|
||||||
data := `<?xml version="1.0" encoding="utf-8"?>
|
data := `<?xml version="1.0" encoding="utf-8"?>
|
||||||
<rss xmlns:atom="http://www.w3.org/2005/Atom" version="2.0">
|
<rss xmlns:atom="http://www.w3.org/2005/Atom" version="2.0">
|
||||||
<channel>
|
<channel>
|
||||||
|
@ -1435,6 +1403,37 @@ func TestEntryDescriptionFromGooglePlayDescription(t *testing.T) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestParseEntryWithRSSDescriptionAndMediaDescription(t *testing.T) {
|
||||||
|
data := `<?xml version="1.0" encoding="UTF-8"?>
|
||||||
|
<rss version="2.0" xmlns:media="http://search.yahoo.com/mrss/">
|
||||||
|
<channel>
|
||||||
|
<title>Podcast Example</title>
|
||||||
|
<link>http://www.example.com/index.html</link>
|
||||||
|
<item>
|
||||||
|
<title>Entry Title</title>
|
||||||
|
<link>http://www.example.com/entries/1</link>
|
||||||
|
<description>Entry Description</description>
|
||||||
|
<media:description type="plain">Media Description</media:description>
|
||||||
|
</item>
|
||||||
|
</channel>
|
||||||
|
</rss>`
|
||||||
|
|
||||||
|
feed, err := Parse("https://example.org/", bytes.NewReader([]byte(data)))
|
||||||
|
if err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
|
||||||
|
if len(feed.Entries) != 1 {
|
||||||
|
t.Errorf("Incorrect number of entries, got: %d", len(feed.Entries))
|
||||||
|
}
|
||||||
|
|
||||||
|
expected := "Entry Description"
|
||||||
|
result := feed.Entries[0].Content
|
||||||
|
if expected != result {
|
||||||
|
t.Errorf(`Unexpected description, got %q instead of %q`, result, expected)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func TestParseEntryWithCategoryAndInnerHTML(t *testing.T) {
|
func TestParseEntryWithCategoryAndInnerHTML(t *testing.T) {
|
||||||
data := `<?xml version="1.0" encoding="utf-8"?>
|
data := `<?xml version="1.0" encoding="utf-8"?>
|
||||||
<rss xmlns:atom="http://www.w3.org/2005/Atom" version="2.0">
|
<rss xmlns:atom="http://www.w3.org/2005/Atom" version="2.0">
|
||||||
|
|
|
@ -15,20 +15,23 @@ var ErrInvalidDurationFormat = errors.New("rss: invalid duration format")
|
||||||
// PodcastFeedElement represents iTunes and GooglePlay feed XML elements.
|
// PodcastFeedElement represents iTunes and GooglePlay feed XML elements.
|
||||||
// Specs:
|
// Specs:
|
||||||
// - https://github.com/simplepie/simplepie-ng/wiki/Spec:-iTunes-Podcast-RSS
|
// - https://github.com/simplepie/simplepie-ng/wiki/Spec:-iTunes-Podcast-RSS
|
||||||
// - https://developers.google.com/search/reference/podcast/rss-feed
|
// - https://support.google.com/podcast-publishers/answer/9889544
|
||||||
type PodcastFeedElement struct {
|
type PodcastFeedElement struct {
|
||||||
ItunesAuthor string `xml:"http://www.itunes.com/dtds/podcast-1.0.dtd channel>author"`
|
ItunesAuthor string `xml:"http://www.itunes.com/dtds/podcast-1.0.dtd author"`
|
||||||
Subtitle string `xml:"http://www.itunes.com/dtds/podcast-1.0.dtd channel>subtitle"`
|
Subtitle string `xml:"http://www.itunes.com/dtds/podcast-1.0.dtd subtitle"`
|
||||||
Summary string `xml:"http://www.itunes.com/dtds/podcast-1.0.dtd channel>summary"`
|
Summary string `xml:"http://www.itunes.com/dtds/podcast-1.0.dtd summary"`
|
||||||
PodcastOwner PodcastOwner `xml:"http://www.itunes.com/dtds/podcast-1.0.dtd channel>owner"`
|
PodcastOwner PodcastOwner `xml:"http://www.itunes.com/dtds/podcast-1.0.dtd owner"`
|
||||||
GooglePlayAuthor string `xml:"http://www.google.com/schemas/play-podcasts/1.0 channel>author"`
|
GooglePlayAuthor string `xml:"http://www.google.com/schemas/play-podcasts/1.0 author"`
|
||||||
}
|
}
|
||||||
|
|
||||||
// PodcastEntryElement represents iTunes and GooglePlay entry XML elements.
|
// PodcastEntryElement represents iTunes and GooglePlay entry XML elements.
|
||||||
type PodcastEntryElement struct {
|
type PodcastEntryElement struct {
|
||||||
|
ItunesAuthor string `xml:"http://www.itunes.com/dtds/podcast-1.0.dtd author"`
|
||||||
Subtitle string `xml:"http://www.itunes.com/dtds/podcast-1.0.dtd subtitle"`
|
Subtitle string `xml:"http://www.itunes.com/dtds/podcast-1.0.dtd subtitle"`
|
||||||
Summary string `xml:"http://www.itunes.com/dtds/podcast-1.0.dtd summary"`
|
Summary string `xml:"http://www.itunes.com/dtds/podcast-1.0.dtd summary"`
|
||||||
Duration string `xml:"http://www.itunes.com/dtds/podcast-1.0.dtd duration"`
|
Duration string `xml:"http://www.itunes.com/dtds/podcast-1.0.dtd duration"`
|
||||||
|
PodcastOwner PodcastOwner `xml:"http://www.itunes.com/dtds/podcast-1.0.dtd owner"`
|
||||||
|
GooglePlayAuthor string `xml:"http://www.google.com/schemas/play-podcasts/1.0 author"`
|
||||||
GooglePlayDescription string `xml:"http://www.google.com/schemas/play-podcasts/1.0 description"`
|
GooglePlayDescription string `xml:"http://www.google.com/schemas/play-podcasts/1.0 description"`
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -38,6 +41,19 @@ type PodcastOwner struct {
|
||||||
Email string `xml:"http://www.itunes.com/dtds/podcast-1.0.dtd email"`
|
Email string `xml:"http://www.itunes.com/dtds/podcast-1.0.dtd email"`
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (p *PodcastOwner) String() string {
|
||||||
|
var name string
|
||||||
|
|
||||||
|
switch {
|
||||||
|
case p.Name != "":
|
||||||
|
name = p.Name
|
||||||
|
case p.Email != "":
|
||||||
|
name = p.Email
|
||||||
|
}
|
||||||
|
|
||||||
|
return strings.TrimSpace(name)
|
||||||
|
}
|
||||||
|
|
||||||
// Image represents podcast artwork.
|
// Image represents podcast artwork.
|
||||||
type Image struct {
|
type Image struct {
|
||||||
URL string `xml:"href,attr"`
|
URL string `xml:"href,attr"`
|
||||||
|
@ -52,10 +68,8 @@ func (e *PodcastFeedElement) PodcastAuthor() string {
|
||||||
author = e.ItunesAuthor
|
author = e.ItunesAuthor
|
||||||
case e.GooglePlayAuthor != "":
|
case e.GooglePlayAuthor != "":
|
||||||
author = e.GooglePlayAuthor
|
author = e.GooglePlayAuthor
|
||||||
case e.PodcastOwner.Name != "":
|
case e.PodcastOwner.String() != "":
|
||||||
author = e.PodcastOwner.Name
|
author = e.PodcastOwner.String()
|
||||||
case e.PodcastOwner.Email != "":
|
|
||||||
author = e.PodcastOwner.Email
|
|
||||||
}
|
}
|
||||||
|
|
||||||
return strings.TrimSpace(author)
|
return strings.TrimSpace(author)
|
||||||
|
|
|
@ -21,20 +21,25 @@ import (
|
||||||
"miniflux.app/v2/internal/urllib"
|
"miniflux.app/v2/internal/urllib"
|
||||||
)
|
)
|
||||||
|
|
||||||
// Specs: https://cyber.harvard.edu/rss/rss.html
|
// Specs: https://www.rssboard.org/rss-specification
|
||||||
type rssFeed struct {
|
type rssFeed struct {
|
||||||
XMLName xml.Name `xml:"rss"`
|
XMLName xml.Name `xml:"rss"`
|
||||||
Version string `xml:"version,attr"`
|
Version string `xml:"rss version,attr"`
|
||||||
Title string `xml:"channel>title"`
|
Channel rssChannel `xml:"rss channel"`
|
||||||
Links []rssLink `xml:"channel>link"`
|
}
|
||||||
ImageURL string `xml:"channel>image>url"`
|
|
||||||
Language string `xml:"channel>language"`
|
type rssChannel struct {
|
||||||
Description string `xml:"channel>description"`
|
Title string `xml:"rss title"`
|
||||||
PubDate string `xml:"channel>pubDate"`
|
Link string `xml:"rss link"`
|
||||||
ManagingEditor string `xml:"channel>managingEditor"`
|
ImageURL string `xml:"rss image>url"`
|
||||||
Webmaster string `xml:"channel>webMaster"`
|
Language string `xml:"rss language"`
|
||||||
TimeToLive rssTTL `xml:"channel>ttl"`
|
Description string `xml:"rss description"`
|
||||||
Items []rssItem `xml:"channel>item"`
|
PubDate string `xml:"rss pubDate"`
|
||||||
|
ManagingEditor string `xml:"rss managingEditor"`
|
||||||
|
Webmaster string `xml:"rss webMaster"`
|
||||||
|
TimeToLive rssTTL `xml:"rss ttl"`
|
||||||
|
Items []rssItem `xml:"rss item"`
|
||||||
|
AtomLinks
|
||||||
PodcastFeedElement
|
PodcastFeedElement
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -72,15 +77,15 @@ func (r *rssFeed) Transform(baseURL string) *model.Feed {
|
||||||
feed.FeedURL = feedURL
|
feed.FeedURL = feedURL
|
||||||
}
|
}
|
||||||
|
|
||||||
feed.Title = html.UnescapeString(strings.TrimSpace(r.Title))
|
feed.Title = html.UnescapeString(strings.TrimSpace(r.Channel.Title))
|
||||||
if feed.Title == "" {
|
if feed.Title == "" {
|
||||||
feed.Title = feed.SiteURL
|
feed.Title = feed.SiteURL
|
||||||
}
|
}
|
||||||
|
|
||||||
feed.IconURL = strings.TrimSpace(r.ImageURL)
|
feed.IconURL = strings.TrimSpace(r.Channel.ImageURL)
|
||||||
feed.TTL = r.TimeToLive.Value()
|
feed.TTL = r.Channel.TimeToLive.Value()
|
||||||
|
|
||||||
for _, item := range r.Items {
|
for _, item := range r.Channel.Items {
|
||||||
entry := item.Transform()
|
entry := item.Transform()
|
||||||
if entry.Author == "" {
|
if entry.Author == "" {
|
||||||
entry.Author = r.feedAuthor()
|
entry.Author = r.feedAuthor()
|
||||||
|
@ -110,32 +115,29 @@ func (r *rssFeed) Transform(baseURL string) *model.Feed {
|
||||||
}
|
}
|
||||||
|
|
||||||
func (r *rssFeed) siteURL() string {
|
func (r *rssFeed) siteURL() string {
|
||||||
for _, element := range r.Links {
|
return strings.TrimSpace(r.Channel.Link)
|
||||||
if element.XMLName.Space == "" {
|
|
||||||
return strings.TrimSpace(element.Data)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return ""
|
|
||||||
}
|
}
|
||||||
|
|
||||||
func (r *rssFeed) feedURL() string {
|
func (r *rssFeed) feedURL() string {
|
||||||
for _, element := range r.Links {
|
for _, atomLink := range r.Channel.AtomLinks.Links {
|
||||||
if element.XMLName.Space == "http://www.w3.org/2005/Atom" {
|
if atomLink.Rel == "self" {
|
||||||
return strings.TrimSpace(element.Href)
|
return strings.TrimSpace(atomLink.URL)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return ""
|
return ""
|
||||||
}
|
}
|
||||||
|
|
||||||
func (r rssFeed) feedAuthor() string {
|
func (r rssFeed) feedAuthor() string {
|
||||||
author := r.PodcastAuthor()
|
author := r.Channel.PodcastAuthor()
|
||||||
switch {
|
switch {
|
||||||
case r.ManagingEditor != "":
|
case r.Channel.ManagingEditor != "":
|
||||||
author = r.ManagingEditor
|
author = r.Channel.ManagingEditor
|
||||||
case r.Webmaster != "":
|
case r.Channel.Webmaster != "":
|
||||||
author = r.Webmaster
|
author = r.Channel.Webmaster
|
||||||
|
case r.Channel.GooglePlayAuthor != "":
|
||||||
|
author = r.Channel.GooglePlayAuthor
|
||||||
|
case r.Channel.PodcastOwner.String() != "":
|
||||||
|
author = r.Channel.PodcastOwner.String()
|
||||||
}
|
}
|
||||||
return sanitizer.StripTags(strings.TrimSpace(author))
|
return sanitizer.StripTags(strings.TrimSpace(author))
|
||||||
}
|
}
|
||||||
|
@ -146,27 +148,7 @@ type rssGUID struct {
|
||||||
IsPermaLink string `xml:"isPermaLink,attr"`
|
IsPermaLink string `xml:"isPermaLink,attr"`
|
||||||
}
|
}
|
||||||
|
|
||||||
type rssLink struct {
|
|
||||||
XMLName xml.Name
|
|
||||||
Data string `xml:",chardata"`
|
|
||||||
Href string `xml:"href,attr"`
|
|
||||||
Rel string `xml:"rel,attr"`
|
|
||||||
}
|
|
||||||
|
|
||||||
type rssCommentLink struct {
|
|
||||||
XMLName xml.Name
|
|
||||||
Data string `xml:",chardata"`
|
|
||||||
}
|
|
||||||
|
|
||||||
type rssAuthor struct {
|
type rssAuthor struct {
|
||||||
XMLName xml.Name
|
|
||||||
Data string `xml:",chardata"`
|
|
||||||
Name string `xml:"name"`
|
|
||||||
Email string `xml:"email"`
|
|
||||||
Inner string `xml:",innerxml"`
|
|
||||||
}
|
|
||||||
|
|
||||||
type rssTitle struct {
|
|
||||||
XMLName xml.Name
|
XMLName xml.Name
|
||||||
Data string `xml:",chardata"`
|
Data string `xml:",chardata"`
|
||||||
Inner string `xml:",innerxml"`
|
Inner string `xml:",innerxml"`
|
||||||
|
@ -193,19 +175,21 @@ func (enclosure *rssEnclosure) Size() int64 {
|
||||||
}
|
}
|
||||||
|
|
||||||
type rssItem struct {
|
type rssItem struct {
|
||||||
GUID rssGUID `xml:"guid"`
|
GUID rssGUID `xml:"rss guid"`
|
||||||
Title []rssTitle `xml:"title"`
|
Title string `xml:"rss title"`
|
||||||
Links []rssLink `xml:"link"`
|
Link string `xml:"rss link"`
|
||||||
Description string `xml:"description"`
|
Description string `xml:"rss description"`
|
||||||
PubDate string `xml:"pubDate"`
|
PubDate string `xml:"rss pubDate"`
|
||||||
Authors []rssAuthor `xml:"author"`
|
Author rssAuthor `xml:"rss author"`
|
||||||
CommentLinks []rssCommentLink `xml:"comments"`
|
Comments string `xml:"rss comments"`
|
||||||
EnclosureLinks []rssEnclosure `xml:"enclosure"`
|
EnclosureLinks []rssEnclosure `xml:"rss enclosure"`
|
||||||
Categories []rssCategory `xml:"category"`
|
Categories []rssCategory `xml:"rss category"`
|
||||||
dublincore.DublinCoreItemElement
|
dublincore.DublinCoreItemElement
|
||||||
FeedBurnerElement
|
FeedBurnerElement
|
||||||
PodcastEntryElement
|
PodcastEntryElement
|
||||||
media.Element
|
media.Element
|
||||||
|
AtomAuthor
|
||||||
|
AtomLinks
|
||||||
}
|
}
|
||||||
|
|
||||||
func (r *rssItem) Transform() *model.Entry {
|
func (r *rssItem) Transform() *model.Entry {
|
||||||
|
@ -250,34 +234,26 @@ func (r *rssItem) entryDate() time.Time {
|
||||||
}
|
}
|
||||||
|
|
||||||
func (r *rssItem) entryAuthor() string {
|
func (r *rssItem) entryAuthor() string {
|
||||||
author := ""
|
var author string
|
||||||
|
|
||||||
for _, rssAuthor := range r.Authors {
|
switch {
|
||||||
switch rssAuthor.XMLName.Space {
|
case r.PodcastOwner.String() != "":
|
||||||
case "http://www.itunes.com/dtds/podcast-1.0.dtd", "http://www.google.com/schemas/play-podcasts/1.0":
|
author = r.PodcastOwner.String()
|
||||||
author = rssAuthor.Data
|
case r.GooglePlayAuthor != "":
|
||||||
case "http://www.w3.org/2005/Atom":
|
author = r.GooglePlayAuthor
|
||||||
if rssAuthor.Name != "" {
|
case r.ItunesAuthor != "":
|
||||||
author = rssAuthor.Name
|
author = r.ItunesAuthor
|
||||||
} else if rssAuthor.Email != "" {
|
case r.DublinCoreCreator != "":
|
||||||
author = rssAuthor.Email
|
author = r.DublinCoreCreator
|
||||||
}
|
case r.AtomAuthor.String() != "":
|
||||||
|
author = r.AtomAuthor.String()
|
||||||
|
case strings.Contains(r.Author.Inner, "<![CDATA["):
|
||||||
|
author = r.Author.Data
|
||||||
default:
|
default:
|
||||||
if rssAuthor.Name != "" {
|
author = r.Author.Inner
|
||||||
author = rssAuthor.Name
|
|
||||||
} else if strings.Contains(rssAuthor.Inner, "<![CDATA[") {
|
|
||||||
author = rssAuthor.Data
|
|
||||||
} else {
|
|
||||||
author = rssAuthor.Inner
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if author == "" {
|
return strings.TrimSpace(sanitizer.StripTags(author))
|
||||||
author = r.GetSanitizedCreator()
|
|
||||||
}
|
|
||||||
|
|
||||||
return sanitizer.StripTags(strings.TrimSpace(author))
|
|
||||||
}
|
}
|
||||||
|
|
||||||
func (r *rssItem) entryHash() string {
|
func (r *rssItem) entryHash() string {
|
||||||
|
@ -291,21 +267,10 @@ func (r *rssItem) entryHash() string {
|
||||||
}
|
}
|
||||||
|
|
||||||
func (r *rssItem) entryTitle() string {
|
func (r *rssItem) entryTitle() string {
|
||||||
var title string
|
title := r.Title
|
||||||
|
|
||||||
for _, rssTitle := range r.Title {
|
if r.DublinCoreTitle != "" {
|
||||||
switch rssTitle.XMLName.Space {
|
title = r.DublinCoreTitle
|
||||||
case "http://search.yahoo.com/mrss/":
|
|
||||||
// Ignore title in media namespace
|
|
||||||
case "http://purl.org/dc/elements/1.1/":
|
|
||||||
title = rssTitle.Data
|
|
||||||
default:
|
|
||||||
title = rssTitle.Data
|
|
||||||
}
|
|
||||||
|
|
||||||
if title != "" {
|
|
||||||
break
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
return html.UnescapeString(strings.TrimSpace(title))
|
return html.UnescapeString(strings.TrimSpace(title))
|
||||||
|
@ -321,17 +286,15 @@ func (r *rssItem) entryContent() string {
|
||||||
}
|
}
|
||||||
|
|
||||||
func (r *rssItem) entryURL() string {
|
func (r *rssItem) entryURL() string {
|
||||||
if r.FeedBurnerLink != "" {
|
for _, link := range []string{r.FeedBurnerLink, r.Link} {
|
||||||
return r.FeedBurnerLink
|
if link != "" {
|
||||||
|
return strings.TrimSpace(link)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
for _, link := range r.Links {
|
for _, atomLink := range r.AtomLinks.Links {
|
||||||
if link.XMLName.Space == "http://www.w3.org/2005/Atom" && link.Href != "" && isValidLinkRelation(link.Rel) {
|
if atomLink.URL != "" && (strings.EqualFold(atomLink.Rel, "alternate") || atomLink.Rel == "") {
|
||||||
return strings.TrimSpace(link.Href)
|
return strings.TrimSpace(atomLink.URL)
|
||||||
}
|
|
||||||
|
|
||||||
if link.Data != "" {
|
|
||||||
return strings.TrimSpace(link.Data)
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -425,28 +388,10 @@ func (r *rssItem) entryCategories() []string {
|
||||||
}
|
}
|
||||||
|
|
||||||
func (r *rssItem) entryCommentsURL() string {
|
func (r *rssItem) entryCommentsURL() string {
|
||||||
for _, commentLink := range r.CommentLinks {
|
commentsURL := strings.TrimSpace(r.Comments)
|
||||||
if commentLink.XMLName.Space == "" {
|
if commentsURL != "" && urllib.IsAbsoluteURL(commentsURL) {
|
||||||
commentsURL := strings.TrimSpace(commentLink.Data)
|
|
||||||
// The comments URL is supposed to be absolute (some feeds publishes incorrect comments URL)
|
|
||||||
// See https://cyber.harvard.edu/rss/rss.html#ltcommentsgtSubelementOfLtitemgt
|
|
||||||
if urllib.IsAbsoluteURL(commentsURL) {
|
|
||||||
return commentsURL
|
return commentsURL
|
||||||
}
|
}
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return ""
|
return ""
|
||||||
}
|
}
|
||||||
|
|
||||||
func isValidLinkRelation(rel string) bool {
|
|
||||||
switch rel {
|
|
||||||
case "", "alternate", "enclosure", "related", "self", "via":
|
|
||||||
return true
|
|
||||||
default:
|
|
||||||
if strings.HasPrefix(rel, "http") {
|
|
||||||
return true
|
|
||||||
}
|
|
||||||
return false
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
Loading…
Reference in a new issue