Parse podcast categories

This commit is contained in:
Frédéric Guillot 2024-03-11 22:10:47 -07:00
parent f8e50947f2
commit 6d97f8b458
3 changed files with 113 additions and 40 deletions

View file

@ -22,6 +22,17 @@ type ItunesFeedElement struct {
ItunesType string `xml:"http://www.itunes.com/dtds/podcast-1.0.dtd type"` ItunesType string `xml:"http://www.itunes.com/dtds/podcast-1.0.dtd type"`
} }
// GetItunesCategories flattens the iTunes categories of the feed into a single
// list. The text of each category is emitted first, immediately followed by the
// text of its sub-category when one is present. The result is nil when the feed
// declares no iTunes category.
func (i *ItunesFeedElement) GetItunesCategories() []string {
	var flattened []string
	for _, parent := range i.ItunesCategories {
		if child := parent.SubCategory; child == nil {
			flattened = append(flattened, parent.Text)
		} else {
			// Keep the parent/child pair adjacent in the output.
			flattened = append(flattened, parent.Text, child.Text)
		}
	}
	return flattened
}
type ItunesItemElement struct { type ItunesItemElement struct {
ItunesAuthor string `xml:"http://www.itunes.com/dtds/podcast-1.0.dtd author"` ItunesAuthor string `xml:"http://www.itunes.com/dtds/podcast-1.0.dtd author"`
ItunesEpisode string `xml:"http://www.itunes.com/dtds/podcast-1.0.dtd episode"` ItunesEpisode string `xml:"http://www.itunes.com/dtds/podcast-1.0.dtd episode"`

View file

@ -1434,18 +1434,17 @@ func TestParseEntryWithRSSDescriptionAndMediaDescription(t *testing.T) {
} }
} }
func TestParseEntryWithCategoryAndInnerHTML(t *testing.T) { func TestParseFeedWithCategories(t *testing.T) {
data := `<?xml version="1.0" encoding="utf-8"?> data := `<?xml version="1.0" encoding="utf-8"?>
<rss xmlns:atom="http://www.w3.org/2005/Atom" version="2.0"> <rss xmlns:atom="http://www.w3.org/2005/Atom" version="2.0">
<channel> <channel>
<title>Example</title> <title>Example</title>
<link>https://example.org/</link> <link>https://example.org/</link>
<atom:link href="https://example.org/rss" type="application/rss+xml" rel="self"></atom:link> <category>Category 1</category>
<category><![CDATA[Category 2]]></category>
<item> <item>
<title>Test</title> <title>Test</title>
<link>https://example.org/item</link> <link>https://example.org/item</link>
<category>Category 1</category>
<category>Category 2</category>
</item> </item>
</channel> </channel>
</rss>` </rss>`
@ -1459,27 +1458,99 @@ func TestParseEntryWithCategoryAndInnerHTML(t *testing.T) {
t.Errorf("Incorrect number of tags, got: %d", len(feed.Entries[0].Tags)) t.Errorf("Incorrect number of tags, got: %d", len(feed.Entries[0].Tags))
} }
expected := "Category 2" expected := []string{"Category 1", "Category 2"}
result := feed.Entries[0].Tags[1] result := feed.Entries[0].Tags
if result != expected {
t.Errorf("Incorrect entry category, got %q instead of %q", result, expected) for i, tag := range result {
if tag != expected[i] {
t.Errorf("Incorrect tag, got: %q", tag)
}
} }
} }
func TestParseEntryWithCategoryAndCDATA(t *testing.T) { func TestParseEntryWithCategories(t *testing.T) {
data := `<?xml version="1.0" encoding="utf-8"?> data := `<?xml version="1.0" encoding="utf-8"?>
<rss xmlns:atom="http://www.w3.org/2005/Atom" version="2.0"> <rss xmlns:atom="http://www.w3.org/2005/Atom" version="2.0">
<channel> <channel>
<title>Example</title> <title>Example</title>
<link>https://example.org/</link> <link>https://example.org/</link>
<atom:link href="https://example.org/rss" type="application/rss+xml" rel="self"></atom:link> <category>Category 3</category>
<item>
<title>Test</title>
<link>https://example.org/item</link>
<category>Category 1</category>
<category><![CDATA[Category 2]]></category>
</item>
</channel>
</rss>`
feed, err := Parse("https://example.org/", bytes.NewReader([]byte(data)))
if err != nil {
t.Fatal(err)
}
if len(feed.Entries[0].Tags) != 3 {
t.Errorf("Incorrect number of tags, got: %d", len(feed.Entries[0].Tags))
}
expected := []string{"Category 1", "Category 2", "Category 3"}
result := feed.Entries[0].Tags
for i, tag := range result {
if tag != expected[i] {
t.Errorf("Incorrect tag, got: %q", tag)
}
}
}
// TestParseFeedWithItunesCategories checks that the channel-level iTunes
// categories of the sample feed, including the nested sub-categories, are
// reported as tags of the first entry, in document order and with the XML
// entity in "Society &amp; Culture" decoded.
func TestParseFeedWithItunesCategories(t *testing.T) {
	data := `<?xml version="1.0" encoding="utf-8"?>
<rss xmlns:atom="http://www.w3.org/2005/Atom" xmlns:itunes="http://www.itunes.com/dtds/podcast-1.0.dtd" version="2.0">
<channel>
<title>Example</title>
<link>https://example.org/</link>
<itunes:category text="Society &amp; Culture">
<itunes:category text="Documentary" />
</itunes:category>
<itunes:category text="Health">
<itunes:category text="Mental Health" />
</itunes:category>
<item>
<title>Test</title>
<link>https://example.org/item</link>
</item>
</channel>
</rss>`

	feed, err := Parse("https://example.org/", bytes.NewReader([]byte(data)))
	if err != nil {
		t.Fatal(err)
	}

	// Indexing feed.Entries[0] below would panic on an empty entry list, so
	// report that case as a regular test failure instead.
	if len(feed.Entries) == 0 {
		t.Fatal("Incorrect number of entries, got: 0")
	}

	expected := []string{"Society & Culture", "Documentary", "Health", "Mental Health"}
	result := feed.Entries[0].Tags

	// A length mismatch has to stop the test: continuing would let expected[i]
	// go out of range when more tags than expected are returned.
	if len(result) != len(expected) {
		t.Fatalf("Incorrect number of tags, got: %d", len(result))
	}

	for i, tag := range result {
		if tag != expected[i] {
			t.Errorf("Incorrect tag, got %q instead of %q", tag, expected[i])
		}
	}
}
func TestParseFeedWithGooglePlayCategory(t *testing.T) {
data := `<?xml version="1.0" encoding="utf-8"?>
<rss xmlns:atom="http://www.w3.org/2005/Atom" xmlns:gplay="http://www.google.com/schemas/play-podcasts/1.0" version="2.0">
<channel>
<title>Example</title>
<link>https://example.org/</link>
<gplay:category text="Art"></gplay:category>
<item> <item>
<title>Test</title> <title>Test</title>
<link>https://example.org/item</link> <link>https://example.org/item</link>
<author>
by <![CDATA[Foo Bar]]>
</author>
<category>Sample Category</category>
</item> </item>
</channel> </channel>
</rss>` </rss>`
@ -1493,10 +1564,13 @@ func TestParseEntryWithCategoryAndCDATA(t *testing.T) {
t.Errorf("Incorrect number of tags, got: %d", len(feed.Entries[0].Tags)) t.Errorf("Incorrect number of tags, got: %d", len(feed.Entries[0].Tags))
} }
expected := "Sample Category" expected := []string{"Art"}
result := feed.Entries[0].Tags[0] result := feed.Entries[0].Tags
if result != expected {
t.Errorf("Incorrect entry category, got %q instead of %q", result, expected) for i, tag := range result {
if tag != expected[i] {
t.Errorf("Incorrect tag, got: %q", tag)
}
} }
} }

View file

@ -31,6 +31,7 @@ type rssFeed struct {
} }
type rssChannel struct { type rssChannel struct {
Categories []string `xml:"rss category"`
Title string `xml:"rss title"` Title string `xml:"rss title"`
Link string `xml:"rss link"` Link string `xml:"rss link"`
ImageURL string `xml:"rss image>url"` ImageURL string `xml:"rss image>url"`
@ -111,6 +112,13 @@ func (r *rssFeed) Transform(baseURL string) *model.Feed {
entry.Title = entry.URL entry.Title = entry.URL
} }
entry.Tags = append(entry.Tags, r.Channel.Categories...)
entry.Tags = append(entry.Tags, r.Channel.GetItunesCategories()...)
if r.Channel.GooglePlayCategory.Text != "" {
entry.Tags = append(entry.Tags, r.Channel.GooglePlayCategory.Text)
}
feed.Entries = append(feed.Entries, entry) feed.Entries = append(feed.Entries, entry)
} }
@ -165,12 +173,6 @@ type rssEnclosure struct {
Length string `xml:"length,attr"` Length string `xml:"length,attr"`
} }
// rssCategory holds one decoded XML element in two forms, its character data
// and its raw inner XML, so that the consumer can pick whichever is appropriate.
type rssCategory struct {
// XMLName receives the name of the decoded element (encoding/xml convention).
XMLName xml.Name
// Data receives the character data of the element.
Data string `xml:",chardata"`
// Inner receives the raw, unparsed XML nested inside the element.
Inner string `xml:",innerxml"`
}
func (enclosure *rssEnclosure) Size() int64 { func (enclosure *rssEnclosure) Size() int64 {
if enclosure.Length == "" { if enclosure.Length == "" {
return 0 return 0
@ -188,7 +190,7 @@ type rssItem struct {
Author rssAuthor `xml:"rss author"` Author rssAuthor `xml:"rss author"`
Comments string `xml:"rss comments"` Comments string `xml:"rss comments"`
EnclosureLinks []rssEnclosure `xml:"rss enclosure"` EnclosureLinks []rssEnclosure `xml:"rss enclosure"`
Categories []rssCategory `xml:"rss category"` Categories []string `xml:"rss category"`
dublincore.DublinCoreItemElement dublincore.DublinCoreItemElement
FeedBurnerElement FeedBurnerElement
media.Element media.Element
@ -208,7 +210,7 @@ func (r *rssItem) Transform() *model.Entry {
entry.Content = r.entryContent() entry.Content = r.entryContent()
entry.Title = r.entryTitle() entry.Title = r.entryTitle()
entry.Enclosures = r.entryEnclosures() entry.Enclosures = r.entryEnclosures()
entry.Tags = r.entryCategories() entry.Tags = r.Categories
if duration, err := normalizeDuration(r.ItunesDuration); err == nil { if duration, err := normalizeDuration(r.ItunesDuration); err == nil {
entry.ReadingTime = duration entry.ReadingTime = duration
} }
@ -383,20 +385,6 @@ func (r *rssItem) entryEnclosures() model.EnclosureList {
return enclosures return enclosures
} }
// entryCategories returns one whitespace-trimmed string per category of the
// item. When the raw inner XML of a category contains a CDATA marker, the
// character data is used; otherwise the raw inner XML is used as is. The
// returned slice is never nil, even when the item has no category.
func (r *rssItem) entryCategories() []string {
	tags := []string{}

	for i := range r.Categories {
		category := &r.Categories[i]

		// Default to the raw inner XML and switch to the character data only
		// when a CDATA section is present.
		text := category.Inner
		if strings.Contains(category.Inner, "<![CDATA[") {
			text = category.Data
		}

		tags = append(tags, strings.TrimSpace(text))
	}

	return tags
}
func (r *rssItem) entryCommentsURL() string { func (r *rssItem) entryCommentsURL() string {
commentsURL := strings.TrimSpace(r.Comments) commentsURL := strings.TrimSpace(r.Comments)
if commentsURL != "" && urllib.IsAbsoluteURL(commentsURL) { if commentsURL != "" && urllib.IsAbsoluteURL(commentsURL) {