Handle RDF feeds with duplicated <title> elements

This commit is contained in:
Frédéric Guillot 2024-02-23 17:15:22 -08:00
parent 20e5fbcd7a
commit c595c80356
3 changed files with 117 additions and 7 deletions

View file

@ -20,6 +20,7 @@ func (feed *DublinCoreFeedElement) GetSanitizedCreator() string {
// DublinCoreItemElement represents Dublin Core entry XML elements. // DublinCoreItemElement represents Dublin Core entry XML elements.
type DublinCoreItemElement struct { type DublinCoreItemElement struct {
DublinCoreTitle string `xml:"http://purl.org/dc/elements/1.1/ title"`
DublinCoreDate string `xml:"http://purl.org/dc/elements/1.1/ date"` DublinCoreDate string `xml:"http://purl.org/dc/elements/1.1/ date"`
DublinCoreCreator string `xml:"http://purl.org/dc/elements/1.1/ creator"` DublinCoreCreator string `xml:"http://purl.org/dc/elements/1.1/ creator"`
DublinCoreContent string `xml:"http://purl.org/rss/1.0/modules/content/ encoded"` DublinCoreContent string `xml:"http://purl.org/rss/1.0/modules/content/ encoded"`

View file

@ -406,7 +406,7 @@ func TestParseItemWithoutDate(t *testing.T) {
func TestParseItemWithEncodedHTMLTitle(t *testing.T) { func TestParseItemWithEncodedHTMLTitle(t *testing.T) {
data := `<?xml version="1.0" encoding="utf-8"?> data := `<?xml version="1.0" encoding="utf-8"?>
<rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"> <rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns="http://purl.org/rss/1.0/">
<channel> <channel>
<title>Example</title> <title>Example</title>
<link>http://example.org</link> <link>http://example.org</link>
@ -425,7 +425,7 @@ func TestParseItemWithEncodedHTMLTitle(t *testing.T) {
} }
if feed.Entries[0].Title != `AT&T` { if feed.Entries[0].Title != `AT&T` {
t.Errorf("Incorrect entry title, got: %v", feed.Entries[0].Title) t.Errorf("Incorrect entry title, got: %q", feed.Entries[0].Title)
} }
} }
@ -502,7 +502,7 @@ func TestParseFeedWithURLWrappedInSpaces(t *testing.T) {
<item rdf:about="http://biorxiv.org/cgi/content/short/857789v1?rss=1"> <item rdf:about="http://biorxiv.org/cgi/content/short/857789v1?rss=1">
<title> <title>
<![CDATA[ <![CDATA[
Microscale Collagen and Fibroblast Interactions Enhance Primary Human Hepatocyte Functions in 3-Dimensional Models Microscale Collagen and Fibroblast Interactions Enhance Primary Human Hepatocyte Functions in 3-Dimensional Models
]]> ]]>
</title> </title>
<link> <link>
@ -568,7 +568,7 @@ func TestParseRDFWithContentEncoded(t *testing.T) {
expected := `<p>Test</p>` expected := `<p>Test</p>`
result := feed.Entries[0].Content result := feed.Entries[0].Content
if result != expected { if result != expected {
t.Errorf(`Unexpected entry URL, got %q instead of %q`, result, expected) t.Errorf(`Unexpected entry content, got %q instead of %q`, result, expected)
} }
} }
@ -601,6 +601,105 @@ func TestParseRDFWithEncodedHTMLDescription(t *testing.T) {
expected := `AT&amp;T <img src="https://example.org/img.png"></a>` expected := `AT&amp;T <img src="https://example.org/img.png"></a>`
result := feed.Entries[0].Content result := feed.Entries[0].Content
if result != expected { if result != expected {
t.Errorf(`Unexpected entry URL, got %v instead of %v`, result, expected) t.Errorf(`Unexpected entry content, got %v instead of %v`, result, expected)
}
}
// TestParseRDFItemWithDuplicateTitleElement parses an RDF document whose
// single item carries both an RSS 1.0 <title> element and an empty Dublin
// Core <dc:title/> element, and expects the entry title to be the text of
// the <title> element.
func TestParseRDFItemWithDuplicateTitleElement(t *testing.T) {
	const wantTitle = `Item Title`

	rdfDocument := `<?xml version="1.0" encoding="utf-8"?>
<rdf:RDF
xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
xmlns="http://purl.org/rss/1.0/"
xmlns:dc="http://purl.org/dc/elements/1.1/">
<channel>
<title>Example Feed</title>
<link>http://example.org/</link>
</channel>
<item>
<title>Item Title</title>
<dc:title/>
<link>http://example.org/</link>
<description>Test</description>
</item>
</rdf:RDF>`

	parsedFeed, parseErr := Parse("http://example.org/", bytes.NewBufferString(rdfDocument))
	if parseErr != nil {
		t.Fatal(parseErr)
	}

	// Stop here when the entry count is off: the title check below indexes
	// the first entry.
	if entryCount := len(parsedFeed.Entries); entryCount != 1 {
		t.Fatalf(`Unexpected number of entries, got %d`, entryCount)
	}

	if gotTitle := parsedFeed.Entries[0].Title; gotTitle != wantTitle {
		t.Errorf(`Unexpected entry title, got %q instead of %q`, gotTitle, wantTitle)
	}
}
// TestParseRDFItemWithDublinCoreTitleElement parses an RDF document whose
// single item has no RSS 1.0 <title> element but does have a Dublin Core
// <dc:title> element, and expects the entry title to be the <dc:title> text.
func TestParseRDFItemWithDublinCoreTitleElement(t *testing.T) {
	const wantTitle = `Dublin Core Title`

	rdfDocument := `<?xml version="1.0" encoding="utf-8"?>
<rdf:RDF
xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
xmlns="http://purl.org/rss/1.0/"
xmlns:dc="http://purl.org/dc/elements/1.1/">
<channel>
<title>Example Feed</title>
<link>http://example.org/</link>
</channel>
<item>
<dc:title>Dublin Core Title</dc:title>
<link>http://example.org/</link>
<description>Test</description>
</item>
</rdf:RDF>`

	parsedFeed, parseErr := Parse("http://example.org/", bytes.NewBufferString(rdfDocument))
	if parseErr != nil {
		t.Fatal(parseErr)
	}

	// Stop here when the entry count is off: the title check below indexes
	// the first entry.
	if entryCount := len(parsedFeed.Entries); entryCount != 1 {
		t.Fatalf(`Unexpected number of entries, got %d`, entryCount)
	}

	if gotTitle := parsedFeed.Entries[0].Title; gotTitle != wantTitle {
		t.Errorf(`Unexpected entry title, got %q instead of %q`, gotTitle, wantTitle)
	}
}
// TestParseRDFItemWitEmptyTitleElement parses an RDF document whose single
// item has a <title> element containing only whitespace, and expects the
// entry title to equal the item's <link> value (http://example.org/item).
// NOTE(review): "Wit" in the function name looks like a typo for "With" —
// confirm before renaming.
func TestParseRDFItemWitEmptyTitleElement(t *testing.T) {
data := `<?xml version="1.0" encoding="utf-8"?>
<rdf:RDF
xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
xmlns="http://purl.org/rss/1.0/">
<channel>
<title>Example Feed</title>
<link>http://example.org/</link>
</channel>
<item>
<title> </title>
<link>http://example.org/item</link>
<description>Test</description>
</item>
</rdf:RDF>`
feed, err := Parse("http://example.org/", bytes.NewBufferString(data))
if err != nil {
t.Fatal(err)
}
// Abort when the entry count is off: the title check below indexes the first entry.
if len(feed.Entries) != 1 {
t.Fatalf(`Unexpected number of entries, got %d`, len(feed.Entries))
}
// The expected title is the same URL as the item's <link> above.
expected := `http://example.org/item`
result := feed.Entries[0].Title
if result != expected {
t.Errorf(`Unexpected entry title, got %q instead of %q`, result, expected)
} }
} }

View file

@ -58,7 +58,7 @@ func (r *rdfFeed) Transform(baseURL string) *model.Feed {
} }
type rdfItem struct { type rdfItem struct {
Title string `xml:"title"` Title string `xml:"http://purl.org/rss/1.0/ title"`
Link string `xml:"link"` Link string `xml:"link"`
Description string `xml:"description"` Description string `xml:"description"`
dublincore.DublinCoreItemElement dublincore.DublinCoreItemElement
@ -72,11 +72,21 @@ func (r *rdfItem) Transform() *model.Entry {
entry.Content = r.entryContent() entry.Content = r.entryContent()
entry.Hash = r.entryHash() entry.Hash = r.entryHash()
entry.Date = r.entryDate() entry.Date = r.entryDate()
if entry.Title == "" {
entry.Title = entry.URL
}
return entry return entry
} }
func (r *rdfItem) entryTitle() string { func (r *rdfItem) entryTitle() string {
return html.UnescapeString(strings.TrimSpace(r.Title)) for _, title := range []string{r.Title, r.DublinCoreTitle} {
title = strings.TrimSpace(title)
if title != "" {
return html.UnescapeString(title)
}
}
return ""
} }
func (r *rdfItem) entryContent() string { func (r *rdfItem) entryContent() string {