Handle RDF feeds with duplicated <title> elements
This commit is contained in:
parent
20e5fbcd7a
commit
c595c80356
3 changed files with 117 additions and 7 deletions
|
@ -20,6 +20,7 @@ func (feed *DublinCoreFeedElement) GetSanitizedCreator() string {
|
|||
|
||||
// DublinCoreItemElement represents Dublin Core entry XML elements.
|
||||
type DublinCoreItemElement struct {
|
||||
DublinCoreTitle string `xml:"http://purl.org/dc/elements/1.1/ title"`
|
||||
DublinCoreDate string `xml:"http://purl.org/dc/elements/1.1/ date"`
|
||||
DublinCoreCreator string `xml:"http://purl.org/dc/elements/1.1/ creator"`
|
||||
DublinCoreContent string `xml:"http://purl.org/rss/1.0/modules/content/ encoded"`
|
||||
|
|
|
@ -406,7 +406,7 @@ func TestParseItemWithoutDate(t *testing.T) {
|
|||
|
||||
func TestParseItemWithEncodedHTMLTitle(t *testing.T) {
|
||||
data := `<?xml version="1.0" encoding="utf-8"?>
|
||||
<rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#">
|
||||
<rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns="http://purl.org/rss/1.0/">
|
||||
<channel>
|
||||
<title>Example</title>
|
||||
<link>http://example.org</link>
|
||||
|
@ -425,7 +425,7 @@ func TestParseItemWithEncodedHTMLTitle(t *testing.T) {
|
|||
}
|
||||
|
||||
if feed.Entries[0].Title != `AT&T` {
|
||||
t.Errorf("Incorrect entry title, got: %v", feed.Entries[0].Title)
|
||||
t.Errorf("Incorrect entry title, got: %q", feed.Entries[0].Title)
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -502,7 +502,7 @@ func TestParseFeedWithURLWrappedInSpaces(t *testing.T) {
|
|||
<item rdf:about="http://biorxiv.org/cgi/content/short/857789v1?rss=1">
|
||||
<title>
|
||||
<![CDATA[
|
||||
Microscale Collagen and Fibroblast Interactions Enhance Primary Human Hepatocyte Functions in 3-Dimensional Models
|
||||
Microscale Collagen and Fibroblast Interactions Enhance Primary Human Hepatocyte Functions in 3-Dimensional Models
|
||||
]]>
|
||||
</title>
|
||||
<link>
|
||||
|
@ -568,7 +568,7 @@ func TestParseRDFWithContentEncoded(t *testing.T) {
|
|||
expected := `<p>Test</p>`
|
||||
result := feed.Entries[0].Content
|
||||
if result != expected {
|
||||
t.Errorf(`Unexpected entry URL, got %q instead of %q`, result, expected)
|
||||
t.Errorf(`Unexpected entry content, got %q instead of %q`, result, expected)
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -601,6 +601,105 @@ func TestParseRDFWithEncodedHTMLDescription(t *testing.T) {
|
|||
expected := `AT&T <img src="https://example.org/img.png"></a>`
|
||||
result := feed.Entries[0].Content
|
||||
if result != expected {
|
||||
t.Errorf(`Unexpected entry URL, got %v instead of %v`, result, expected)
|
||||
t.Errorf(`Unexpected entry content, got %v instead of %v`, result, expected)
|
||||
}
|
||||
}
|
||||
|
||||
func TestParseRDFItemWithDuplicateTitleElement(t *testing.T) {
|
||||
data := `<?xml version="1.0" encoding="utf-8"?>
|
||||
<rdf:RDF
|
||||
xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
|
||||
xmlns="http://purl.org/rss/1.0/"
|
||||
xmlns:dc="http://purl.org/dc/elements/1.1/">
|
||||
<channel>
|
||||
<title>Example Feed</title>
|
||||
<link>http://example.org/</link>
|
||||
</channel>
|
||||
<item>
|
||||
<title>Item Title</title>
|
||||
<dc:title/>
|
||||
<link>http://example.org/</link>
|
||||
<description>Test</description>
|
||||
</item>
|
||||
</rdf:RDF>`
|
||||
|
||||
feed, err := Parse("http://example.org/", bytes.NewBufferString(data))
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
if len(feed.Entries) != 1 {
|
||||
t.Fatalf(`Unexpected number of entries, got %d`, len(feed.Entries))
|
||||
}
|
||||
|
||||
expected := `Item Title`
|
||||
result := feed.Entries[0].Title
|
||||
if result != expected {
|
||||
t.Errorf(`Unexpected entry title, got %q instead of %q`, result, expected)
|
||||
}
|
||||
}
|
||||
|
||||
func TestParseRDFItemWithDublinCoreTitleElement(t *testing.T) {
|
||||
data := `<?xml version="1.0" encoding="utf-8"?>
|
||||
<rdf:RDF
|
||||
xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
|
||||
xmlns="http://purl.org/rss/1.0/"
|
||||
xmlns:dc="http://purl.org/dc/elements/1.1/">
|
||||
<channel>
|
||||
<title>Example Feed</title>
|
||||
<link>http://example.org/</link>
|
||||
</channel>
|
||||
<item>
|
||||
<dc:title>Dublin Core Title</dc:title>
|
||||
<link>http://example.org/</link>
|
||||
<description>Test</description>
|
||||
</item>
|
||||
</rdf:RDF>`
|
||||
|
||||
feed, err := Parse("http://example.org/", bytes.NewBufferString(data))
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
if len(feed.Entries) != 1 {
|
||||
t.Fatalf(`Unexpected number of entries, got %d`, len(feed.Entries))
|
||||
}
|
||||
|
||||
expected := `Dublin Core Title`
|
||||
result := feed.Entries[0].Title
|
||||
if result != expected {
|
||||
t.Errorf(`Unexpected entry title, got %q instead of %q`, result, expected)
|
||||
}
|
||||
}
|
||||
|
||||
func TestParseRDFItemWitEmptyTitleElement(t *testing.T) {
|
||||
data := `<?xml version="1.0" encoding="utf-8"?>
|
||||
<rdf:RDF
|
||||
xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
|
||||
xmlns="http://purl.org/rss/1.0/">
|
||||
<channel>
|
||||
<title>Example Feed</title>
|
||||
<link>http://example.org/</link>
|
||||
</channel>
|
||||
<item>
|
||||
<title> </title>
|
||||
<link>http://example.org/item</link>
|
||||
<description>Test</description>
|
||||
</item>
|
||||
</rdf:RDF>`
|
||||
|
||||
feed, err := Parse("http://example.org/", bytes.NewBufferString(data))
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
if len(feed.Entries) != 1 {
|
||||
t.Fatalf(`Unexpected number of entries, got %d`, len(feed.Entries))
|
||||
}
|
||||
|
||||
expected := `http://example.org/item`
|
||||
result := feed.Entries[0].Title
|
||||
if result != expected {
|
||||
t.Errorf(`Unexpected entry title, got %q instead of %q`, result, expected)
|
||||
}
|
||||
}
|
||||
|
|
|
@ -58,7 +58,7 @@ func (r *rdfFeed) Transform(baseURL string) *model.Feed {
|
|||
}
|
||||
|
||||
type rdfItem struct {
|
||||
Title string `xml:"title"`
|
||||
Title string `xml:"http://purl.org/rss/1.0/ title"`
|
||||
Link string `xml:"link"`
|
||||
Description string `xml:"description"`
|
||||
dublincore.DublinCoreItemElement
|
||||
|
@ -72,11 +72,21 @@ func (r *rdfItem) Transform() *model.Entry {
|
|||
entry.Content = r.entryContent()
|
||||
entry.Hash = r.entryHash()
|
||||
entry.Date = r.entryDate()
|
||||
|
||||
if entry.Title == "" {
|
||||
entry.Title = entry.URL
|
||||
}
|
||||
return entry
|
||||
}
|
||||
|
||||
func (r *rdfItem) entryTitle() string {
|
||||
return html.UnescapeString(strings.TrimSpace(r.Title))
|
||||
for _, title := range []string{r.Title, r.DublinCoreTitle} {
|
||||
title = strings.TrimSpace(title)
|
||||
if title != "" {
|
||||
return html.UnescapeString(title)
|
||||
}
|
||||
}
|
||||
return ""
|
||||
}
|
||||
|
||||
func (r *rdfItem) entryContent() string {
|
||||
|
|
Loading…
Reference in a new issue