Handle RDF feeds with duplicated <title> elements
This commit is contained in:
parent
20e5fbcd7a
commit
c595c80356
3 changed files with 117 additions and 7 deletions
|
@ -20,6 +20,7 @@ func (feed *DublinCoreFeedElement) GetSanitizedCreator() string {
|
||||||
|
|
||||||
// DublinCoreItemElement represents Dublin Core entry XML elements.
|
// DublinCoreItemElement represents Dublin Core entry XML elements.
|
||||||
type DublinCoreItemElement struct {
|
type DublinCoreItemElement struct {
|
||||||
|
DublinCoreTitle string `xml:"http://purl.org/dc/elements/1.1/ title"`
|
||||||
DublinCoreDate string `xml:"http://purl.org/dc/elements/1.1/ date"`
|
DublinCoreDate string `xml:"http://purl.org/dc/elements/1.1/ date"`
|
||||||
DublinCoreCreator string `xml:"http://purl.org/dc/elements/1.1/ creator"`
|
DublinCoreCreator string `xml:"http://purl.org/dc/elements/1.1/ creator"`
|
||||||
DublinCoreContent string `xml:"http://purl.org/rss/1.0/modules/content/ encoded"`
|
DublinCoreContent string `xml:"http://purl.org/rss/1.0/modules/content/ encoded"`
|
||||||
|
|
|
@ -406,7 +406,7 @@ func TestParseItemWithoutDate(t *testing.T) {
|
||||||
|
|
||||||
func TestParseItemWithEncodedHTMLTitle(t *testing.T) {
|
func TestParseItemWithEncodedHTMLTitle(t *testing.T) {
|
||||||
data := `<?xml version="1.0" encoding="utf-8"?>
|
data := `<?xml version="1.0" encoding="utf-8"?>
|
||||||
<rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#">
|
<rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns="http://purl.org/rss/1.0/">
|
||||||
<channel>
|
<channel>
|
||||||
<title>Example</title>
|
<title>Example</title>
|
||||||
<link>http://example.org</link>
|
<link>http://example.org</link>
|
||||||
|
@ -425,7 +425,7 @@ func TestParseItemWithEncodedHTMLTitle(t *testing.T) {
|
||||||
}
|
}
|
||||||
|
|
||||||
if feed.Entries[0].Title != `AT&T` {
|
if feed.Entries[0].Title != `AT&T` {
|
||||||
t.Errorf("Incorrect entry title, got: %v", feed.Entries[0].Title)
|
t.Errorf("Incorrect entry title, got: %q", feed.Entries[0].Title)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -502,7 +502,7 @@ func TestParseFeedWithURLWrappedInSpaces(t *testing.T) {
|
||||||
<item rdf:about="http://biorxiv.org/cgi/content/short/857789v1?rss=1">
|
<item rdf:about="http://biorxiv.org/cgi/content/short/857789v1?rss=1">
|
||||||
<title>
|
<title>
|
||||||
<![CDATA[
|
<![CDATA[
|
||||||
Microscale Collagen and Fibroblast Interactions Enhance Primary Human Hepatocyte Functions in 3-Dimensional Models
|
Microscale Collagen and Fibroblast Interactions Enhance Primary Human Hepatocyte Functions in 3-Dimensional Models
|
||||||
]]>
|
]]>
|
||||||
</title>
|
</title>
|
||||||
<link>
|
<link>
|
||||||
|
@ -568,7 +568,7 @@ func TestParseRDFWithContentEncoded(t *testing.T) {
|
||||||
expected := `<p>Test</p>`
|
expected := `<p>Test</p>`
|
||||||
result := feed.Entries[0].Content
|
result := feed.Entries[0].Content
|
||||||
if result != expected {
|
if result != expected {
|
||||||
t.Errorf(`Unexpected entry URL, got %q instead of %q`, result, expected)
|
t.Errorf(`Unexpected entry content, got %q instead of %q`, result, expected)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -601,6 +601,105 @@ func TestParseRDFWithEncodedHTMLDescription(t *testing.T) {
|
||||||
expected := `AT&T <img src="https://example.org/img.png"></a>`
|
expected := `AT&T <img src="https://example.org/img.png"></a>`
|
||||||
result := feed.Entries[0].Content
|
result := feed.Entries[0].Content
|
||||||
if result != expected {
|
if result != expected {
|
||||||
t.Errorf(`Unexpected entry URL, got %v instead of %v`, result, expected)
|
t.Errorf(`Unexpected entry content, got %v instead of %v`, result, expected)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestParseRDFItemWithDuplicateTitleElement(t *testing.T) {
|
||||||
|
data := `<?xml version="1.0" encoding="utf-8"?>
|
||||||
|
<rdf:RDF
|
||||||
|
xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
|
||||||
|
xmlns="http://purl.org/rss/1.0/"
|
||||||
|
xmlns:dc="http://purl.org/dc/elements/1.1/">
|
||||||
|
<channel>
|
||||||
|
<title>Example Feed</title>
|
||||||
|
<link>http://example.org/</link>
|
||||||
|
</channel>
|
||||||
|
<item>
|
||||||
|
<title>Item Title</title>
|
||||||
|
<dc:title/>
|
||||||
|
<link>http://example.org/</link>
|
||||||
|
<description>Test</description>
|
||||||
|
</item>
|
||||||
|
</rdf:RDF>`
|
||||||
|
|
||||||
|
feed, err := Parse("http://example.org/", bytes.NewBufferString(data))
|
||||||
|
if err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
|
||||||
|
if len(feed.Entries) != 1 {
|
||||||
|
t.Fatalf(`Unexpected number of entries, got %d`, len(feed.Entries))
|
||||||
|
}
|
||||||
|
|
||||||
|
expected := `Item Title`
|
||||||
|
result := feed.Entries[0].Title
|
||||||
|
if result != expected {
|
||||||
|
t.Errorf(`Unexpected entry title, got %q instead of %q`, result, expected)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestParseRDFItemWithDublinCoreTitleElement(t *testing.T) {
|
||||||
|
data := `<?xml version="1.0" encoding="utf-8"?>
|
||||||
|
<rdf:RDF
|
||||||
|
xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
|
||||||
|
xmlns="http://purl.org/rss/1.0/"
|
||||||
|
xmlns:dc="http://purl.org/dc/elements/1.1/">
|
||||||
|
<channel>
|
||||||
|
<title>Example Feed</title>
|
||||||
|
<link>http://example.org/</link>
|
||||||
|
</channel>
|
||||||
|
<item>
|
||||||
|
<dc:title>Dublin Core Title</dc:title>
|
||||||
|
<link>http://example.org/</link>
|
||||||
|
<description>Test</description>
|
||||||
|
</item>
|
||||||
|
</rdf:RDF>`
|
||||||
|
|
||||||
|
feed, err := Parse("http://example.org/", bytes.NewBufferString(data))
|
||||||
|
if err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
|
||||||
|
if len(feed.Entries) != 1 {
|
||||||
|
t.Fatalf(`Unexpected number of entries, got %d`, len(feed.Entries))
|
||||||
|
}
|
||||||
|
|
||||||
|
expected := `Dublin Core Title`
|
||||||
|
result := feed.Entries[0].Title
|
||||||
|
if result != expected {
|
||||||
|
t.Errorf(`Unexpected entry title, got %q instead of %q`, result, expected)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestParseRDFItemWitEmptyTitleElement(t *testing.T) {
|
||||||
|
data := `<?xml version="1.0" encoding="utf-8"?>
|
||||||
|
<rdf:RDF
|
||||||
|
xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
|
||||||
|
xmlns="http://purl.org/rss/1.0/">
|
||||||
|
<channel>
|
||||||
|
<title>Example Feed</title>
|
||||||
|
<link>http://example.org/</link>
|
||||||
|
</channel>
|
||||||
|
<item>
|
||||||
|
<title> </title>
|
||||||
|
<link>http://example.org/item</link>
|
||||||
|
<description>Test</description>
|
||||||
|
</item>
|
||||||
|
</rdf:RDF>`
|
||||||
|
|
||||||
|
feed, err := Parse("http://example.org/", bytes.NewBufferString(data))
|
||||||
|
if err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
|
||||||
|
if len(feed.Entries) != 1 {
|
||||||
|
t.Fatalf(`Unexpected number of entries, got %d`, len(feed.Entries))
|
||||||
|
}
|
||||||
|
|
||||||
|
expected := `http://example.org/item`
|
||||||
|
result := feed.Entries[0].Title
|
||||||
|
if result != expected {
|
||||||
|
t.Errorf(`Unexpected entry title, got %q instead of %q`, result, expected)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -58,7 +58,7 @@ func (r *rdfFeed) Transform(baseURL string) *model.Feed {
|
||||||
}
|
}
|
||||||
|
|
||||||
type rdfItem struct {
|
type rdfItem struct {
|
||||||
Title string `xml:"title"`
|
Title string `xml:"http://purl.org/rss/1.0/ title"`
|
||||||
Link string `xml:"link"`
|
Link string `xml:"link"`
|
||||||
Description string `xml:"description"`
|
Description string `xml:"description"`
|
||||||
dublincore.DublinCoreItemElement
|
dublincore.DublinCoreItemElement
|
||||||
|
@ -72,11 +72,21 @@ func (r *rdfItem) Transform() *model.Entry {
|
||||||
entry.Content = r.entryContent()
|
entry.Content = r.entryContent()
|
||||||
entry.Hash = r.entryHash()
|
entry.Hash = r.entryHash()
|
||||||
entry.Date = r.entryDate()
|
entry.Date = r.entryDate()
|
||||||
|
|
||||||
|
if entry.Title == "" {
|
||||||
|
entry.Title = entry.URL
|
||||||
|
}
|
||||||
return entry
|
return entry
|
||||||
}
|
}
|
||||||
|
|
||||||
func (r *rdfItem) entryTitle() string {
|
func (r *rdfItem) entryTitle() string {
|
||||||
return html.UnescapeString(strings.TrimSpace(r.Title))
|
for _, title := range []string{r.Title, r.DublinCoreTitle} {
|
||||||
|
title = strings.TrimSpace(title)
|
||||||
|
if title != "" {
|
||||||
|
return html.UnescapeString(title)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return ""
|
||||||
}
|
}
|
||||||
|
|
||||||
func (r *rdfItem) entryContent() string {
|
func (r *rdfItem) entryContent() string {
|
||||||
|
|
Loading…
Reference in a new issue