Fix incorrect parsing of Atom entry content of type HTML

This commit is contained in:
Frédéric Guillot 2021-03-18 21:34:09 -07:00 committed by fguillot
parent 49171c5e8c
commit 14888f1cb8
2 changed files with 154 additions and 23 deletions

View file

@ -48,7 +48,7 @@ func (a *atom10Feed) Transform(baseURL string) *model.Feed {
feed.SiteURL = siteURL
}
feed.Title = a.Title.String()
feed.Title = html.UnescapeString(a.Title.String())
if feed.Title == "" {
feed.Title = feed.SiteURL
}
@ -100,7 +100,7 @@ func (a *atom10Entry) Transform() *model.Entry {
}
// entryTitle returns the entry title with HTML entities unescaped.
//
// The unescaping is applied here, on top of a.Title.String(), so that an
// entity-encoded title (for example one delivered as type="html") ends up as
// plain display text.
func (a *atom10Entry) entryTitle() string {
	return html.UnescapeString(a.Title.String())
}
func (a *atom10Entry) entryContent() string {
@ -221,20 +221,19 @@ func (a *atom10Entry) entryCommentsURL() string {
}
// atom10Text maps an Atom text construct (title, summary, content, ...).
//
// Type carries the element's "type" attribute. CharData receives the
// element's character data and InnerXML receives the raw, unparsed markup
// nested inside the element, as selected by the struct tags below.
type atom10Text struct {
	Type     string `xml:"type,attr"`
	CharData string `xml:",chardata"`
	InnerXML string `xml:",innerxml"`
}

// String returns the textual payload of the construct with leading and
// trailing whitespace removed.
//
// For type "xhtml" the payload is the raw inner XML, so the embedded markup is
// kept as written. For every other type (including a missing type attribute)
// the payload is the character data.
//
// No HTML entity unescaping happens here: unescaping at this level would
// corrupt HTML content such as "AT&amp;T". Callers that need plain text
// (e.g. titles) apply html.UnescapeString themselves.
func (a *atom10Text) String() string {
	var content string

	if a.Type == "xhtml" {
		content = a.InnerXML
	} else {
		content = a.CharData
	}

	return strings.TrimSpace(content)
}

View file

@ -244,7 +244,33 @@ func TestParseEntryTitleWithWhitespaces(t *testing.T) {
}
}
func TestParseEntryTitleWithHTMLAndCDATA(t *testing.T) {
// TestParseEntryWithPlainTextTitle checks that an XML-escaped ampersand in a
// type="text" entry title is exposed as a literal "&" in the parsed title.
func TestParseEntryWithPlainTextTitle(t *testing.T) {
	const payload = `<?xml version="1.0" encoding="utf-8"?>
<feed xmlns="http://www.w3.org/2005/Atom">
<title>Example Feed</title>
<link href="http://example.org/"/>
<entry>
<title type="text">AT&amp;T bought by SBC!</title>
<link href="http://example.org/2003/12/13/atom03"/>
<id>urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a</id>
<updated>2003-12-13T18:30:02Z</updated>
<summary>Some text.</summary>
</entry>
</feed>`
	const want = `AT&T bought by SBC!`

	parsed, parseErr := Parse("https://example.org/", bytes.NewBufferString(payload))
	if parseErr != nil {
		t.Fatal(parseErr)
	}

	if got := parsed.Entries[0].Title; got != want {
		t.Errorf("Incorrect entry title, got: %q", got)
	}
}
func TestParseEntryWithHTMLAndCDATATitle(t *testing.T) {
data := `<?xml version="1.0" encoding="utf-8"?>
<feed xmlns="http://www.w3.org/2005/Atom">
<title>Example Feed</title>
@ -270,7 +296,7 @@ func TestParseEntryTitleWithHTMLAndCDATA(t *testing.T) {
}
}
func TestParseEntryTitleWithHTML(t *testing.T) {
func TestParseEntryWithHTMLTitle(t *testing.T) {
data := `<?xml version="1.0" encoding="utf-8"?>
<feed xmlns="http://www.w3.org/2005/Atom">
<title>Example Feed</title>
@ -296,7 +322,7 @@ func TestParseEntryTitleWithHTML(t *testing.T) {
}
}
func TestParseEntryTitleWithXHTML(t *testing.T) {
func TestParseEntryWithXHTMLTitle(t *testing.T) {
data := `<?xml version="1.0" encoding="utf-8"?>
<feed xmlns="http://www.w3.org/2005/Atom">
<title>Example Feed</title>
@ -322,7 +348,7 @@ func TestParseEntryTitleWithXHTML(t *testing.T) {
}
}
func TestParseEntryTitleWithNumericCharacterReference(t *testing.T) {
func TestParseEntryWithNumericCharacterReferenceTitle(t *testing.T) {
data := `<?xml version="1.0" encoding="utf-8"?>
<feed xmlns="http://www.w3.org/2005/Atom">
<title>Example Feed</title>
@ -348,7 +374,7 @@ func TestParseEntryTitleWithNumericCharacterReference(t *testing.T) {
}
}
func TestParseEntryTitleWithDoubleEncodedEntities(t *testing.T) {
func TestParseEntryWithDoubleEncodedEntitiesTitle(t *testing.T) {
data := `<?xml version="1.0" encoding="utf-8"?>
<feed xmlns="http://www.w3.org/2005/Atom">
<title>Example Feed</title>
@ -374,14 +400,14 @@ func TestParseEntryTitleWithDoubleEncodedEntities(t *testing.T) {
}
}
func TestParseEntrySummaryWithXHTML(t *testing.T) {
func TestParseEntryWithXHTMLSummary(t *testing.T) {
data := `<?xml version="1.0" encoding="utf-8"?>
<feed xmlns="http://www.w3.org/2005/Atom">
<title>Example Feed</title>
<link href="http://example.org/"/>
<entry>
<title type="xhtml"><code>Test</code> Test</title>
<title type="xhtml">Example</title>
<link href="http://example.org/2003/12/13/atom03"/>
<id>urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a</id>
<updated>2003-12-13T18:30:02Z</updated>
@ -400,14 +426,14 @@ func TestParseEntrySummaryWithXHTML(t *testing.T) {
}
}
func TestParseEntrySummaryWithHTML(t *testing.T) {
func TestParseEntryWithHTMLAndCDATASummary(t *testing.T) {
data := `<?xml version="1.0" encoding="utf-8"?>
<feed xmlns="http://www.w3.org/2005/Atom">
<title>Example Feed</title>
<link href="http://example.org/"/>
<entry>
<title type="html">&lt;code&gt;Test&lt;/code&gt; Test</title>
<title type="html">Example</title>
<link href="http://example.org/2003/12/13/atom03"/>
<id>urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a</id>
<updated>2003-12-13T18:30:02Z</updated>
@ -426,14 +452,14 @@ func TestParseEntrySummaryWithHTML(t *testing.T) {
}
}
func TestParseEntrySummaryWithPlainText(t *testing.T) {
func TestParseEntryWithPlainTextAndCDATASummary(t *testing.T) {
data := `<?xml version="1.0" encoding="utf-8"?>
<feed xmlns="http://www.w3.org/2005/Atom">
<title>Example Feed</title>
<link href="http://example.org/"/>
<entry>
<title type="html">&lt;code&gt;Test&lt;/code&gt; Test</title>
<title type="html">Example</title>
<link href="http://example.org/2003/12/13/atom03"/>
<id>urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a</id>
<updated>2003-12-13T18:30:02Z</updated>
@ -452,6 +478,112 @@ func TestParseEntrySummaryWithPlainText(t *testing.T) {
}
}
// TestParseEntryWithTextAndCDATAContent checks that untyped content wrapped in
// a CDATA section is returned verbatim, i.e. "&amp;" inside the CDATA is not
// decoded.
func TestParseEntryWithTextAndCDATAContent(t *testing.T) {
	const payload = `<?xml version="1.0" encoding="utf-8"?>
<feed xmlns="http://www.w3.org/2005/Atom">
<title>Example Feed</title>
<link href="http://example.org/"/>
<entry>
<title type="html">Example</title>
<link href="http://example.org/2003/12/13/atom03"/>
<id>urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a</id>
<updated>2003-12-13T18:30:02Z</updated>
<content><![CDATA[AT&amp;T bought by SBC!]]></content>
</entry>
</feed>`
	const want = "AT&amp;T bought by SBC!"

	parsed, parseErr := Parse("https://example.org/", bytes.NewBufferString(payload))
	if parseErr != nil {
		t.Fatal(parseErr)
	}

	if got := parsed.Entries[0].Content; got != want {
		t.Errorf("Incorrect entry content, got: %q", got)
	}
}
// TestParseEntryWithTextContent checks that untyped content containing an
// XML-escaped ampersand is exposed with the ampersand decoded once.
func TestParseEntryWithTextContent(t *testing.T) {
	const payload = `<?xml version="1.0" encoding="utf-8"?>
<feed xmlns="http://www.w3.org/2005/Atom">
<title>Example Feed</title>
<link href="http://example.org/"/>
<entry>
<title type="html">Example</title>
<link href="http://example.org/2003/12/13/atom03"/>
<id>urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a</id>
<updated>2003-12-13T18:30:02Z</updated>
<content>AT&amp;T bought by SBC!</content>
</entry>
</feed>`
	const want = "AT&T bought by SBC!"

	parsed, parseErr := Parse("https://example.org/", bytes.NewBufferString(payload))
	if parseErr != nil {
		t.Fatal(parseErr)
	}

	if got := parsed.Entries[0].Content; got != want {
		t.Errorf("Incorrect entry content, got: %q", got)
	}
}
// TestParseEntryWithHTMLContent checks that type="html" content loses exactly
// one level of escaping: the markup becomes real tags while "&amp;amp;" is
// reduced to "&amp;" rather than to a bare "&".
func TestParseEntryWithHTMLContent(t *testing.T) {
	const payload = `<?xml version="1.0" encoding="utf-8"?>
<feed xmlns="http://www.w3.org/2005/Atom">
<title>Example Feed</title>
<link href="http://example.org/"/>
<entry>
<title type="html">Example</title>
<link href="http://example.org/2003/12/13/atom03"/>
<id>urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a</id>
<updated>2003-12-13T18:30:02Z</updated>
<content type="html">AT&amp;amp;T bought &lt;b&gt;by SBC&lt;/b&gt;!</content>
</entry>
</feed>`
	const want = "AT&amp;T bought <b>by SBC</b>!"

	parsed, parseErr := Parse("https://example.org/", bytes.NewBufferString(payload))
	if parseErr != nil {
		t.Fatal(parseErr)
	}

	if got := parsed.Entries[0].Content; got != want {
		t.Errorf("Incorrect entry content, got: %q", got)
	}
}
// TestParseEntryWithXHTMLContent checks that type="xhtml" content is exposed
// as the inner markup exactly as written in the feed (entities untouched),
// without the whitespace surrounding the wrapping <div>.
func TestParseEntryWithXHTMLContent(t *testing.T) {
	const payload = `<?xml version="1.0" encoding="utf-8"?>
<feed xmlns="http://www.w3.org/2005/Atom">
<title>Example Feed</title>
<link href="http://example.org/"/>
<entry>
<title type="html">Example</title>
<link href="http://example.org/2003/12/13/atom03"/>
<id>urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a</id>
<updated>2003-12-13T18:30:02Z</updated>
<content type="xhtml">
<div xmlns="http://www.w3.org/1999/xhtml">AT&amp;T bought <b>by SBC</b>!</div>
</content>
</entry>
</feed>`
	const want = `<div xmlns="http://www.w3.org/1999/xhtml">AT&amp;T bought <b>by SBC</b>!</div>`

	parsed, parseErr := Parse("https://example.org/", bytes.NewBufferString(payload))
	if parseErr != nil {
		t.Fatal(parseErr)
	}

	if got := parsed.Entries[0].Content; got != want {
		t.Errorf("Incorrect entry content, got: %q", got)
	}
}
func TestParseEntryWithAuthorName(t *testing.T) {
data := `<?xml version="1.0" encoding="utf-8"?>
<feed xmlns="http://www.w3.org/2005/Atom">