Allow only absolute URLs in comments URL

Some feeds are using invalid URLs (random text).
This commit is contained in:
Frédéric Guillot 2020-01-04 15:18:24 -08:00
parent 8cebd985a2
commit bf632fad2e
6 changed files with 105 additions and 2 deletions

View file

@ -84,7 +84,7 @@ func (a *atom10Entry) Transform() *model.Entry {
entry.Content = a.entryContent()
entry.Title = a.entryTitle()
entry.Enclosures = a.entryEnclosures()
entry.CommentsURL = a.Links.firstLinkWithRelationAndType("replies", "text/html")
entry.CommentsURL = a.entryCommentsURL()
return entry
}
@ -194,6 +194,15 @@ func (a *atom10Entry) entryEnclosures() model.EnclosureList {
return enclosures
}
// entryCommentsURL looks up the entry link with relation "replies" and type
// "text/html" and returns it only when url.IsAbsoluteURL accepts it;
// otherwise an empty string is returned.
//
// See https://tools.ietf.org/html/rfc4685#section-3
func (a *atom10Entry) entryCommentsURL() string {
	repliesLink := a.Links.firstLinkWithRelationAndType("replies", "text/html")
	if !url.IsAbsoluteURL(repliesLink) {
		return ""
	}
	return repliesLink
}
type atom10Text struct {
Type string `xml:"type,attr"`
Data string `xml:",chardata"`

View file

@ -777,3 +777,43 @@ func TestParseRepliesLinkRelation(t *testing.T) {
t.Errorf("Incorrect entry comments URL, got: %s", feed.Entries[0].CommentsURL)
}
}
// TestAbsoluteCommentsURL checks that an Atom "replies" link whose href is
// not an absolute URL results in an empty entry comments URL.
func TestAbsoluteCommentsURL(t *testing.T) {
	data := `<?xml version="1.0" encoding="utf-8"?>
<feed xmlns="http://www.w3.org/2005/Atom"
xmlns:thr="http://purl.org/syndication/thread/1.0">
<id>http://www.example.org/myfeed</id>
<title>My Example Feed</title>
<updated>2005-07-28T12:00:00Z</updated>
<link href="http://www.example.org/myfeed" />
<author><name>James</name></author>
<entry>
<id>tag:entries.com,2005:1</id>
<title>My original entry</title>
<updated>2006-03-01T12:12:12Z</updated>
<link href="http://www.example.org/entries/1" />
<link rel="replies"
type="text/html"
href="invalid url"
thr:count="10" thr:updated="2005-07-28T12:10:00Z" />
<summary>This is my original entry</summary>
</entry>
</feed>`
	feed, err := Parse(bytes.NewBufferString(data))
	if err != nil {
		t.Fatal(err)
	}

	// Stop here on an unexpected entry count: the assertions below index
	// feed.Entries[0] and would panic on an empty slice.
	if len(feed.Entries) != 1 {
		t.Fatalf("Incorrect number of entries, got: %d", len(feed.Entries))
	}

	if feed.Entries[0].URL != "http://www.example.org/entries/1" {
		t.Errorf("Incorrect entry URL, got: %s", feed.Entries[0].URL)
	}

	if feed.Entries[0].CommentsURL != "" {
		t.Errorf("Incorrect entry comments URL, got: %s", feed.Entries[0].CommentsURL)
	}
}

View file

@ -837,6 +837,31 @@ func TestParseEntryWithCommentsURL(t *testing.T) {
}
}
// TestParseEntryWithInvalidCommentsURL checks that an RSS <comments> element
// containing plain text instead of a URL results in an empty comments URL.
func TestParseEntryWithInvalidCommentsURL(t *testing.T) {
	data := `<?xml version="1.0" encoding="utf-8"?>
<rss version="2.0" xmlns:slash="http://purl.org/rss/1.0/modules/slash/">
<channel>
<link>https://example.org/</link>
<item>
<title>Item 1</title>
<link>https://example.org/item1</link>
<comments>
Some text
</comments>
</item>
</channel>
</rss>`
	feed, err := Parse(bytes.NewBufferString(data))
	if err != nil {
		t.Fatal(err)
	}

	// Guard the index below: fail cleanly instead of panicking when the
	// parser does not produce exactly one entry.
	if len(feed.Entries) != 1 {
		t.Fatalf("Incorrect number of entries, got: %d", len(feed.Entries))
	}

	if feed.Entries[0].CommentsURL != "" {
		t.Errorf("Incorrect entry comments URL, got: %q", feed.Entries[0].CommentsURL)
	}
}
func TestParseInvalidXml(t *testing.T) {
data := `garbage`
_, err := Parse(bytes.NewBufferString(data))

View file

@ -317,7 +317,12 @@ func (r *rssItem) entryEnclosures() model.EnclosureList {
func (r *rssItem) entryCommentsURL() string {
for _, commentLink := range r.CommentLinks {
if commentLink.XMLName.Space == "" {
return strings.TrimSpace(commentLink.Data)
commentsURL := strings.TrimSpace(commentLink.Data)
// The comments URL is supposed to be absolute (some feeds publish incorrect comments URLs)
// See https://cyber.harvard.edu/rss/rss.html#ltcommentsgtSubelementOfLtitemgt
if url.IsAbsoluteURL(commentsURL) {
return commentsURL
}
}
}

View file

@ -11,6 +11,15 @@ import (
"strings"
)
// IsAbsoluteURL reports whether link parses successfully with url.Parse and
// the resulting URL is absolute according to URL.IsAbs. A link that fails to
// parse is never considered absolute.
func IsAbsoluteURL(link string) bool {
	parsed, parseErr := url.Parse(link)
	// parsed is nil when parsing fails; the short-circuit keeps IsAbs from
	// being called in that case.
	return parseErr == nil && parsed.IsAbs()
}
// AbsoluteURL converts the input URL as absolute URL if necessary.
func AbsoluteURL(baseURL, input string) (string, error) {
if strings.HasPrefix(input, "//") {

View file

@ -6,6 +6,21 @@ package url // import "miniflux.app/url"
import "testing"
// TestIsAbsoluteURL runs IsAbsoluteURL over a table of inputs and reports
// every case whose result differs from the expected value.
func TestIsAbsoluteURL(t *testing.T) {
	cases := []struct {
		link string
		want bool
	}{
		{"https://example.org/file.pdf", true},
		{"magnet:?xt.1=urn:sha1:YNCKHTQCWBTRNJIV4WNAE52SJUQCZO5C&xt.2=urn:sha1:TXGCZQTH26NL6OUQAJJPFALHG2LTGBC7", true},
		{"invalid url", false},
	}

	for _, tc := range cases {
		if got := IsAbsoluteURL(tc.link); got != tc.want {
			t.Errorf(`Unexpected result, got %v instead of %v for %q`, got, tc.want, tc.link)
		}
	}
}
func TestAbsoluteURL(t *testing.T) {
scenarios := [][]string{
[]string{"https://example.org/path/file.ext", "https://example.org/folder/", "/path/file.ext"},