Strip HTML tags from DublinCore Creator tags

This commit is contained in:
Frédéric Guillot 2023-09-08 16:50:06 -07:00
parent 344a237af8
commit 36f013670e
5 changed files with 53 additions and 20 deletions

View file

@ -1,16 +1,30 @@
// SPDX-FileCopyrightText: Copyright The Miniflux Authors. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
package rdf // import "miniflux.app/v2/internal/reader/rdf"
package dublincore // import "miniflux.app/v2/internal/reader/dublincore"
import (
"strings"
"miniflux.app/v2/internal/reader/sanitizer"
)
// DublinCoreFeedElement represents Dublin Core feed XML elements.
// It is embedded by feed-level structs (for example rdfFeed) so that the
// creator declared on the channel is decoded alongside the feed's own fields.
type DublinCoreFeedElement struct {
// DublinCoreCreator holds the text of the "creator" element, in the Dublin
// Core elements 1.1 namespace, nested under "channel". The value is stored
// as decoded; call GetSanitizedCreator to obtain a cleaned-up version.
DublinCoreCreator string `xml:"http://purl.org/dc/elements/1.1/ channel>creator"`
}
// DublinCoreEntryElement represents Dublin Core entry XML elements.
type DublinCoreEntryElement struct {
// GetSanitizedCreator returns the feed-level creator after passing it through
// sanitizer.StripTags and trimming leading and trailing whitespace.
func (feed *DublinCoreFeedElement) GetSanitizedCreator() string {
	stripped := sanitizer.StripTags(feed.DublinCoreCreator)
	return strings.TrimSpace(stripped)
}
// DublinCoreItemElement represents Dublin Core entry XML elements.
// It is embedded by item-level structs (for example rdfItem and rssItem).
type DublinCoreItemElement struct {
// DublinCoreDate holds the text of the "date" element in the Dublin Core
// elements 1.1 namespace, exactly as decoded (no parsing happens here).
DublinCoreDate string `xml:"http://purl.org/dc/elements/1.1/ date"`
// DublinCoreCreator holds the text of the "creator" element in the Dublin
// Core elements 1.1 namespace. Feeds may put HTML markup in it; call
// GetSanitizedCreator to obtain a cleaned-up version.
DublinCoreCreator string `xml:"http://purl.org/dc/elements/1.1/ creator"`
// DublinCoreContent holds the text of the "encoded" element. Despite the
// field name, its tag points at the RSS 1.0 content module namespace, not
// at the Dublin Core one.
DublinCoreContent string `xml:"http://purl.org/rss/1.0/modules/content/ encoded"`
}
// GetSanitizedCreator returns the item's creator after passing it through
// sanitizer.StripTags and trimming leading and trailing whitespace.
func (item *DublinCoreItemElement) GetSanitizedCreator() string {
	stripped := sanitizer.StripTags(item.DublinCoreCreator)
	return strings.TrimSpace(stripped)
}

View file

@ -349,6 +349,34 @@ func TestParseItemWithDublicCoreDate(t *testing.T) {
}
}
// TestParseItemWithEncodedHTMLInDCCreatorField checks that an RDF item whose
// dc:creator element contains entity-encoded HTML anchors yields an entry
// author made of the plain text only.
func TestParseItemWithEncodedHTMLInDCCreatorField(t *testing.T) {
	data := `<?xml version="1.0" encoding="utf-8"?>
<rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns="http://purl.org/rss/1.0/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:slash="http://purl.org/rss/1.0/modules/slash/">
<channel>
<title>Example</title>
<link>http://example.org</link>
</channel>
<item>
<title>Title</title>
<description>Test</description>
<link>http://example.org/test.html</link>
<dc:creator>&lt;a href=&quot;http://example.org/author1&quot;>Author 1&lt;/a&gt; (University 1), &lt;a href=&quot;http://example.org/author2&quot;>Author 2&lt;/a&gt; (University 2)</dc:creator>
<dc:date>2018-04-10T05:00:00+00:00</dc:date>
</item>
</rdf:RDF>`

	feed, err := Parse("http://example.org", bytes.NewBufferString(data))
	if err != nil {
		t.Fatal(err)
	}

	// Guard the index below: without it an empty result would panic and abort
	// the whole package's test run instead of reporting a normal failure.
	if len(feed.Entries) == 0 {
		t.Fatalf("Incorrect number of entries, got: %d, want at least 1", len(feed.Entries))
	}

	expectedAuthor := "Author 1 (University 1), Author 2 (University 2)"
	if feed.Entries[0].Author != expectedAuthor {
		t.Errorf("Incorrect entry author, got: %s, want: %s", feed.Entries[0].Author, expectedAuthor)
	}
}
func TestParseItemWithoutDate(t *testing.T) {
data := `<?xml version="1.0" encoding="utf-8"?>
<rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns="http://purl.org/rss/1.0/">

View file

@ -13,6 +13,7 @@ import (
"miniflux.app/v2/internal/logger"
"miniflux.app/v2/internal/model"
"miniflux.app/v2/internal/reader/date"
"miniflux.app/v2/internal/reader/dublincore"
"miniflux.app/v2/internal/reader/sanitizer"
"miniflux.app/v2/internal/urllib"
)
@ -22,7 +23,7 @@ type rdfFeed struct {
Title string `xml:"channel>title"`
Link string `xml:"channel>link"`
Items []rdfItem `xml:"item"`
DublinCoreFeedElement
dublincore.DublinCoreFeedElement
}
func (r *rdfFeed) Transform(baseURL string) *model.Feed {
@ -38,7 +39,7 @@ func (r *rdfFeed) Transform(baseURL string) *model.Feed {
for _, item := range r.Items {
entry := item.Transform()
if entry.Author == "" && r.DublinCoreCreator != "" {
entry.Author = strings.TrimSpace(r.DublinCoreCreator)
entry.Author = r.GetSanitizedCreator()
}
if entry.URL == "" {
@ -60,7 +61,7 @@ type rdfItem struct {
Title string `xml:"title"`
Link string `xml:"link"`
Description string `xml:"description"`
DublinCoreEntryElement
dublincore.DublinCoreItemElement
}
func (r *rdfItem) Transform() *model.Entry {
@ -88,7 +89,7 @@ func (r *rdfItem) entryContent() string {
}
// entryAuthor returns the author for this RDF item, taken from its Dublin Core
// creator. The value comes from GetSanitizedCreator, so markup embedded in the
// creator element is stripped and surrounding whitespace is trimmed.
//
// The earlier whitespace-only return is dropped: it left the sanitizing return
// unreachable and let raw markup through to the entry author.
func (r *rdfItem) entryAuthor() string {
	return r.GetSanitizedCreator()
}
func (r *rdfItem) entryURL() string {

View file

@ -1,11 +0,0 @@
// SPDX-FileCopyrightText: Copyright The Miniflux Authors. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
package rss // import "miniflux.app/v2/internal/reader/rss"
// DublinCoreElement represents Dublin Core XML elements.
type DublinCoreElement struct {
// DublinCoreDate holds the text of the "date" element in the Dublin Core
// elements 1.1 namespace, exactly as decoded.
DublinCoreDate string `xml:"http://purl.org/dc/elements/1.1/ date"`
// DublinCoreCreator holds the text of the "creator" element in the Dublin
// Core elements 1.1 namespace, exactly as decoded.
DublinCoreCreator string `xml:"http://purl.org/dc/elements/1.1/ creator"`
// DublinCoreContent holds the text of the "encoded" element. Despite the
// field name, its tag points at the RSS 1.0 content module namespace, not
// at the Dublin Core one.
DublinCoreContent string `xml:"http://purl.org/rss/1.0/modules/content/ encoded"`
}

View file

@ -15,6 +15,7 @@ import (
"miniflux.app/v2/internal/logger"
"miniflux.app/v2/internal/model"
"miniflux.app/v2/internal/reader/date"
"miniflux.app/v2/internal/reader/dublincore"
"miniflux.app/v2/internal/reader/media"
"miniflux.app/v2/internal/reader/sanitizer"
"miniflux.app/v2/internal/urllib"
@ -182,7 +183,7 @@ type rssItem struct {
CommentLinks []rssCommentLink `xml:"comments"`
EnclosureLinks []rssEnclosure `xml:"enclosure"`
Categories []rssCategory `xml:"category"`
DublinCoreElement
dublincore.DublinCoreItemElement
FeedBurnerElement
PodcastEntryElement
media.Element
@ -250,7 +251,7 @@ func (r *rssItem) entryAuthor() string {
}
if author == "" {
author = r.DublinCoreCreator
author = r.GetSanitizedCreator()
}
return sanitizer.StripTags(strings.TrimSpace(author))