Improve feed parsers
This commit is contained in:
parent
3b40ce4960
commit
2b641cc224
4 changed files with 27 additions and 25 deletions
|
@ -15,7 +15,6 @@ import (
|
||||||
"github.com/miniflux/miniflux2/model"
|
"github.com/miniflux/miniflux2/model"
|
||||||
"github.com/miniflux/miniflux2/reader/date"
|
"github.com/miniflux/miniflux2/reader/date"
|
||||||
"github.com/miniflux/miniflux2/reader/processor"
|
"github.com/miniflux/miniflux2/reader/processor"
|
||||||
"github.com/miniflux/miniflux2/reader/sanitizer"
|
|
||||||
)
|
)
|
||||||
|
|
||||||
type atomFeed struct {
|
type atomFeed struct {
|
||||||
|
@ -64,7 +63,7 @@ func (a *atomFeed) Transform() *model.Feed {
|
||||||
feed := new(model.Feed)
|
feed := new(model.Feed)
|
||||||
feed.FeedURL = getRelationURL(a.Links, "self")
|
feed.FeedURL = getRelationURL(a.Links, "self")
|
||||||
feed.SiteURL = getURL(a.Links)
|
feed.SiteURL = getURL(a.Links)
|
||||||
feed.Title = sanitizer.StripTags(a.Title)
|
feed.Title = strings.TrimSpace(a.Title)
|
||||||
|
|
||||||
if feed.Title == "" {
|
if feed.Title == "" {
|
||||||
feed.Title = feed.SiteURL
|
feed.Title = feed.SiteURL
|
||||||
|
@ -86,10 +85,10 @@ func (a *atomEntry) Transform() *model.Entry {
|
||||||
entry := new(model.Entry)
|
entry := new(model.Entry)
|
||||||
entry.URL = getURL(a.Links)
|
entry.URL = getURL(a.Links)
|
||||||
entry.Date = getDate(a)
|
entry.Date = getDate(a)
|
||||||
entry.Author = sanitizer.StripTags(getAuthor(a.Author))
|
entry.Author = getAuthor(a.Author)
|
||||||
entry.Hash = getHash(a)
|
entry.Hash = getHash(a)
|
||||||
entry.Content = processor.ItemContentProcessor(entry.URL, getContent(a))
|
entry.Content = processor.ItemContentProcessor(entry.URL, getContent(a))
|
||||||
entry.Title = sanitizer.StripTags(strings.Trim(a.Title, " \n\t"))
|
entry.Title = strings.TrimSpace(a.Title)
|
||||||
entry.Enclosures = getEnclosures(a)
|
entry.Enclosures = getEnclosures(a)
|
||||||
|
|
||||||
if entry.Title == "" {
|
if entry.Title == "" {
|
||||||
|
@ -102,11 +101,11 @@ func (a *atomEntry) Transform() *model.Entry {
|
||||||
func getURL(links []atomLink) string {
|
func getURL(links []atomLink) string {
|
||||||
for _, link := range links {
|
for _, link := range links {
|
||||||
if strings.ToLower(link.Rel) == "alternate" {
|
if strings.ToLower(link.Rel) == "alternate" {
|
||||||
return link.URL
|
return strings.TrimSpace(link.URL)
|
||||||
}
|
}
|
||||||
|
|
||||||
if link.Rel == "" && link.Type == "" {
|
if link.Rel == "" && link.Type == "" {
|
||||||
return link.URL
|
return strings.TrimSpace(link.URL)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -116,7 +115,7 @@ func getURL(links []atomLink) string {
|
||||||
func getRelationURL(links []atomLink, relation string) string {
|
func getRelationURL(links []atomLink, relation string) string {
|
||||||
for _, link := range links {
|
for _, link := range links {
|
||||||
if strings.ToLower(link.Rel) == relation {
|
if strings.ToLower(link.Rel) == relation {
|
||||||
return link.URL
|
return strings.TrimSpace(link.URL)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -182,11 +181,11 @@ func getEnclosures(a *atomEntry) model.EnclosureList {
|
||||||
|
|
||||||
func getAuthor(author atomAuthor) string {
|
func getAuthor(author atomAuthor) string {
|
||||||
if author.Name != "" {
|
if author.Name != "" {
|
||||||
return author.Name
|
return strings.TrimSpace(author.Name)
|
||||||
}
|
}
|
||||||
|
|
||||||
if author.Email != "" {
|
if author.Email != "" {
|
||||||
return author.Email
|
return strings.TrimSpace(author.Email)
|
||||||
}
|
}
|
||||||
|
|
||||||
return ""
|
return ""
|
||||||
|
|
|
@ -9,11 +9,12 @@ import (
|
||||||
"strings"
|
"strings"
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
|
"github.com/miniflux/miniflux2/reader/sanitizer"
|
||||||
|
|
||||||
"github.com/miniflux/miniflux2/helper"
|
"github.com/miniflux/miniflux2/helper"
|
||||||
"github.com/miniflux/miniflux2/model"
|
"github.com/miniflux/miniflux2/model"
|
||||||
"github.com/miniflux/miniflux2/reader/date"
|
"github.com/miniflux/miniflux2/reader/date"
|
||||||
"github.com/miniflux/miniflux2/reader/processor"
|
"github.com/miniflux/miniflux2/reader/processor"
|
||||||
"github.com/miniflux/miniflux2/reader/sanitizer"
|
|
||||||
)
|
)
|
||||||
|
|
||||||
type jsonFeed struct {
|
type jsonFeed struct {
|
||||||
|
@ -59,7 +60,7 @@ func (j *jsonFeed) Transform() *model.Feed {
|
||||||
feed := new(model.Feed)
|
feed := new(model.Feed)
|
||||||
feed.FeedURL = j.FeedURL
|
feed.FeedURL = j.FeedURL
|
||||||
feed.SiteURL = j.SiteURL
|
feed.SiteURL = j.SiteURL
|
||||||
feed.Title = sanitizer.StripTags(j.Title)
|
feed.Title = strings.TrimSpace(j.Title)
|
||||||
|
|
||||||
if feed.Title == "" {
|
if feed.Title == "" {
|
||||||
feed.Title = feed.SiteURL
|
feed.Title = feed.SiteURL
|
||||||
|
@ -110,7 +111,7 @@ func (j *jsonItem) GetHash() string {
|
||||||
func (j *jsonItem) GetTitle() string {
|
func (j *jsonItem) GetTitle() string {
|
||||||
for _, value := range []string{j.Title, j.Summary, j.Text, j.HTML} {
|
for _, value := range []string{j.Title, j.Summary, j.Text, j.HTML} {
|
||||||
if value != "" {
|
if value != "" {
|
||||||
return truncate(value)
|
return truncate(sanitizer.StripTags(value))
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -145,17 +146,17 @@ func (j *jsonItem) Transform() *model.Entry {
|
||||||
entry := new(model.Entry)
|
entry := new(model.Entry)
|
||||||
entry.URL = j.URL
|
entry.URL = j.URL
|
||||||
entry.Date = j.GetDate()
|
entry.Date = j.GetDate()
|
||||||
entry.Author = sanitizer.StripTags(j.GetAuthor())
|
entry.Author = j.GetAuthor()
|
||||||
entry.Hash = j.GetHash()
|
entry.Hash = j.GetHash()
|
||||||
entry.Content = processor.ItemContentProcessor(entry.URL, j.GetContent())
|
entry.Content = processor.ItemContentProcessor(entry.URL, j.GetContent())
|
||||||
entry.Title = sanitizer.StripTags(strings.Trim(j.GetTitle(), " \n\t"))
|
entry.Title = strings.TrimSpace(j.GetTitle())
|
||||||
entry.Enclosures = j.GetEnclosures()
|
entry.Enclosures = j.GetEnclosures()
|
||||||
return entry
|
return entry
|
||||||
}
|
}
|
||||||
|
|
||||||
func getAuthor(author jsonAuthor) string {
|
func getAuthor(author jsonAuthor) string {
|
||||||
if author.Name != "" {
|
if author.Name != "" {
|
||||||
return author.Name
|
return strings.TrimSpace(author.Name)
|
||||||
}
|
}
|
||||||
|
|
||||||
return ""
|
return ""
|
||||||
|
@ -163,6 +164,7 @@ func getAuthor(author jsonAuthor) string {
|
||||||
|
|
||||||
func truncate(str string) string {
|
func truncate(str string) string {
|
||||||
max := 100
|
max := 100
|
||||||
|
str = strings.TrimSpace(str)
|
||||||
if len(str) > max {
|
if len(str) > max {
|
||||||
return str[:max] + "..."
|
return str[:max] + "..."
|
||||||
}
|
}
|
||||||
|
|
|
@ -6,6 +6,7 @@ package rdf
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"encoding/xml"
|
"encoding/xml"
|
||||||
|
"strings"
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
"github.com/miniflux/miniflux2/helper"
|
"github.com/miniflux/miniflux2/helper"
|
||||||
|
@ -54,8 +55,8 @@ type rdfItem struct {
|
||||||
|
|
||||||
func (r *rdfItem) Transform() *model.Entry {
|
func (r *rdfItem) Transform() *model.Entry {
|
||||||
entry := new(model.Entry)
|
entry := new(model.Entry)
|
||||||
entry.Title = sanitizer.StripTags(r.Title)
|
entry.Title = strings.TrimSpace(r.Title)
|
||||||
entry.Author = sanitizer.StripTags(r.Creator)
|
entry.Author = strings.TrimSpace(r.Creator)
|
||||||
entry.URL = r.Link
|
entry.URL = r.Link
|
||||||
entry.Content = processor.ItemContentProcessor(entry.URL, r.Description)
|
entry.Content = processor.ItemContentProcessor(entry.URL, r.Description)
|
||||||
entry.Hash = getHash(r)
|
entry.Hash = getHash(r)
|
||||||
|
|
|
@ -16,7 +16,6 @@ import (
|
||||||
"github.com/miniflux/miniflux2/model"
|
"github.com/miniflux/miniflux2/model"
|
||||||
"github.com/miniflux/miniflux2/reader/date"
|
"github.com/miniflux/miniflux2/reader/date"
|
||||||
"github.com/miniflux/miniflux2/reader/processor"
|
"github.com/miniflux/miniflux2/reader/processor"
|
||||||
"github.com/miniflux/miniflux2/reader/sanitizer"
|
|
||||||
)
|
)
|
||||||
|
|
||||||
type rssFeed struct {
|
type rssFeed struct {
|
||||||
|
@ -68,7 +67,7 @@ type rssEnclosure struct {
|
||||||
func (r *rssFeed) GetSiteURL() string {
|
func (r *rssFeed) GetSiteURL() string {
|
||||||
for _, element := range r.Links {
|
for _, element := range r.Links {
|
||||||
if element.XMLName.Space == "" {
|
if element.XMLName.Space == "" {
|
||||||
return element.Data
|
return strings.TrimSpace(element.Data)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -78,7 +77,7 @@ func (r *rssFeed) GetSiteURL() string {
|
||||||
func (r *rssFeed) GetFeedURL() string {
|
func (r *rssFeed) GetFeedURL() string {
|
||||||
for _, element := range r.Links {
|
for _, element := range r.Links {
|
||||||
if element.XMLName.Space == "http://www.w3.org/2005/Atom" {
|
if element.XMLName.Space == "http://www.w3.org/2005/Atom" {
|
||||||
return element.Href
|
return strings.TrimSpace(element.Href)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -89,7 +88,7 @@ func (r *rssFeed) Transform() *model.Feed {
|
||||||
feed := new(model.Feed)
|
feed := new(model.Feed)
|
||||||
feed.SiteURL = r.GetSiteURL()
|
feed.SiteURL = r.GetSiteURL()
|
||||||
feed.FeedURL = r.GetFeedURL()
|
feed.FeedURL = r.GetFeedURL()
|
||||||
feed.Title = sanitizer.StripTags(r.Title)
|
feed.Title = strings.TrimSpace(r.Title)
|
||||||
|
|
||||||
if feed.Title == "" {
|
if feed.Title == "" {
|
||||||
feed.Title = feed.SiteURL
|
feed.Title = feed.SiteURL
|
||||||
|
@ -101,7 +100,7 @@ func (r *rssFeed) Transform() *model.Feed {
|
||||||
if entry.Author == "" && r.ItunesAuthor != "" {
|
if entry.Author == "" && r.ItunesAuthor != "" {
|
||||||
entry.Author = r.ItunesAuthor
|
entry.Author = r.ItunesAuthor
|
||||||
}
|
}
|
||||||
entry.Author = sanitizer.StripTags(entry.Author)
|
entry.Author = strings.TrimSpace(entry.Author)
|
||||||
|
|
||||||
if entry.URL == "" {
|
if entry.URL == "" {
|
||||||
entry.URL = feed.SiteURL
|
entry.URL = feed.SiteURL
|
||||||
|
@ -112,6 +111,7 @@ func (r *rssFeed) Transform() *model.Feed {
|
||||||
|
|
||||||
return feed
|
return feed
|
||||||
}
|
}
|
||||||
|
|
||||||
func (r *rssItem) GetDate() time.Time {
|
func (r *rssItem) GetDate() time.Time {
|
||||||
value := r.PubDate
|
value := r.PubDate
|
||||||
if r.Date != "" {
|
if r.Date != "" {
|
||||||
|
@ -170,11 +170,11 @@ func (r *rssItem) GetURL() string {
|
||||||
|
|
||||||
for _, link := range r.Links {
|
for _, link := range r.Links {
|
||||||
if link.XMLName.Space == "http://www.w3.org/2005/Atom" && link.Href != "" && isValidLinkRelation(link.Rel) {
|
if link.XMLName.Space == "http://www.w3.org/2005/Atom" && link.Href != "" && isValidLinkRelation(link.Rel) {
|
||||||
return link.Href
|
return strings.TrimSpace(link.Href)
|
||||||
}
|
}
|
||||||
|
|
||||||
if link.Data != "" {
|
if link.Data != "" {
|
||||||
return link.Data
|
return strings.TrimSpace(link.Data)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -212,7 +212,7 @@ func (r *rssItem) Transform() *model.Entry {
|
||||||
entry.Author = r.GetAuthor()
|
entry.Author = r.GetAuthor()
|
||||||
entry.Hash = r.GetHash()
|
entry.Hash = r.GetHash()
|
||||||
entry.Content = processor.ItemContentProcessor(entry.URL, r.GetContent())
|
entry.Content = processor.ItemContentProcessor(entry.URL, r.GetContent())
|
||||||
entry.Title = sanitizer.StripTags(strings.Trim(r.Title, " \n\t"))
|
entry.Title = strings.TrimSpace(r.Title)
|
||||||
entry.Enclosures = r.GetEnclosures()
|
entry.Enclosures = r.GetEnclosures()
|
||||||
|
|
||||||
if entry.Title == "" {
|
if entry.Title == "" {
|
||||||
|
|
Loading…
Reference in a new issue