Simplify feed parser and format detection
- Avoid doing multiple buffer copies - Move parser and format detection logic to its own package
This commit is contained in:
parent
d5ff4191b6
commit
5870f04260
11 changed files with 229 additions and 221 deletions
|
@ -6,6 +6,7 @@ package client // import "miniflux.app/http/client"
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"io"
|
"io"
|
||||||
|
"io/ioutil"
|
||||||
"mime"
|
"mime"
|
||||||
"strings"
|
"strings"
|
||||||
|
|
||||||
|
@ -56,23 +57,32 @@ func (r *Response) IsModified(etag, lastModified string) bool {
|
||||||
return true
|
return true
|
||||||
}
|
}
|
||||||
|
|
||||||
// NormalizeBodyEncoding make sure the body is encoded in UTF-8.
|
// EnsureUnicodeBody makes sure the body is encoded in UTF-8.
|
||||||
//
|
//
|
||||||
// If a charset other than UTF-8 is detected, we convert the document to UTF-8.
|
// If a charset other than UTF-8 is detected, we convert the document to UTF-8.
|
||||||
// This is used by the scraper and feed readers.
|
// This is used by the scraper and feed readers.
|
||||||
//
|
//
|
||||||
// Do not forget edge cases:
|
// Do not forget edge cases:
|
||||||
// - Some non-utf8 feeds specify encoding only in Content-Type, not in XML document.
|
// - Some non-utf8 feeds specify encoding only in Content-Type, not in XML document.
|
||||||
func (r *Response) NormalizeBodyEncoding() (io.Reader, error) {
|
func (r *Response) EnsureUnicodeBody() error {
|
||||||
_, params, err := mime.ParseMediaType(r.ContentType)
|
_, params, err := mime.ParseMediaType(r.ContentType)
|
||||||
if err == nil {
|
if err == nil {
|
||||||
if enc, found := params["charset"]; found {
|
if enc, found := params["charset"]; found {
|
||||||
enc = strings.ToLower(enc)
|
enc = strings.ToLower(enc)
|
||||||
if enc != "utf-8" && enc != "utf8" && enc != "" {
|
if enc != "utf-8" && enc != "utf8" && enc != "" {
|
||||||
logger.Debug("[NormalizeBodyEncoding] Convert body to UTF-8 from %s", enc)
|
logger.Debug("[EnsureUnicodeBody] Convert body to utf-8 from %s", enc)
|
||||||
return charset.NewReader(r.Body, r.ContentType)
|
r.Body, err = charset.NewReader(r.Body, r.ContentType)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return r.Body, nil
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// String returns the response body as string.
|
||||||
|
func (r *Response) String() string {
|
||||||
|
bytes, _ := ioutil.ReadAll(r.Body)
|
||||||
|
return string(bytes)
|
||||||
}
|
}
|
||||||
|
|
|
@ -4,7 +4,7 @@
|
||||||
|
|
||||||
/*
|
/*
|
||||||
|
|
||||||
Package feed provides a generic feed parser that abstracts all different formats.
|
Package feed handles feed updates and creation.
|
||||||
|
|
||||||
*/
|
*/
|
||||||
package feed // import "miniflux.app/reader/feed"
|
package feed // import "miniflux.app/reader/feed"
|
||||||
|
|
|
@ -14,6 +14,7 @@ import (
|
||||||
"miniflux.app/logger"
|
"miniflux.app/logger"
|
||||||
"miniflux.app/model"
|
"miniflux.app/model"
|
||||||
"miniflux.app/reader/icon"
|
"miniflux.app/reader/icon"
|
||||||
|
"miniflux.app/reader/parser"
|
||||||
"miniflux.app/reader/processor"
|
"miniflux.app/reader/processor"
|
||||||
"miniflux.app/storage"
|
"miniflux.app/storage"
|
||||||
"miniflux.app/timer"
|
"miniflux.app/timer"
|
||||||
|
@ -67,12 +68,11 @@ func (h *Handler) CreateFeed(userID, categoryID int64, url string, crawler bool,
|
||||||
return nil, errors.NewLocalizedError(errDuplicate, response.EffectiveURL)
|
return nil, errors.NewLocalizedError(errDuplicate, response.EffectiveURL)
|
||||||
}
|
}
|
||||||
|
|
||||||
body, err := response.NormalizeBodyEncoding()
|
if err := response.EnsureUnicodeBody(); err != nil {
|
||||||
if err != nil {
|
|
||||||
return nil, errors.NewLocalizedError(errEncoding, err)
|
return nil, errors.NewLocalizedError(errEncoding, err)
|
||||||
}
|
}
|
||||||
|
|
||||||
subscription, feedErr := parseFeed(body)
|
subscription, feedErr := parser.ParseFeed(response.String())
|
||||||
if feedErr != nil {
|
if feedErr != nil {
|
||||||
return nil, feedErr
|
return nil, feedErr
|
||||||
}
|
}
|
||||||
|
@ -183,12 +183,11 @@ func (h *Handler) RefreshFeed(userID, feedID int64) error {
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
|
|
||||||
body, err := response.NormalizeBodyEncoding()
|
if err := response.EnsureUnicodeBody(); err != nil {
|
||||||
if err != nil {
|
|
||||||
return errors.NewLocalizedError(errEncoding, err)
|
return errors.NewLocalizedError(errEncoding, err)
|
||||||
}
|
}
|
||||||
|
|
||||||
subscription, parseErr := parseFeed(body)
|
subscription, parseErr := parser.ParseFeed(response.String())
|
||||||
if parseErr != nil {
|
if parseErr != nil {
|
||||||
originalFeed.ParsingErrorCount++
|
originalFeed.ParsingErrorCount++
|
||||||
originalFeed.ParsingErrorMsg = parseErr.Localize(printer)
|
originalFeed.ParsingErrorMsg = parseErr.Localize(printer)
|
||||||
|
|
|
@ -1,118 +0,0 @@
|
||||||
// Copyright 2017 Frédéric Guillot. All rights reserved.
|
|
||||||
// Use of this source code is governed by the Apache 2.0
|
|
||||||
// license that can be found in the LICENSE file.
|
|
||||||
|
|
||||||
package feed // import "miniflux.app/reader/feed"
|
|
||||||
|
|
||||||
import (
|
|
||||||
"bytes"
|
|
||||||
"encoding/xml"
|
|
||||||
"io"
|
|
||||||
"strings"
|
|
||||||
"time"
|
|
||||||
|
|
||||||
"miniflux.app/errors"
|
|
||||||
"miniflux.app/logger"
|
|
||||||
"miniflux.app/model"
|
|
||||||
"miniflux.app/reader/atom"
|
|
||||||
"miniflux.app/reader/encoding"
|
|
||||||
"miniflux.app/reader/json"
|
|
||||||
"miniflux.app/reader/rdf"
|
|
||||||
"miniflux.app/reader/rss"
|
|
||||||
"miniflux.app/timer"
|
|
||||||
)
|
|
||||||
|
|
||||||
// List of feed formats.
|
|
||||||
const (
|
|
||||||
FormatRDF = "rdf"
|
|
||||||
FormatRSS = "rss"
|
|
||||||
FormatAtom = "atom"
|
|
||||||
FormatJSON = "json"
|
|
||||||
FormatUnknown = "unknown"
|
|
||||||
)
|
|
||||||
|
|
||||||
// DetectFeedFormat detect feed format from input data.
|
|
||||||
func DetectFeedFormat(r io.Reader) string {
|
|
||||||
defer timer.ExecutionTime(time.Now(), "[Feed:DetectFeedFormat]")
|
|
||||||
|
|
||||||
var buffer bytes.Buffer
|
|
||||||
tee := io.TeeReader(r, &buffer)
|
|
||||||
|
|
||||||
decoder := xml.NewDecoder(tee)
|
|
||||||
decoder.CharsetReader = encoding.CharsetReader
|
|
||||||
|
|
||||||
for {
|
|
||||||
token, _ := decoder.Token()
|
|
||||||
if token == nil {
|
|
||||||
break
|
|
||||||
}
|
|
||||||
|
|
||||||
if element, ok := token.(xml.StartElement); ok {
|
|
||||||
switch element.Name.Local {
|
|
||||||
case "rss":
|
|
||||||
return FormatRSS
|
|
||||||
case "feed":
|
|
||||||
return FormatAtom
|
|
||||||
case "RDF":
|
|
||||||
return FormatRDF
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if strings.HasPrefix(strings.TrimSpace(buffer.String()), "{") {
|
|
||||||
return FormatJSON
|
|
||||||
}
|
|
||||||
|
|
||||||
return FormatUnknown
|
|
||||||
}
|
|
||||||
|
|
||||||
func parseFeed(r io.Reader) (*model.Feed, *errors.LocalizedError) {
|
|
||||||
defer timer.ExecutionTime(time.Now(), "[Feed:ParseFeed]")
|
|
||||||
|
|
||||||
var buffer bytes.Buffer
|
|
||||||
size, _ := io.Copy(&buffer, r)
|
|
||||||
if size == 0 {
|
|
||||||
return nil, errors.NewLocalizedError(errEmptyFeed)
|
|
||||||
}
|
|
||||||
|
|
||||||
str := stripInvalidXMLCharacters(buffer.String())
|
|
||||||
reader := strings.NewReader(str)
|
|
||||||
format := DetectFeedFormat(reader)
|
|
||||||
reader.Seek(0, io.SeekStart)
|
|
||||||
|
|
||||||
switch format {
|
|
||||||
case FormatAtom:
|
|
||||||
return atom.Parse(reader)
|
|
||||||
case FormatRSS:
|
|
||||||
return rss.Parse(reader)
|
|
||||||
case FormatJSON:
|
|
||||||
return json.Parse(reader)
|
|
||||||
case FormatRDF:
|
|
||||||
return rdf.Parse(reader)
|
|
||||||
default:
|
|
||||||
return nil, errors.NewLocalizedError("Unsupported feed format")
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
func stripInvalidXMLCharacters(input string) string {
|
|
||||||
return strings.Map(func(r rune) rune {
|
|
||||||
if isInCharacterRange(r) {
|
|
||||||
return r
|
|
||||||
}
|
|
||||||
|
|
||||||
logger.Debug("Strip invalid XML characters: %U", r)
|
|
||||||
return -1
|
|
||||||
}, input)
|
|
||||||
}
|
|
||||||
|
|
||||||
// Decide whether the given rune is in the XML Character Range, per
|
|
||||||
// the Char production of http://www.xml.com/axml/testaxml.htm,
|
|
||||||
// Section 2.2 Characters.
|
|
||||||
func isInCharacterRange(r rune) (inrange bool) {
|
|
||||||
return r == 0x09 ||
|
|
||||||
r == 0x0A ||
|
|
||||||
r == 0x0D ||
|
|
||||||
r >= 0x20 && r <= 0xDF77 ||
|
|
||||||
r >= 0xE000 && r <= 0xFFFD ||
|
|
||||||
r >= 0x10000 && r <= 0x10FFFF
|
|
||||||
}
|
|
10
reader/parser/doc.go
Normal file
10
reader/parser/doc.go
Normal file
|
@ -0,0 +1,10 @@
|
||||||
|
// Copyright 2018 Frédéric Guillot. All rights reserved.
|
||||||
|
// Use of this source code is governed by the Apache 2.0
|
||||||
|
// license that can be found in the LICENSE file.
|
||||||
|
|
||||||
|
/*
|
||||||
|
|
||||||
|
Package parser provides a generic feed parser that abstracts all different formats.
|
||||||
|
|
||||||
|
*/
|
||||||
|
package parser // import "miniflux.app/reader/parser"
|
51
reader/parser/format.go
Normal file
51
reader/parser/format.go
Normal file
|
@ -0,0 +1,51 @@
|
||||||
|
// Copyright 2018 Frédéric Guillot. All rights reserved.
|
||||||
|
// Use of this source code is governed by the Apache 2.0
|
||||||
|
// license that can be found in the LICENSE file.
|
||||||
|
|
||||||
|
package parser // import "miniflux.app/reader/parser"
|
||||||
|
|
||||||
|
import (
|
||||||
|
"encoding/xml"
|
||||||
|
"strings"
|
||||||
|
|
||||||
|
"miniflux.app/reader/encoding"
|
||||||
|
)
|
||||||
|
|
||||||
|
// List of feed formats.
|
||||||
|
const (
|
||||||
|
FormatRDF = "rdf"
|
||||||
|
FormatRSS = "rss"
|
||||||
|
FormatAtom = "atom"
|
||||||
|
FormatJSON = "json"
|
||||||
|
FormatUnknown = "unknown"
|
||||||
|
)
|
||||||
|
|
||||||
|
// DetectFeedFormat tries to guess the feed format from input data.
|
||||||
|
func DetectFeedFormat(data string) string {
|
||||||
|
if strings.HasPrefix(strings.TrimSpace(data), "{") {
|
||||||
|
return FormatJSON
|
||||||
|
}
|
||||||
|
|
||||||
|
decoder := xml.NewDecoder(strings.NewReader(data))
|
||||||
|
decoder.CharsetReader = encoding.CharsetReader
|
||||||
|
|
||||||
|
for {
|
||||||
|
token, _ := decoder.Token()
|
||||||
|
if token == nil {
|
||||||
|
break
|
||||||
|
}
|
||||||
|
|
||||||
|
if element, ok := token.(xml.StartElement); ok {
|
||||||
|
switch element.Name.Local {
|
||||||
|
case "rss":
|
||||||
|
return FormatRSS
|
||||||
|
case "feed":
|
||||||
|
return FormatAtom
|
||||||
|
case "RDF":
|
||||||
|
return FormatRDF
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return FormatUnknown
|
||||||
|
}
|
70
reader/parser/format_test.go
Normal file
70
reader/parser/format_test.go
Normal file
|
@ -0,0 +1,70 @@
|
||||||
|
// Copyright 2018 Frédéric Guillot. All rights reserved.
|
||||||
|
// Use of this source code is governed by the Apache 2.0
|
||||||
|
// license that can be found in the LICENSE file.
|
||||||
|
|
||||||
|
package parser // import "miniflux.app/reader/parser"
|
||||||
|
|
||||||
|
import (
|
||||||
|
"testing"
|
||||||
|
)
|
||||||
|
|
||||||
|
func TestDetectRDF(t *testing.T) {
|
||||||
|
data := `<?xml version="1.0"?><rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns="http://my.netscape.com/rdf/simple/0.9/"></rdf:RDF>`
|
||||||
|
format := DetectFeedFormat(data)
|
||||||
|
|
||||||
|
if format != FormatRDF {
|
||||||
|
t.Errorf(`Wrong format detected: %q instead of %q`, format, FormatRDF)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestDetectRSS(t *testing.T) {
|
||||||
|
data := `<?xml version="1.0"?><rss version="2.0"><channel></channel></rss>`
|
||||||
|
format := DetectFeedFormat(data)
|
||||||
|
|
||||||
|
if format != FormatRSS {
|
||||||
|
t.Errorf(`Wrong format detected: %q instead of %q`, format, FormatRSS)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestDetectAtom(t *testing.T) {
|
||||||
|
data := `<?xml version="1.0" encoding="utf-8"?><feed xmlns="http://www.w3.org/2005/Atom"></feed>`
|
||||||
|
format := DetectFeedFormat(data)
|
||||||
|
|
||||||
|
if format != FormatAtom {
|
||||||
|
t.Errorf(`Wrong format detected: %q instead of %q`, format, FormatAtom)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestDetectAtomWithISOCharset(t *testing.T) {
|
||||||
|
data := `<?xml version="1.0" encoding="ISO-8859-15"?><feed xmlns="http://www.w3.org/2005/Atom"></feed>`
|
||||||
|
format := DetectFeedFormat(data)
|
||||||
|
|
||||||
|
if format != FormatAtom {
|
||||||
|
t.Errorf(`Wrong format detected: %q instead of %q`, format, FormatAtom)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestDetectJSON(t *testing.T) {
|
||||||
|
data := `
|
||||||
|
{
|
||||||
|
"version" : "https://jsonfeed.org/version/1",
|
||||||
|
"title" : "Example"
|
||||||
|
}
|
||||||
|
`
|
||||||
|
format := DetectFeedFormat(data)
|
||||||
|
|
||||||
|
if format != FormatJSON {
|
||||||
|
t.Errorf(`Wrong format detected: %q instead of %q`, format, FormatJSON)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestDetectUnknown(t *testing.T) {
|
||||||
|
data := `
|
||||||
|
<!DOCTYPE html> <html> </html>
|
||||||
|
`
|
||||||
|
format := DetectFeedFormat(data)
|
||||||
|
|
||||||
|
if format != FormatUnknown {
|
||||||
|
t.Errorf(`Wrong format detected: %q instead of %q`, format, FormatUnknown)
|
||||||
|
}
|
||||||
|
}
|
58
reader/parser/parser.go
Normal file
58
reader/parser/parser.go
Normal file
|
@ -0,0 +1,58 @@
|
||||||
|
// Copyright 2018 Frédéric Guillot. All rights reserved.
|
||||||
|
// Use of this source code is governed by the Apache 2.0
|
||||||
|
// license that can be found in the LICENSE file.
|
||||||
|
|
||||||
|
package parser // import "miniflux.app/reader/parser"
|
||||||
|
|
||||||
|
import (
|
||||||
|
"strings"
|
||||||
|
|
||||||
|
"miniflux.app/errors"
|
||||||
|
"miniflux.app/logger"
|
||||||
|
"miniflux.app/model"
|
||||||
|
"miniflux.app/reader/atom"
|
||||||
|
"miniflux.app/reader/json"
|
||||||
|
"miniflux.app/reader/rdf"
|
||||||
|
"miniflux.app/reader/rss"
|
||||||
|
)
|
||||||
|
|
||||||
|
// ParseFeed analyzes the input data and returns a normalized feed object.
|
||||||
|
func ParseFeed(data string) (*model.Feed, *errors.LocalizedError) {
|
||||||
|
data = stripInvalidXMLCharacters(data)
|
||||||
|
|
||||||
|
switch DetectFeedFormat(data) {
|
||||||
|
case FormatAtom:
|
||||||
|
return atom.Parse(strings.NewReader(data))
|
||||||
|
case FormatRSS:
|
||||||
|
return rss.Parse(strings.NewReader(data))
|
||||||
|
case FormatJSON:
|
||||||
|
return json.Parse(strings.NewReader(data))
|
||||||
|
case FormatRDF:
|
||||||
|
return rdf.Parse(strings.NewReader(data))
|
||||||
|
default:
|
||||||
|
return nil, errors.NewLocalizedError("Unsupported feed format")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func stripInvalidXMLCharacters(input string) string {
|
||||||
|
return strings.Map(func(r rune) rune {
|
||||||
|
if isInCharacterRange(r) {
|
||||||
|
return r
|
||||||
|
}
|
||||||
|
|
||||||
|
logger.Debug("Strip invalid XML characters: %U", r)
|
||||||
|
return -1
|
||||||
|
}, input)
|
||||||
|
}
|
||||||
|
|
||||||
|
// isInCharacterRange reports whether the given rune is in the XML Character
// Range, per the Char production of http://www.xml.com/axml/testaxml.htm,
// Section 2.2 Characters:
//
//	Char ::= #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF]
//
// The second range ends at 0xD7FF, just below the surrogate block
// (0xD800-0xDFFF), which is not valid in XML.
func isInCharacterRange(r rune) (inrange bool) {
	return r == 0x09 ||
		r == 0x0A ||
		r == 0x0D ||
		r >= 0x20 && r <= 0xD7FF ||
		r >= 0xE000 && r <= 0xFFFD ||
		r >= 0x10000 && r <= 0x10FFFF
}
|
|
@ -2,74 +2,12 @@
|
||||||
// Use of this source code is governed by the Apache 2.0
|
// Use of this source code is governed by the Apache 2.0
|
||||||
// license that can be found in the LICENSE file.
|
// license that can be found in the LICENSE file.
|
||||||
|
|
||||||
package feed // import "miniflux.app/reader/feed"
|
package parser // import "miniflux.app/reader/parser"
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"bytes"
|
|
||||||
"testing"
|
"testing"
|
||||||
)
|
)
|
||||||
|
|
||||||
func TestDetectRDF(t *testing.T) {
|
|
||||||
data := `<?xml version="1.0"?><rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns="http://my.netscape.com/rdf/simple/0.9/"></rdf:RDF>`
|
|
||||||
format := DetectFeedFormat(bytes.NewBufferString(data))
|
|
||||||
|
|
||||||
if format != FormatRDF {
|
|
||||||
t.Errorf("Wrong format detected: %s instead of %s", format, FormatRDF)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
func TestDetectRSS(t *testing.T) {
|
|
||||||
data := `<?xml version="1.0"?><rss version="2.0"><channel></channel></rss>`
|
|
||||||
format := DetectFeedFormat(bytes.NewBufferString(data))
|
|
||||||
|
|
||||||
if format != FormatRSS {
|
|
||||||
t.Errorf("Wrong format detected: %s instead of %s", format, FormatRSS)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
func TestDetectAtom(t *testing.T) {
|
|
||||||
data := `<?xml version="1.0" encoding="utf-8"?><feed xmlns="http://www.w3.org/2005/Atom"></feed>`
|
|
||||||
format := DetectFeedFormat(bytes.NewBufferString(data))
|
|
||||||
|
|
||||||
if format != FormatAtom {
|
|
||||||
t.Errorf("Wrong format detected: %s instead of %s", format, FormatAtom)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
func TestDetectAtomWithISOCharset(t *testing.T) {
|
|
||||||
data := `<?xml version="1.0" encoding="ISO-8859-15"?><feed xmlns="http://www.w3.org/2005/Atom"></feed>`
|
|
||||||
format := DetectFeedFormat(bytes.NewBufferString(data))
|
|
||||||
|
|
||||||
if format != FormatAtom {
|
|
||||||
t.Errorf("Wrong format detected: %s instead of %s", format, FormatAtom)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
func TestDetectJSON(t *testing.T) {
|
|
||||||
data := `
|
|
||||||
{
|
|
||||||
"version" : "https://jsonfeed.org/version/1",
|
|
||||||
"title" : "Example"
|
|
||||||
}
|
|
||||||
`
|
|
||||||
format := DetectFeedFormat(bytes.NewBufferString(data))
|
|
||||||
|
|
||||||
if format != FormatJSON {
|
|
||||||
t.Errorf("Wrong format detected: %s instead of %s", format, FormatJSON)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
func TestDetectUnknown(t *testing.T) {
|
|
||||||
data := `
|
|
||||||
<!DOCTYPE html> <html> </html>
|
|
||||||
`
|
|
||||||
format := DetectFeedFormat(bytes.NewBufferString(data))
|
|
||||||
|
|
||||||
if format != FormatUnknown {
|
|
||||||
t.Errorf("Wrong format detected: %s instead of %s", format, FormatUnknown)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
func TestParseAtom(t *testing.T) {
|
func TestParseAtom(t *testing.T) {
|
||||||
data := `<?xml version="1.0" encoding="utf-8"?>
|
data := `<?xml version="1.0" encoding="utf-8"?>
|
||||||
<feed xmlns="http://www.w3.org/2005/Atom">
|
<feed xmlns="http://www.w3.org/2005/Atom">
|
||||||
|
@ -92,7 +30,7 @@ func TestParseAtom(t *testing.T) {
|
||||||
|
|
||||||
</feed>`
|
</feed>`
|
||||||
|
|
||||||
feed, err := parseFeed(bytes.NewBufferString(data))
|
feed, err := ParseFeed(data)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
t.Error(err)
|
t.Error(err)
|
||||||
}
|
}
|
||||||
|
@ -118,7 +56,7 @@ func TestParseRSS(t *testing.T) {
|
||||||
</channel>
|
</channel>
|
||||||
</rss>`
|
</rss>`
|
||||||
|
|
||||||
feed, err := parseFeed(bytes.NewBufferString(data))
|
feed, err := ParseFeed(data)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
t.Error(err)
|
t.Error(err)
|
||||||
}
|
}
|
||||||
|
@ -147,7 +85,7 @@ func TestParseRDF(t *testing.T) {
|
||||||
</item>
|
</item>
|
||||||
</rdf:RDF>`
|
</rdf:RDF>`
|
||||||
|
|
||||||
feed, err := parseFeed(bytes.NewBufferString(data))
|
feed, err := ParseFeed(data)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
t.Error(err)
|
t.Error(err)
|
||||||
}
|
}
|
||||||
|
@ -177,7 +115,7 @@ func TestParseJson(t *testing.T) {
|
||||||
]
|
]
|
||||||
}`
|
}`
|
||||||
|
|
||||||
feed, err := parseFeed(bytes.NewBufferString(data))
|
feed, err := ParseFeed(data)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
t.Error(err)
|
t.Error(err)
|
||||||
}
|
}
|
||||||
|
@ -200,14 +138,14 @@ func TestParseUnknownFeed(t *testing.T) {
|
||||||
</html>
|
</html>
|
||||||
`
|
`
|
||||||
|
|
||||||
_, err := parseFeed(bytes.NewBufferString(data))
|
_, err := ParseFeed(data)
|
||||||
if err == nil {
|
if err == nil {
|
||||||
t.Error("ParseFeed must returns an error")
|
t.Error("ParseFeed must returns an error")
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func TestParseEmptyFeed(t *testing.T) {
|
func TestParseEmptyFeed(t *testing.T) {
|
||||||
_, err := parseFeed(bytes.NewBufferString(""))
|
_, err := ParseFeed("")
|
||||||
if err == nil {
|
if err == nil {
|
||||||
t.Error("ParseFeed must returns an error")
|
t.Error("ParseFeed must returns an error")
|
||||||
}
|
}
|
|
@ -18,7 +18,7 @@ import (
|
||||||
"github.com/PuerkitoBio/goquery"
|
"github.com/PuerkitoBio/goquery"
|
||||||
)
|
)
|
||||||
|
|
||||||
// Fetch downloads a web page a returns relevant contents.
|
// Fetch downloads a web page and returns relevant contents.
|
||||||
func Fetch(websiteURL, rules, userAgent string) (string, error) {
|
func Fetch(websiteURL, rules, userAgent string) (string, error) {
|
||||||
clt := client.New(websiteURL)
|
clt := client.New(websiteURL)
|
||||||
if userAgent != "" {
|
if userAgent != "" {
|
||||||
|
@ -38,8 +38,7 @@ func Fetch(websiteURL, rules, userAgent string) (string, error) {
|
||||||
return "", fmt.Errorf("scraper: this resource is not a HTML document (%s)", response.ContentType)
|
return "", fmt.Errorf("scraper: this resource is not a HTML document (%s)", response.ContentType)
|
||||||
}
|
}
|
||||||
|
|
||||||
page, err := response.NormalizeBodyEncoding()
|
if err = response.EnsureUnicodeBody(); err != nil {
|
||||||
if err != nil {
|
|
||||||
return "", err
|
return "", err
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -52,11 +51,11 @@ func Fetch(websiteURL, rules, userAgent string) (string, error) {
|
||||||
|
|
||||||
var content string
|
var content string
|
||||||
if rules != "" {
|
if rules != "" {
|
||||||
logger.Debug(`[Scraper] Using rules "%s" for "%s"`, rules, websiteURL)
|
logger.Debug(`[Scraper] Using rules %q for %q`, rules, websiteURL)
|
||||||
content, err = scrapContent(page, rules)
|
content, err = scrapContent(response.Body, rules)
|
||||||
} else {
|
} else {
|
||||||
logger.Debug(`[Scraper] Using readability for "%s"`, websiteURL)
|
logger.Debug(`[Scraper] Using readability for "%q`, websiteURL)
|
||||||
content, err = readability.ExtractContent(page)
|
content, err = readability.ExtractContent(response.Body)
|
||||||
}
|
}
|
||||||
|
|
||||||
if err != nil {
|
if err != nil {
|
||||||
|
|
|
@ -5,15 +5,15 @@
|
||||||
package subscription // import "miniflux.app/reader/subscription"
|
package subscription // import "miniflux.app/reader/subscription"
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"bytes"
|
|
||||||
"fmt"
|
"fmt"
|
||||||
"io"
|
"io"
|
||||||
|
"strings"
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
"miniflux.app/errors"
|
"miniflux.app/errors"
|
||||||
"miniflux.app/http/client"
|
"miniflux.app/http/client"
|
||||||
"miniflux.app/logger"
|
"miniflux.app/logger"
|
||||||
"miniflux.app/reader/feed"
|
"miniflux.app/reader/parser"
|
||||||
"miniflux.app/timer"
|
"miniflux.app/timer"
|
||||||
"miniflux.app/url"
|
"miniflux.app/url"
|
||||||
|
|
||||||
|
@ -56,20 +56,12 @@ func FindSubscriptions(websiteURL, userAgent, username, password string) (Subscr
|
||||||
return nil, errors.NewLocalizedError(errEmptyBody)
|
return nil, errors.NewLocalizedError(errEmptyBody)
|
||||||
}
|
}
|
||||||
|
|
||||||
body, err := response.NormalizeBodyEncoding()
|
if err := response.EnsureUnicodeBody(); err != nil {
|
||||||
if err != nil {
|
|
||||||
return nil, err
|
return nil, err
|
||||||
}
|
}
|
||||||
|
|
||||||
var buffer bytes.Buffer
|
body := response.String()
|
||||||
size, _ := io.Copy(&buffer, body)
|
if format := parser.DetectFeedFormat(body); format != parser.FormatUnknown {
|
||||||
if size == 0 {
|
|
||||||
return nil, errors.NewLocalizedError(errEmptyBody)
|
|
||||||
}
|
|
||||||
|
|
||||||
reader := bytes.NewReader(buffer.Bytes())
|
|
||||||
|
|
||||||
if format := feed.DetectFeedFormat(reader); format != feed.FormatUnknown {
|
|
||||||
var subscriptions Subscriptions
|
var subscriptions Subscriptions
|
||||||
subscriptions = append(subscriptions, &Subscription{
|
subscriptions = append(subscriptions, &Subscription{
|
||||||
Title: response.EffectiveURL,
|
Title: response.EffectiveURL,
|
||||||
|
@ -80,8 +72,7 @@ func FindSubscriptions(websiteURL, userAgent, username, password string) (Subscr
|
||||||
return subscriptions, nil
|
return subscriptions, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
reader.Seek(0, io.SeekStart)
|
return parseDocument(response.EffectiveURL, strings.NewReader(body))
|
||||||
return parseDocument(response.EffectiveURL, bytes.NewReader(buffer.Bytes()))
|
|
||||||
}
|
}
|
||||||
|
|
||||||
func parseDocument(websiteURL string, data io.Reader) (Subscriptions, error) {
|
func parseDocument(websiteURL string, data io.Reader) (Subscriptions, error) {
|
||||||
|
|
Loading…
Reference in a new issue