2017-11-20 06:10:04 +01:00
|
|
|
// Copyright 2017 Frédéric Guillot. All rights reserved.
|
|
|
|
// Use of this source code is governed by the Apache 2.0
|
|
|
|
// license that can be found in the LICENSE file.
|
|
|
|
|
2018-08-25 06:51:50 +02:00
|
|
|
package feed // import "miniflux.app/reader/feed"
|
2017-11-20 06:10:04 +01:00
|
|
|
|
|
|
|
import (
|
|
|
|
"bytes"
|
|
|
|
"encoding/xml"
|
2017-11-21 03:34:11 +01:00
|
|
|
"io"
|
|
|
|
"strings"
|
|
|
|
"time"
|
|
|
|
|
2018-08-25 06:51:50 +02:00
|
|
|
"miniflux.app/errors"
|
|
|
|
"miniflux.app/logger"
|
|
|
|
"miniflux.app/model"
|
|
|
|
"miniflux.app/reader/atom"
|
|
|
|
"miniflux.app/reader/encoding"
|
|
|
|
"miniflux.app/reader/json"
|
|
|
|
"miniflux.app/reader/rdf"
|
|
|
|
"miniflux.app/reader/rss"
|
|
|
|
"miniflux.app/timer"
|
2017-11-20 06:10:04 +01:00
|
|
|
)
|
|
|
|
|
2017-11-21 03:34:11 +01:00
|
|
|
// Identifiers of the feed formats that DetectFeedFormat can report.
const (
	// XML-based syndication formats.
	FormatAtom = "atom"
	FormatRDF  = "rdf"
	FormatRSS  = "rss"

	// FormatJSON identifies a JSON-based feed.
	FormatJSON = "json"

	// FormatUnknown is reported when no other format matches.
	FormatUnknown = "unknown"
)
|
|
|
|
|
2017-11-21 03:34:11 +01:00
|
|
|
// DetectFeedFormat detect feed format from input data.
|
2018-01-20 07:42:55 +01:00
|
|
|
func DetectFeedFormat(r io.Reader) string {
|
2018-01-03 04:15:08 +01:00
|
|
|
defer timer.ExecutionTime(time.Now(), "[Feed:DetectFeedFormat]")
|
2017-11-20 06:10:04 +01:00
|
|
|
|
|
|
|
var buffer bytes.Buffer
|
2018-01-20 07:42:55 +01:00
|
|
|
tee := io.TeeReader(r, &buffer)
|
2017-11-20 06:10:04 +01:00
|
|
|
|
|
|
|
decoder := xml.NewDecoder(tee)
|
2018-01-20 07:42:55 +01:00
|
|
|
decoder.CharsetReader = encoding.CharsetReader
|
2017-11-20 06:10:04 +01:00
|
|
|
|
|
|
|
for {
|
|
|
|
token, _ := decoder.Token()
|
|
|
|
if token == nil {
|
|
|
|
break
|
|
|
|
}
|
|
|
|
|
|
|
|
if element, ok := token.(xml.StartElement); ok {
|
|
|
|
switch element.Name.Local {
|
|
|
|
case "rss":
|
2017-11-21 03:34:11 +01:00
|
|
|
return FormatRSS
|
2017-11-20 06:10:04 +01:00
|
|
|
case "feed":
|
|
|
|
return FormatAtom
|
2017-11-21 03:34:11 +01:00
|
|
|
case "RDF":
|
|
|
|
return FormatRDF
|
2017-11-20 06:10:04 +01:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if strings.HasPrefix(strings.TrimSpace(buffer.String()), "{") {
|
2017-11-21 03:34:11 +01:00
|
|
|
return FormatJSON
|
2017-11-20 06:10:04 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
return FormatUnknown
|
|
|
|
}
|
|
|
|
|
2018-02-28 06:08:32 +01:00
|
|
|
func parseFeed(r io.Reader) (*model.Feed, *errors.LocalizedError) {
|
2018-01-03 04:15:08 +01:00
|
|
|
defer timer.ExecutionTime(time.Now(), "[Feed:ParseFeed]")
|
2017-11-20 06:10:04 +01:00
|
|
|
|
|
|
|
var buffer bytes.Buffer
|
2018-02-08 03:47:47 +01:00
|
|
|
size, _ := io.Copy(&buffer, r)
|
|
|
|
if size == 0 {
|
2018-09-22 03:53:29 +02:00
|
|
|
return nil, errors.NewLocalizedError(errEmptyFeed)
|
2018-02-08 03:47:47 +01:00
|
|
|
}
|
2017-11-20 06:10:04 +01:00
|
|
|
|
2018-02-08 05:57:56 +01:00
|
|
|
str := stripInvalidXMLCharacters(buffer.String())
|
|
|
|
reader := strings.NewReader(str)
|
2017-11-20 06:10:04 +01:00
|
|
|
format := DetectFeedFormat(reader)
|
|
|
|
reader.Seek(0, io.SeekStart)
|
|
|
|
|
|
|
|
switch format {
|
|
|
|
case FormatAtom:
|
|
|
|
return atom.Parse(reader)
|
2017-11-21 03:34:11 +01:00
|
|
|
case FormatRSS:
|
2017-11-20 06:10:04 +01:00
|
|
|
return rss.Parse(reader)
|
2017-11-21 03:34:11 +01:00
|
|
|
case FormatJSON:
|
2017-11-20 06:10:04 +01:00
|
|
|
return json.Parse(reader)
|
2017-11-21 03:34:11 +01:00
|
|
|
case FormatRDF:
|
|
|
|
return rdf.Parse(reader)
|
2017-11-20 06:10:04 +01:00
|
|
|
default:
|
2018-02-28 06:08:32 +01:00
|
|
|
return nil, errors.NewLocalizedError("Unsupported feed format")
|
2017-11-20 06:10:04 +01:00
|
|
|
}
|
|
|
|
}
|
2018-02-08 05:57:56 +01:00
|
|
|
|
|
|
|
func stripInvalidXMLCharacters(input string) string {
|
|
|
|
return strings.Map(func(r rune) rune {
|
|
|
|
if isInCharacterRange(r) {
|
|
|
|
return r
|
|
|
|
}
|
|
|
|
|
|
|
|
logger.Debug("Strip invalid XML characters: %U", r)
|
|
|
|
return -1
|
|
|
|
}, input)
|
|
|
|
}
|
|
|
|
|
|
|
|
// isInCharacterRange decides whether the given rune is in the XML Character
// Range, per the Char production of http://www.xml.com/axml/testaxml.htm,
// Section 2.2 Characters:
//
//	Char ::= #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF]
//
// The surrogate block U+D800..U+DFFF and the non-characters U+FFFE and U+FFFF
// are deliberately excluded, as are all control characters below U+0020 other
// than tab, line feed and carriage return.
func isInCharacterRange(r rune) (inrange bool) {
	return r == 0x09 ||
		r == 0x0A ||
		r == 0x0D ||
		// Upper bound is 0xD7FF (last code point before the surrogates),
		// not 0xDF77.
		r >= 0x20 && r <= 0xD7FF ||
		r >= 0xE000 && r <= 0xFFFD ||
		r >= 0x10000 && r <= 0x10FFFF
}
|