miniflux/reader/feed/parser.go

119 lines
2.7 KiB
Go
Raw Normal View History

2017-11-20 06:10:04 +01:00
// Copyright 2017 Frédéric Guillot. All rights reserved.
// Use of this source code is governed by the Apache 2.0
// license that can be found in the LICENSE file.
package feed
import (
"bytes"
"encoding/xml"
2017-11-21 03:34:11 +01:00
"io"
"strings"
"time"
"github.com/miniflux/miniflux/errors"
"github.com/miniflux/miniflux/logger"
2017-12-13 06:48:13 +01:00
"github.com/miniflux/miniflux/model"
"github.com/miniflux/miniflux/reader/atom"
"github.com/miniflux/miniflux/reader/encoding"
2017-12-13 06:48:13 +01:00
"github.com/miniflux/miniflux/reader/json"
"github.com/miniflux/miniflux/reader/rdf"
"github.com/miniflux/miniflux/reader/rss"
2018-01-03 04:15:08 +01:00
"github.com/miniflux/miniflux/timer"
2017-11-20 06:10:04 +01:00
)
2017-11-21 03:34:11 +01:00
// List of feed formats reported by DetectFeedFormat.
const (
	// FormatRDF is the identifier used for RDF feeds.
	FormatRDF = "rdf"
	// FormatRSS is the identifier used for RSS feeds.
	FormatRSS = "rss"
	// FormatAtom is the identifier used for Atom feeds.
	FormatAtom = "atom"
	// FormatJSON is the identifier used for JSON feeds.
	FormatJSON = "json"
	// FormatUnknown is the identifier used when the format is not recognized.
	FormatUnknown = "unknown"
)
2017-11-21 03:34:11 +01:00
// DetectFeedFormat detect feed format from input data.
func DetectFeedFormat(r io.Reader) string {
2018-01-03 04:15:08 +01:00
defer timer.ExecutionTime(time.Now(), "[Feed:DetectFeedFormat]")
2017-11-20 06:10:04 +01:00
var buffer bytes.Buffer
tee := io.TeeReader(r, &buffer)
2017-11-20 06:10:04 +01:00
decoder := xml.NewDecoder(tee)
decoder.CharsetReader = encoding.CharsetReader
2017-11-20 06:10:04 +01:00
for {
token, _ := decoder.Token()
if token == nil {
break
}
if element, ok := token.(xml.StartElement); ok {
switch element.Name.Local {
case "rss":
2017-11-21 03:34:11 +01:00
return FormatRSS
2017-11-20 06:10:04 +01:00
case "feed":
return FormatAtom
2017-11-21 03:34:11 +01:00
case "RDF":
return FormatRDF
2017-11-20 06:10:04 +01:00
}
}
}
if strings.HasPrefix(strings.TrimSpace(buffer.String()), "{") {
2017-11-21 03:34:11 +01:00
return FormatJSON
2017-11-20 06:10:04 +01:00
}
return FormatUnknown
}
func parseFeed(r io.Reader) (*model.Feed, *errors.LocalizedError) {
2018-01-03 04:15:08 +01:00
defer timer.ExecutionTime(time.Now(), "[Feed:ParseFeed]")
2017-11-20 06:10:04 +01:00
var buffer bytes.Buffer
size, _ := io.Copy(&buffer, r)
if size == 0 {
return nil, errors.NewLocalizedError("This feed is empty")
}
2017-11-20 06:10:04 +01:00
str := stripInvalidXMLCharacters(buffer.String())
reader := strings.NewReader(str)
2017-11-20 06:10:04 +01:00
format := DetectFeedFormat(reader)
reader.Seek(0, io.SeekStart)
switch format {
case FormatAtom:
return atom.Parse(reader)
2017-11-21 03:34:11 +01:00
case FormatRSS:
2017-11-20 06:10:04 +01:00
return rss.Parse(reader)
2017-11-21 03:34:11 +01:00
case FormatJSON:
2017-11-20 06:10:04 +01:00
return json.Parse(reader)
2017-11-21 03:34:11 +01:00
case FormatRDF:
return rdf.Parse(reader)
2017-11-20 06:10:04 +01:00
default:
return nil, errors.NewLocalizedError("Unsupported feed format")
2017-11-20 06:10:04 +01:00
}
}
// stripInvalidXMLCharacters returns a copy of input without the runes that
// isInCharacterRange rejects. Each removed rune is reported through the
// debug logger.
func stripInvalidXMLCharacters(input string) string {
	keepValid := func(c rune) rune {
		if !isInCharacterRange(c) {
			logger.Debug("Strip invalid XML characters: %U", c)
			// A negative result tells strings.Map to drop the rune.
			return -1
		}
		return c
	}

	return strings.Map(keepValid, input)
}
// isInCharacterRange reports whether the given rune is in the XML Character
// Range, per the Char production of http://www.xml.com/axml/testaxml.htm,
// Section 2.2 Characters:
//
//	Char ::= #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF]
//
// The second range ends at 0xD7FF so that the surrogate block
// (0xD800-0xDFFF) is rejected.
func isInCharacterRange(r rune) bool {
	return r == 0x09 ||
		r == 0x0A ||
		r == 0x0D ||
		r >= 0x20 && r <= 0xD7FF ||
		r >= 0xE000 && r <= 0xFFFD ||
		r >= 0x10000 && r <= 0x10FFFF
}