Filter out invalid XML characters from UTF-8 XML documents before decoding

This change should reduce "illegal character code" XML errors.
This commit is contained in:
Jebbs 2019-12-20 10:31:52 +08:00 committed by Frédéric Guillot
parent a4ebb33cd5
commit a155ab6deb
2 changed files with 96 additions and 4 deletions

View file

@ -10,13 +10,25 @@ import (
"fmt"
"io"
"io/ioutil"
"strings"
"miniflux.app/reader/encoding"
)
// NewDecoder returns an XML decoder that filters illegal characters.
func NewDecoder(data io.Reader) *xml.Decoder {
decoder := xml.NewDecoder(data)
var decoder *xml.Decoder
buffer, _ := ioutil.ReadAll(data)
enc := procInst("encoding", string(buffer))
if enc != "" && enc != "utf-8" && enc != "UTF-8" && !strings.EqualFold(enc, "utf-8") {
// filter invalid chars later within decoder.CharsetReader
decoder = xml.NewDecoder(bytes.NewReader(buffer))
} else {
// filter invalid chars now, since decoder.CharsetReader not called for utf-8 content
filteredBytes := bytes.Map(filterValidXMLChar, buffer)
decoder = xml.NewDecoder(bytes.NewReader(filteredBytes))
}
decoder.Entity = xml.HTMLEntity
decoder.Strict = false
decoder.CharsetReader = func(charset string, input io.Reader) (io.Reader, error) {
@ -48,3 +60,28 @@ func filterValidXMLChar(r rune) rune {
}
return -1
}
// procInst extracts the quoted value of a pseudo-attribute written as
// `param="..."` or `param='...'` from s. It is adapted from the unexported
// helper of the same name in the encoding/xml package.
//
// An occurrence of `param=` that is not immediately followed by a single or
// double quote is skipped and the search continues with the rest of s, so
// `encoding=utf-7 encoding='utf-8'` yields "utf-8".
//
// It returns "" when no quoted occurrence exists, when s ends right after
// `param=`, or when the opening quote is never closed.
func procInst(param, s string) string {
	// TODO: this parsing is somewhat lame and not exact.
	// It works for all actual cases, though.
	key := param + "="
	rest := s
	for {
		pos := strings.Index(rest, key)
		if pos == -1 {
			return ""
		}
		// Drop everything up to and including `param=`; rest shrinks on
		// every iteration, which guarantees termination.
		rest = rest[pos+len(key):]
		if rest == "" {
			return ""
		}
		quote := rest[0]
		if quote != '\'' && quote != '"' {
			// Unquoted value: look for a later, properly quoted occurrence.
			continue
		}
		end := strings.IndexByte(rest[1:], quote)
		if end == -1 {
			return ""
		}
		// end is relative to rest[1:], so the value spans rest[1 : end+1].
		return rest[1 : end+1]
	}
}

View file

@ -11,19 +11,74 @@ import (
"testing"
)
func TestIllegalCharacters(t *testing.T) {
func TestUTF8WithIllegalCharacters(t *testing.T) {
// myxml is the minimal RSS shape this test decodes into: the <rss> root,
// its version attribute and the <title> child.
type myxml struct {
XMLName xml.Name `xml:"rss"`
Version string `xml:"version,attr"`
Title string `xml:"title"`
}
data := fmt.Sprintf(`<?xml version="1.0" encoding="windows-1251"?><rss version="2.0"><title>%s</title></rss>`, "\x10")
// expected is the title of the UTF-8 document without the \x10 byte that
// the document embeds between the Chinese characters.
expected := "Title & 中文标题"
data := fmt.Sprintf(`<?xml version="1.0" encoding="UTF-8"?><rss version="2.0"><title>Title & 中文%s标题</title></rss>`, "\x10")
reader := strings.NewReader(data)
var x myxml
decoder := NewDecoder(strings.NewReader(data))
decoder := NewDecoder(reader)
err := decoder.Decode(&x)
// Decoding must succeed even though the document contains the \x10 byte.
if err != nil {
t.Error(err)
return
}
// The decoded title must equal the input title minus the \x10 byte.
if x.Title != expected {
t.Errorf("Incorrect entry title, expected: %s, got: %s", expected, x.Title)
}
}
// TestWindows251WithIllegalCharacters decodes a document whose XML
// declaration names the windows-1251 encoding and whose title embeds a \x10
// byte. Decoding must succeed and the title must come back without that byte.
func TestWindows251WithIllegalCharacters(t *testing.T) {
	// feed is the minimal RSS shape needed to read the title.
	type feed struct {
		XMLName xml.Name `xml:"rss"`
		Version string   `xml:"version,attr"`
		Title   string   `xml:"title"`
	}

	const want = "Title & 中文标题"
	input := fmt.Sprintf(`<?xml version="1.0" encoding="windows-1251"?><rss version="2.0"><title>Title & 中文%s标题</title></rss>`, "\x10")

	var got feed
	if err := NewDecoder(strings.NewReader(input)).Decode(&got); err != nil {
		t.Error(err)
		return
	}

	if got.Title != want {
		t.Errorf("Incorrect entry title, expected: %s, got: %s", want, got.Title)
	}
}
// TestIllegalEncodingField decodes a document whose XML declaration carries
// the encoding label "invalid" and whose title embeds a \x10 byte. Decoding
// must succeed and the title must come back without that byte.
func TestIllegalEncodingField(t *testing.T) {
	// rssDoc is the minimal RSS shape needed to read the title.
	type rssDoc struct {
		XMLName xml.Name `xml:"rss"`
		Version string   `xml:"version,attr"`
		Title   string   `xml:"title"`
	}

	const wantTitle = "Title & 中文标题"
	payload := fmt.Sprintf(`<?xml version="1.0" encoding="invalid"?><rss version="2.0"><title>Title & 中文%s标题</title></rss>`, "\x10")

	var doc rssDoc
	dec := NewDecoder(strings.NewReader(payload))
	if err := dec.Decode(&doc); err != nil {
		t.Error(err)
		return
	}

	if doc.Title != wantTitle {
		t.Errorf("Incorrect entry title, expected: %s, got: %s", wantTitle, doc.Title)
	}
}