Filter out invalid XML characters from UTF-8 XML documents before decoding
This change should reduce "illegal character code" XML errors.
This commit is contained in:
parent
a4ebb33cd5
commit
a155ab6deb
2 changed files with 96 additions and 4 deletions
|
@ -10,13 +10,25 @@ import (
|
||||||
"fmt"
|
"fmt"
|
||||||
"io"
|
"io"
|
||||||
"io/ioutil"
|
"io/ioutil"
|
||||||
|
"strings"
|
||||||
|
|
||||||
"miniflux.app/reader/encoding"
|
"miniflux.app/reader/encoding"
|
||||||
)
|
)
|
||||||
|
|
||||||
// NewDecoder returns a XML decoder that filters illegal characters.
|
// NewDecoder returns a XML decoder that filters illegal characters.
|
||||||
func NewDecoder(data io.Reader) *xml.Decoder {
|
func NewDecoder(data io.Reader) *xml.Decoder {
|
||||||
decoder := xml.NewDecoder(data)
|
var decoder *xml.Decoder
|
||||||
|
buffer, _ := ioutil.ReadAll(data)
|
||||||
|
enc := procInst("encoding", string(buffer))
|
||||||
|
if enc != "" && enc != "utf-8" && enc != "UTF-8" && !strings.EqualFold(enc, "utf-8") {
|
||||||
|
// filter invalid chars later within decoder.CharsetReader
|
||||||
|
decoder = xml.NewDecoder(bytes.NewReader(buffer))
|
||||||
|
} else {
|
||||||
|
// filter invalid chars now, since decoder.CharsetReader not called for utf-8 content
|
||||||
|
filteredBytes := bytes.Map(filterValidXMLChar, buffer)
|
||||||
|
decoder = xml.NewDecoder(bytes.NewReader(filteredBytes))
|
||||||
|
}
|
||||||
|
|
||||||
decoder.Entity = xml.HTMLEntity
|
decoder.Entity = xml.HTMLEntity
|
||||||
decoder.Strict = false
|
decoder.Strict = false
|
||||||
decoder.CharsetReader = func(charset string, input io.Reader) (io.Reader, error) {
|
decoder.CharsetReader = func(charset string, input io.Reader) (io.Reader, error) {
|
||||||
|
@ -48,3 +60,28 @@ func filterValidXMLChar(r rune) rune {
|
||||||
}
|
}
|
||||||
return -1
|
return -1
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// procInst extracts the value of a `param="..."` or `param='...'`
// pseudo-attribute from s, returning "" if no quoted value is found.
//
// Adapted from the encoding/xml package. The scan does not stop at the first
// occurrence of `param=`: when that occurrence is not followed by a quote,
// the search continues so that a later, well-formed `param="..."` is still
// picked up.
func procInst(param, s string) string {
	// TODO: this parsing is somewhat lame and not exact.
	// It works for all actual cases, though.
	key := param + "="
	rest := s
	for {
		at := strings.Index(rest, key)
		if at == -1 {
			return ""
		}

		// Drop everything up to and including `param=`; each pass therefore
		// shortens rest, which guarantees termination.
		rest = rest[at+len(key):]
		if rest == "" {
			return ""
		}

		quote := rest[0]
		if quote != '\'' && quote != '"' {
			// Not a quoted value: try the next occurrence.
			continue
		}

		// The value ends at the next occurrence of the opening quote.
		end := strings.IndexByte(rest[1:], quote)
		if end == -1 {
			return ""
		}
		return rest[1 : end+1]
	}
}
|
||||||
|
|
|
@ -11,19 +11,74 @@ import (
|
||||||
"testing"
|
"testing"
|
||||||
)
|
)
|
||||||
|
|
||||||
func TestIllegalCharacters(t *testing.T) {
|
func TestUTF8WithIllegalCharacters(t *testing.T) {
|
||||||
type myxml struct {
|
type myxml struct {
|
||||||
XMLName xml.Name `xml:"rss"`
|
XMLName xml.Name `xml:"rss"`
|
||||||
Version string `xml:"version,attr"`
|
Version string `xml:"version,attr"`
|
||||||
Title string `xml:"title"`
|
Title string `xml:"title"`
|
||||||
}
|
}
|
||||||
|
|
||||||
data := fmt.Sprintf(`<?xml version="1.0" encoding="windows-1251"?><rss version="2.0"><title>%s</title></rss>`, "\x10")
|
expected := "Title & 中文标题"
|
||||||
|
data := fmt.Sprintf(`<?xml version="1.0" encoding="UTF-8"?><rss version="2.0"><title>Title & 中文%s标题</title></rss>`, "\x10")
|
||||||
|
reader := strings.NewReader(data)
|
||||||
|
|
||||||
var x myxml
|
var x myxml
|
||||||
|
|
||||||
decoder := NewDecoder(strings.NewReader(data))
|
decoder := NewDecoder(reader)
|
||||||
err := decoder.Decode(&x)
|
err := decoder.Decode(&x)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
t.Error(err)
|
t.Error(err)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
if x.Title != expected {
|
||||||
|
t.Errorf("Incorrect entry title, expected: %s, got: %s", expected, x.Title)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestWindows251WithIllegalCharacters(t *testing.T) {
|
||||||
|
type myxml struct {
|
||||||
|
XMLName xml.Name `xml:"rss"`
|
||||||
|
Version string `xml:"version,attr"`
|
||||||
|
Title string `xml:"title"`
|
||||||
|
}
|
||||||
|
|
||||||
|
expected := "Title & 中文标题"
|
||||||
|
data := fmt.Sprintf(`<?xml version="1.0" encoding="windows-1251"?><rss version="2.0"><title>Title & 中文%s标题</title></rss>`, "\x10")
|
||||||
|
reader := strings.NewReader(data)
|
||||||
|
|
||||||
|
var x myxml
|
||||||
|
|
||||||
|
decoder := NewDecoder(reader)
|
||||||
|
err := decoder.Decode(&x)
|
||||||
|
if err != nil {
|
||||||
|
t.Error(err)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
if x.Title != expected {
|
||||||
|
t.Errorf("Incorrect entry title, expected: %s, got: %s", expected, x.Title)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestIllegalEncodingField(t *testing.T) {
|
||||||
|
type myxml struct {
|
||||||
|
XMLName xml.Name `xml:"rss"`
|
||||||
|
Version string `xml:"version,attr"`
|
||||||
|
Title string `xml:"title"`
|
||||||
|
}
|
||||||
|
|
||||||
|
expected := "Title & 中文标题"
|
||||||
|
data := fmt.Sprintf(`<?xml version="1.0" encoding="invalid"?><rss version="2.0"><title>Title & 中文%s标题</title></rss>`, "\x10")
|
||||||
|
reader := strings.NewReader(data)
|
||||||
|
|
||||||
|
var x myxml
|
||||||
|
|
||||||
|
decoder := NewDecoder(reader)
|
||||||
|
err := decoder.Decode(&x)
|
||||||
|
if err != nil {
|
||||||
|
t.Error(err)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
if x.Title != expected {
|
||||||
|
t.Errorf("Incorrect entry title, expected: %s, got: %s", expected, x.Title)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in a new issue