Make UTF-8 the default encoding for XML feeds
Consider the feed http://planet.haskell.org/atom.xml:

- It is a UTF-8 encoded XML file.
- There is no encoding declaration in the XML header.
- There is no Unicode byte order mark.
- It is served with the HTTP Content-Type "text/xml" (no charset parameter).

Miniflux lets charset.NewReader handle this case. The charset package implements the HTML5 character encoding sniffing algorithm, which, in this situation, falls back to windows-1252 if there are no non-ASCII (multi-byte UTF-8) characters in the first 1024 bytes of the document. So for this feed, whose first non-ASCII character appears later than that, we get the wrong encoding.

I inserted an explicit "utf8.Valid()" check, which fixes this problem: if the XML prolog declares no encoding and the document is valid UTF-8, the body is left untouched.
This commit is contained in:
parent
31e2669c4d
commit
15505ee4a2
1 changed files with 7 additions and 0 deletions
|
@ -11,6 +11,7 @@ import (
|
|||
"mime"
|
||||
"regexp"
|
||||
"strings"
|
||||
"unicode/utf8"
|
||||
|
||||
"golang.org/x/net/html/charset"
|
||||
)
|
||||
|
@ -97,6 +98,12 @@ func (r *Response) EnsureUnicodeBody() (err error) {
|
|||
if xmlEncodingRegex.Match(buffer[0:length]) {
|
||||
return
|
||||
}
|
||||
|
||||
// If no encoding is specified in the XML prolog and
|
||||
// the document is valid UTF-8, nothing needs to be done.
|
||||
if utf8.Valid(buffer) {
|
||||
return
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
|
Loading…
Reference in a new issue