miniflux/http/response.go
Frédéric Guillot 713b38e34c Handle more encoding edge cases
- Feeds with charset specified only in Content-Type header and not in XML document
- Feeds with charset specified in both places
- Feeds with charset specified only in XML document and not in HTTP header
2018-01-20 13:25:21 -08:00

68 lines
1.7 KiB
Go

// Copyright 2017 Frédéric Guillot. All rights reserved.
// Use of this source code is governed by the Apache 2.0
// license that can be found in the LICENSE file.
package http
import (
"io"
"mime"
"strings"
"github.com/miniflux/miniflux/logger"
"golang.org/x/net/html/charset"
)
// Response wraps a server response.
//
// It bundles the body reader with the metadata that the methods on this
// type inspect: the status code, the cache validators and the content type.
type Response struct {
// Body is the reader for the response payload. NormalizeBodyEncoding
// returns either this reader or a charset-converting reader built from it.
Body io.Reader
// StatusCode is the numeric status; 304 and values >= 400 are treated
// specially by IsModified and HasServerFailure respectively.
StatusCode int
// EffectiveURL is presumably the final URL after redirects.
// NOTE(review): not read by the methods visible here - confirm with the client code.
EffectiveURL string
// LastModified is compared verbatim against the caller's stored value in
// IsModified (presumably the Last-Modified header value - confirm).
LastModified string
// ETag is compared verbatim against the caller's stored value in
// IsModified (presumably the ETag header value - confirm).
ETag string
// ContentType is the raw media type string; NormalizeBodyEncoding parses
// it with mime.ParseMediaType to find the charset parameter.
ContentType string
// ContentLength is presumably the body size in bytes as reported by the
// server. NOTE(review): not read by the methods visible here - confirm.
ContentLength int64
}
// HasServerFailure reports whether the status code falls in the error range,
// that is 400 or above (4xx and 5xx alike).
func (r *Response) HasServerFailure() bool {
	const firstErrorStatus = 400
	return r.StatusCode >= firstErrorStatus
}
// IsModified reports whether the resource changed compared to the validators
// the caller has stored.
//
// The resource is considered unchanged when any of the following holds:
//   - the status code is 304;
//   - the response ETag is non-empty and equals etag;
//   - the response LastModified is non-empty and equals lastModified.
//
// In every other case it returns true.
func (r *Response) IsModified(etag, lastModified string) bool {
	notModifiedStatus := r.StatusCode == 304
	sameETag := r.ETag != "" && r.ETag == etag
	sameLastModified := r.LastModified != "" && r.LastModified == lastModified

	return !(notModifiedStatus || sameETag || sameLastModified)
}
// NormalizeBodyEncoding makes sure the body is encoded in UTF-8.
//
// The charset parameter of the Content-Type value decides what happens:
//   - Content-Type cannot be parsed, or has no charset: Body is returned as is.
//   - charset is empty, "utf-8" or "utf8" (case-insensitive): Body is returned as is.
//   - any other charset: Body is handed to charset.NewReader together with the
//     full Content-Type value, and its result (reader and error) is returned.
//
// This is used by the scraper and feed readers.
//
// Edge case to keep in mind: some non-UTF-8 feeds declare their encoding only
// in the Content-Type header and not in the XML document itself.
func (r *Response) NormalizeBodyEncoding() (io.Reader, error) {
	_, params, parseErr := mime.ParseMediaType(r.ContentType)
	if parseErr != nil {
		// An unparsable Content-Type gives no charset hint; leave the body alone.
		return r.Body, nil
	}

	label, declared := params["charset"]
	if !declared {
		return r.Body, nil
	}

	label = strings.ToLower(label)
	switch label {
	case "", "utf-8", "utf8":
		// Already UTF-8 (or nothing usable declared): no conversion needed.
		return r.Body, nil
	}

	logger.Debug("[NormalizeBodyEncoding] Convert body to UTF-8 from %s", label)
	return charset.NewReader(r.Body, r.ContentType)
}