Normalize URL query string before executing HTTP requests

- Make sure query strings parameters are encoded
- As opposed to the standard library, do not append equal sign
for query parameters with empty value
- Strip URL fragments like Web browsers
This commit is contained in:
Frédéric Guillot 2019-12-26 15:26:23 -08:00
parent 200b1c304b
commit 3debf75eb9
8 changed files with 128 additions and 26 deletions

View file

@ -22,6 +22,7 @@ import (
"miniflux.app/errors"
"miniflux.app/logger"
"miniflux.app/timer"
url_helper "miniflux.app/url"
"miniflux.app/version"
)
@ -37,7 +38,8 @@ var (
// Client is a HTTP Client :)
type Client struct {
url string
inputURL string
requestURL string
etagHeader string
lastModifiedHeader string
authorizationHeader string
@ -47,6 +49,18 @@ type Client struct {
Insecure bool
}
func (c *Client) String() string {
return fmt.Sprintf(
`InputURL=%q RequestURL=%q ETag=%s LastModified=%q BasicAuth=%v UserAgent=%q`,
c.inputURL,
c.requestURL,
c.etagHeader,
c.lastModifiedHeader,
c.authorizationHeader != "" || (c.username != "" && c.password != ""),
c.userAgent,
)
}
// WithCredentials defines the username/password for HTTP Basic authentication.
func (c *Client) WithCredentials(username, password string) *Client {
if username != "" && password != "" {
@ -115,7 +129,12 @@ func (c *Client) PostJSON(data interface{}) (*Response, error) {
}
func (c *Client) executeRequest(request *http.Request) (*Response, error) {
defer timer.ExecutionTime(time.Now(), fmt.Sprintf("[HttpClient] url=%s", c.url))
defer timer.ExecutionTime(time.Now(), fmt.Sprintf("[HttpClient] inputURL=%s", c.inputURL))
logger.Debug("[HttpClient:Before] Method=%s %s",
request.Method,
c.String(),
)
client := c.buildClient()
resp, err := client.Do(request)
@ -162,21 +181,15 @@ func (c *Client) executeRequest(request *http.Request) (*Response, error) {
EffectiveURL: resp.Request.URL.String(),
LastModified: resp.Header.Get("Last-Modified"),
ETag: resp.Header.Get("ETag"),
Expires: resp.Header.Get("Expires"),
ContentType: resp.Header.Get("Content-Type"),
ContentLength: resp.ContentLength,
}
logger.Debug("[HttpClient:%s] URL=%s, EffectiveURL=%s, Code=%d, Length=%d, Type=%s, ETag=%s, LastMod=%s, Expires=%s, Auth=%v",
logger.Debug("[HttpClient:After] Method=%s %s; Response => %s",
request.Method,
c.url,
response.EffectiveURL,
response.StatusCode,
resp.ContentLength,
response.ContentType,
response.ETag,
response.LastModified,
resp.Header.Get("Expires"),
c.username != "",
c.String(),
response,
)
// Ignore caching headers for feeds that do not want any cache.
@ -190,7 +203,8 @@ func (c *Client) executeRequest(request *http.Request) (*Response, error) {
}
func (c *Client) buildRequest(method string, body io.Reader) (*http.Request, error) {
request, err := http.NewRequest(method, c.url, body)
c.requestURL = url_helper.RequestURI(c.inputURL)
request, err := http.NewRequest(method, c.requestURL, body)
if err != nil {
return nil, err
}
@ -238,5 +252,5 @@ func (c *Client) buildHeaders() http.Header {
// New returns a new HTTP client.
func New(url string) *Client {
return &Client{url: url, userAgent: DefaultUserAgent, Insecure: false}
return &Client{inputURL: url, userAgent: DefaultUserAgent, Insecure: false}
}

View file

@ -6,6 +6,7 @@ package client // import "miniflux.app/http/client"
import (
"bytes"
"fmt"
"io"
"io/ioutil"
"regexp"
@ -24,10 +25,24 @@ type Response struct {
EffectiveURL string
LastModified string
ETag string
Expires string
ContentType string
ContentLength int64
}
func (r *Response) String() string {
return fmt.Sprintf(
`StatusCode=%d EffectiveURL=%q LastModified=%q ETag=%s Expires=%s ContentType=%q ContentLength=%d`,
r.StatusCode,
r.EffectiveURL,
r.LastModified,
r.ETag,
r.Expires,
r.ContentType,
r.ContentLength,
)
}
// IsNotFound returns true if the resource doesn't exists anymore.
func (r *Response) IsNotFound() bool {
return r.StatusCode == 404 || r.StatusCode == 410
@ -105,8 +120,8 @@ func (r *Response) EnsureUnicodeBody() (err error) {
return err
}
// String returns the response body as string.
func (r *Response) String() string {
// BodyAsString returns the response body as string.
func (r *Response) BodyAsString() string {
bytes, _ := ioutil.ReadAll(r.Body)
return string(bytes)
}

View file

@ -95,7 +95,7 @@ func TestToString(t *testing.T) {
input := `test`
r := &Response{Body: strings.NewReader(input)}
if r.String() != input {
if r.BodyAsString() != input {
t.Error(`Unexpected ouput`)
}
}
@ -140,7 +140,7 @@ func TestEnsureUnicodeWithHTMLDocuments(t *testing.T) {
t.Fatalf(`Unicode conversion error for %q - %q: %v`, tc.filename, tc.contentType, parseErr)
}
isUnicode := utf8.ValidString(r.String())
isUnicode := utf8.ValidString(r.BodyAsString())
if isUnicode != tc.convertedToUnicode {
t.Errorf(`Unicode conversion %q - %q, got: %v, expected: %v`,
tc.filename, tc.contentType, isUnicode, tc.convertedToUnicode)

View file

@ -52,7 +52,7 @@ func (h *Handler) CreateFeed(userID, categoryID int64, url string, crawler bool,
return nil, errors.NewLocalizedError(errDuplicate, response.EffectiveURL)
}
subscription, parseErr := parser.ParseFeed(response.String())
subscription, parseErr := parser.ParseFeed(response.BodyAsString())
if parseErr != nil {
return nil, parseErr
}
@ -106,7 +106,7 @@ func (h *Handler) RefreshFeed(userID, feedID int64) error {
if response.IsModified(originalFeed.EtagHeader, originalFeed.LastModifiedHeader) {
logger.Debug("[Handler:RefreshFeed] Feed #%d has been modified", feedID)
updatedFeed, parseErr := parser.ParseFeed(response.String())
updatedFeed, parseErr := parser.ParseFeed(response.BodyAsString())
if parseErr != nil {
originalFeed.WithError(parseErr.Localize(printer))
h.store.UpdateFeedError(originalFeed)

View file

@ -191,7 +191,7 @@ func TestDifferentEncodingWithResponse(t *testing.T) {
t.Fatalf(`Encoding error for %q: %v`, tc.filename, encodingErr)
}
feed, parseErr := ParseFeed(r.String())
feed, parseErr := ParseFeed(r.BodyAsString())
if parseErr != nil {
t.Fatalf(`Parsing error for %q - %q: %v`, tc.filename, tc.contentType, parseErr)
}

View file

@ -18,7 +18,7 @@ import (
)
var (
errUnreadableDoc = "Unable to analyze this page: %v"
errUnreadableDoc = "Unable to analyze this page: %v"
)
// FindSubscriptions downloads and try to find one or more subscriptions from an URL.
@ -31,7 +31,7 @@ func FindSubscriptions(websiteURL, userAgent, username, password string) (Subscr
return nil, err
}
body := response.String()
body := response.BodyAsString()
if format := parser.DetectFeedFormat(body); format != parser.FormatUnknown {
var subscriptions Subscriptions
subscriptions = append(subscriptions, &Subscription{

View file

@ -4,9 +4,12 @@
package url // import "miniflux.app/url"
import "net/url"
import "fmt"
import "strings"
import (
"fmt"
"net/url"
"sort"
"strings"
)
// AbsoluteURL converts the input URL as absolute URL if necessary.
func AbsoluteURL(baseURL, input string) (string, error) {
@ -69,3 +72,51 @@ func Domain(websiteURL string) string {
return parsedURL.Host
}
// RequestURI returns the encoded URI to be used in HTTP requests.
func RequestURI(websiteURL string) string {
u, err := url.Parse(websiteURL)
if err != nil {
return websiteURL
}
queryValues := u.Query()
u.RawQuery = "" // Clear RawQuery to make sure it's encoded properly.
u.Fragment = "" // Clear fragment because Web browsers strip #fragment before sending the URL to a web server.
var buf strings.Builder
buf.WriteString(u.String())
if len(queryValues) > 0 {
buf.WriteByte('?')
// Sort keys.
keys := make([]string, 0, len(queryValues))
for k := range queryValues {
keys = append(keys, k)
}
sort.Strings(keys)
i := 0
for _, key := range keys {
keyEscaped := url.QueryEscape(key)
values := queryValues[key]
for _, value := range values {
if i > 0 {
buf.WriteByte('&')
}
buf.WriteString(keyEscaped)
// As opposed to the standard library, we append the = only if the value is not empty.
if value != "" {
buf.WriteByte('=')
buf.WriteString(url.QueryEscape(value))
}
i++
}
}
}
return buf.String()
}

View file

@ -71,3 +71,25 @@ func TestDomain(t *testing.T) {
}
}
}
func TestRequestURI(t *testing.T) {
scenarios := map[string]string{
"https://www.example.org": "https://www.example.org",
"https://user:password@www.example.org": "https://user:password@www.example.org",
"https://www.example.org/path with spaces": "https://www.example.org/path%20with%20spaces",
"https://www.example.org/path#test": "https://www.example.org/path",
"https://www.example.org/path?abc#test": "https://www.example.org/path?abc",
"https://www.example.org/path?a=b&a=c": "https://www.example.org/path?a=b&a=c",
"https://www.example.org/path?a=b&a=c&d": "https://www.example.org/path?a=b&a=c&d",
"https://www.example.org/path?atom": "https://www.example.org/path?atom",
"https://www.example.org/path?测试=测试": "https://www.example.org/path?%E6%B5%8B%E8%AF%95=%E6%B5%8B%E8%AF%95",
"https://www.example.org/url=http%3A%2F%2Fwww.example.com%2Ffeed%2F&max=20": "https://www.example.org/url=http%3A%2F%2Fwww.example.com%2Ffeed%2F&max=20",
}
for input, expected := range scenarios {
actual := RequestURI(input)
if actual != expected {
t.Errorf(`Unexpected result, got %q instead of %q`, actual, expected)
}
}
}