From 771f9d2b5fe40a034d206e166db0f040ac24fb22 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fr=C3=A9d=C3=A9ric=20Guillot?= Date: Thu, 18 Apr 2024 21:44:55 -0700 Subject: [PATCH] reader/fetcher: add brotli content encoding support --- go.mod | 1 + go.sum | 2 + internal/reader/fetcher/encoding_wrappers.go | 55 ++++++++++++++++++++ internal/reader/fetcher/request_builder.go | 1 + internal/reader/fetcher/response_handler.go | 23 +++++++- 5 files changed, 80 insertions(+), 2 deletions(-) create mode 100644 internal/reader/fetcher/encoding_wrappers.go diff --git a/go.mod b/go.mod index a63c6c2f..59feed98 100644 --- a/go.mod +++ b/go.mod @@ -27,6 +27,7 @@ require ( ) require ( + github.com/andybalholm/brotli v1.1.0 github.com/andybalholm/cascadia v1.3.2 // indirect github.com/beorn7/perks v1.0.1 // indirect github.com/cespare/xxhash/v2 v2.2.0 // indirect diff --git a/go.sum b/go.sum index f2013f52..bcda7538 100644 --- a/go.sum +++ b/go.sum @@ -2,6 +2,8 @@ github.com/PuerkitoBio/goquery v1.9.1 h1:mTL6XjbJTZdpfL+Gwl5U2h1l9yEkJjhmlTeV9VP github.com/PuerkitoBio/goquery v1.9.1/go.mod h1:cW1n6TmIMDoORQU5IU/P1T3tGFunOeXEpGP2WHRwkbY= github.com/abadojack/whatlanggo v1.0.1 h1:19N6YogDnf71CTHm3Mp2qhYfkRdyvbgwWdd2EPxJRG4= github.com/abadojack/whatlanggo v1.0.1/go.mod h1:66WiQbSbJBIlOZMsvbKe5m6pzQovxCH9B/K8tQB2uoc= +github.com/andybalholm/brotli v1.1.0 h1:eLKJA0d02Lf0mVpIDgYnqXcUn0GqVmEFny3VuID1U3M= +github.com/andybalholm/brotli v1.1.0/go.mod h1:sms7XGricyQI9K10gOSf56VKKWS4oLer58Q+mhRPtnY= github.com/andybalholm/cascadia v1.3.2 h1:3Xi6Dw5lHF15JtdcmAHD3i1+T8plmv7BQ/nsViSLyss= github.com/andybalholm/cascadia v1.3.2/go.mod h1:7gtRlve5FxPPgIgX36uWBX58OdBsSS6lUvCFb+h7KvU= github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM= diff --git a/internal/reader/fetcher/encoding_wrappers.go b/internal/reader/fetcher/encoding_wrappers.go new file mode 100644 index 00000000..41820341 --- /dev/null +++ b/internal/reader/fetcher/encoding_wrappers.go @@ -0,0 +1,55 @@
+package fetcher + +import ( + "compress/gzip" + "io" + + "github.com/andybalholm/brotli" +) + +type brotliReadCloser struct { + body io.ReadCloser + brotliReader io.Reader +} + +func NewBrotliReadCloser(body io.ReadCloser) *brotliReadCloser { + return &brotliReadCloser{ + body: body, + brotliReader: brotli.NewReader(body), + } +} + +func (b *brotliReadCloser) Read(p []byte) (n int, err error) { + return b.brotliReader.Read(p) +} + +func (b *brotliReadCloser) Close() error { + return b.body.Close() +} + +type gzipReadCloser struct { + body io.ReadCloser + gzipReader io.Reader + gzipErr error +} + +func NewGzipReadCloser(body io.ReadCloser) *gzipReadCloser { + return &gzipReadCloser{body: body} +} + +func (gz *gzipReadCloser) Read(p []byte) (n int, err error) { + if gz.gzipReader == nil { + if gz.gzipErr == nil { + gz.gzipReader, gz.gzipErr = gzip.NewReader(gz.body) + } + if gz.gzipErr != nil { + return 0, gz.gzipErr + } + } + + return gz.gzipReader.Read(p) +} + +func (gz *gzipReadCloser) Close() error { + return gz.body.Close() +} diff --git a/internal/reader/fetcher/request_builder.go b/internal/reader/fetcher/request_builder.go index e2b2258b..77c18948 100644 --- a/internal/reader/fetcher/request_builder.go +++ b/internal/reader/fetcher/request_builder.go @@ -169,6 +169,7 @@ func (r *RequestBuilder) ExecuteRequest(requestURL string) (*http.Response, erro } req.Header = r.headers + req.Header.Set("Accept-Encoding", "br, gzip") req.Header.Set("Accept", defaultAcceptHeader) req.Header.Set("Connection", "close") diff --git a/internal/reader/fetcher/response_handler.go b/internal/reader/fetcher/response_handler.go index 03ab39ca..1aba5957 100644 --- a/internal/reader/fetcher/response_handler.go +++ b/internal/reader/fetcher/response_handler.go @@ -8,6 +8,7 @@ import ( "errors" "fmt" "io" + "log/slog" "net" "net/http" "net/url" @@ -71,12 +72,30 @@ func (r *ResponseHandler) Close() { } } +func (r *ResponseHandler) getReader(maxBodySize int64) io.ReadCloser { + 
slog.Debug("Request response", + slog.String("effective_url", r.EffectiveURL()), + slog.Int64("content_length", r.httpResponse.ContentLength), + slog.String("content_encoding", r.httpResponse.Header.Get("Content-Encoding")), + slog.String("content_type", r.httpResponse.Header.Get("Content-Type")), + ) + + reader := r.httpResponse.Body + switch r.httpResponse.Header.Get("Content-Encoding") { + case "br": + reader = NewBrotliReadCloser(r.httpResponse.Body) + case "gzip": + reader = NewGzipReadCloser(r.httpResponse.Body) + } + return http.MaxBytesReader(nil, reader, maxBodySize) +} + func (r *ResponseHandler) Body(maxBodySize int64) io.ReadCloser { - return http.MaxBytesReader(nil, r.httpResponse.Body, maxBodySize) + return r.getReader(maxBodySize) } func (r *ResponseHandler) ReadBody(maxBodySize int64) ([]byte, *locale.LocalizedErrorWrapper) { - limitedReader := http.MaxBytesReader(nil, r.httpResponse.Body, maxBodySize) + limitedReader := r.getReader(maxBodySize) buffer, err := io.ReadAll(limitedReader) if err != nil && err != io.EOF {