miniflux/internal/reader/scraper/scraper.go

// SPDX-FileCopyrightText: Copyright The Miniflux Authors. All rights reserved.
// SPDX-License-Identifier: Apache-2.0

package scraper // import "miniflux.app/v2/internal/reader/scraper"

import (
	"errors"
	"fmt"
	"io"
	"strings"

	"miniflux.app/v2/internal/config"
	"miniflux.app/v2/internal/http/client"
	"miniflux.app/v2/internal/logger"
	"miniflux.app/v2/internal/reader/readability"
	"miniflux.app/v2/internal/urllib"

	"github.com/PuerkitoBio/goquery"
)

// Fetch retrieves the page at websiteURL and returns the content extracted
// from it.
//
// The request is sent with the given user agent and cookie, optionally through
// the configured proxy, and optionally accepting self-signed certificates.
// Responses reporting a server failure, or whose content type is neither HTML
// nor XHTML, are rejected with an error.
//
// When rules is empty, the predefined rules for the final (post-redirect) URL
// are looked up. Scraper rules are applied only if the request stayed on the
// original domain and some rules are available; in every other case the
// readability extractor is used.
func Fetch(websiteURL, rules, userAgent string, cookie string, allowSelfSignedCertificates, useProxy bool) (string, error) {
	httpClient := client.NewClientWithConfig(websiteURL, config.Opts)
	httpClient.WithUserAgent(userAgent)
	httpClient.WithCookie(cookie)
	if useProxy {
		httpClient.WithProxy()
	}
	httpClient.AllowSelfSignedCertificates = allowSelfSignedCertificates

	resp, err := httpClient.Get()
	if err != nil {
		return "", err
	}

	switch {
	case resp.HasServerFailure():
		return "", errors.New("scraper: unable to download web page")
	case !isAllowedContentType(resp.ContentType):
		return "", fmt.Errorf("scraper: this resource is not a HTML document (%s)", resp.ContentType)
	}

	if encodingErr := resp.EnsureUnicodeBody(); encodingErr != nil {
		return "", encodingErr
	}

	// The request may have been redirected: remember whether it left the
	// original domain, then continue with the URL that was actually served.
	requestedDomain := urllib.Domain(websiteURL)
	websiteURL = resp.EffectiveURL
	redirectedOffSite := requestedDomain != urllib.Domain(websiteURL)

	if rules == "" {
		rules = getPredefinedScraperRules(websiteURL)
	}

	var content string
	if redirectedOffSite || rules == "" {
		logger.Debug(`[Scraper] Using readability for %q`, websiteURL)
		content, err = readability.ExtractContent(resp.Body)
	} else {
		logger.Debug(`[Scraper] Using rules %q for %q`, rules, websiteURL)
		content, err = scrapContent(resp.Body, rules)
	}
	if err != nil {
		return "", err
	}

	return content, nil
}

// scrapContent parses page as an HTML document and returns the concatenated
// outer HTML of every element matched by the selector given in rules, in the
// order the selection yields them.
//
// An error is returned only when the document itself cannot be parsed. An
// element that cannot be rendered contributes nothing to the result: it is
// logged at debug level and skipped, so one bad node does not discard the
// content of the other matches.
func scrapContent(page io.Reader, rules string) (string, error) {
	document, err := goquery.NewDocumentFromReader(page)
	if err != nil {
		return "", err
	}

	// A builder avoids re-copying the accumulated markup for each match.
	var contents strings.Builder
	document.Find(rules).Each(func(_ int, s *goquery.Selection) {
		html, err := goquery.OuterHtml(s)
		if err != nil {
			// Best effort: keep whatever the remaining matches provide.
			logger.Debug(`[Scraper] Unable to render element matched by %q: %v`, rules, err)
			return
		}
		contents.WriteString(html)
	})

	return contents.String(), nil
}

// getPredefinedScraperRules returns the predefined scraper rules that apply to
// the domain of websiteURL, or an empty string when none apply.
//
// A predefined entry applies when its key is a substring of the URL's domain.
// Several entries may apply at once; because map iteration order is
// randomized, returning the first match would make the result vary from call
// to call. The longest matching key is therefore preferred, and ties are
// broken by lexicographic order of the key, so the choice is deterministic.
func getPredefinedScraperRules(websiteURL string) string {
	urlDomain := urllib.Domain(websiteURL)

	var (
		bestDomain string
		bestRules  string
		found      bool
	)
	for domain, rules := range predefinedRules {
		if !strings.Contains(urlDomain, domain) {
			continue
		}

		moreSpecific := len(domain) > len(bestDomain)
		tieBreak := len(domain) == len(bestDomain) && domain < bestDomain
		if !found || moreSpecific || tieBreak {
			bestDomain, bestRules, found = domain, rules, true
		}
	}

	return bestRules
}

// isAllowedContentType reports whether contentType denotes a document the
// scraper accepts: anything starting with "text/html" or
// "application/xhtml+xml", compared case-insensitively. Trailing parameters
// such as a charset are therefore tolerated.
func isAllowedContentType(contentType string) bool {
	normalized := strings.ToLower(contentType)
	for _, prefix := range [...]string{"text/html", "application/xhtml+xml"} {
		if strings.HasPrefix(normalized, prefix) {
			return true
		}
	}
	return false
}
Replace copyright header with SPDX identifier 2023-06-19 23:42:47 +02:00			`// SPDX-FileCopyrightText: Copyright The Miniflux Authors. All rights reserved.`
			`// SPDX-License-Identifier: Apache-2.0`
Add readability package to fetch original content 2017-12-11 04:01:38 +01:00
Move internal packages to an internal folder For reference: https://go.dev/doc/go1.4#internalpackages 2023-08-11 04:46:45 +02:00			`package scraper // import "miniflux.app/v2/internal/reader/scraper"`
Add readability package to fetch original content 2017-12-11 04:01:38 +01:00
			`import (`
			`"errors"`
Make sure the scraper parse only HTML documents 2018-01-03 03:32:01 +01:00			`"fmt"`
Add scraper rules 2017-12-11 05:51:04 +01:00			`"io"`
			`"strings"`
Add readability package to fetch original content 2017-12-11 04:01:38 +01:00
Move internal packages to an internal folder For reference: https://go.dev/doc/go1.4#internalpackages 2023-08-11 04:46:45 +02:00			`"miniflux.app/v2/internal/config"`
			`"miniflux.app/v2/internal/http/client"`
			`"miniflux.app/v2/internal/logger"`
			`"miniflux.app/v2/internal/reader/readability"`
Rename internal url package to avoid overlap with net/url 2023-08-14 04:09:01 +02:00			`"miniflux.app/v2/internal/urllib"`
Use canonical imports 2018-08-25 06:51:50 +02:00
Add scraper rules 2017-12-11 05:51:04 +01:00			`"github.com/PuerkitoBio/goquery"`
Add readability package to fetch original content 2017-12-11 04:01:38 +01:00			`)`

Simplify feed parser and format detection - Avoid doing multiple buffer copies - Move parser and format detection logic to its own package 2018-10-14 20:46:41 +02:00			`// Fetch downloads a web page and returns relevant contents.`
add proxy arg in scraper.Fetch 2021-08-28 11:30:04 +02:00			`func Fetch(websiteURL, rules, userAgent string, cookie string, allowSelfSignedCertificates, useProxy bool) (string, error) {`
http client: remove dependency on global config options 2020-09-27 23:29:48 +02:00			`clt := client.NewClientWithConfig(websiteURL, config.Opts)`
Add option to allow self-signed or invalid certificates 2021-02-21 22:42:49 +01:00			`clt.WithUserAgent(userAgent)`
Add per feed cookies option 2021-03-23 04:27:58 +01:00			`clt.WithCookie(cookie)`
add proxy arg in scraper.Fetch 2021-08-28 11:30:04 +02:00			`if useProxy {`
			`clt.WithProxy()`
			`}`
Add option to allow self-signed or invalid certificates 2021-02-21 22:42:49 +01:00			`clt.AllowSelfSignedCertificates = allowSelfSignedCertificates`
Add the possibility to override default user agent for each feed 2018-09-20 03:19:24 +02:00
Move HTTP client to its own package 2018-04-28 19:51:07 +02:00			`response, err := clt.Get()`
Add readability package to fetch original content 2017-12-11 04:01:38 +01:00			`if err != nil {`
			`return "", err`
			`}`

			`if response.HasServerFailure() {`
Make sure the scraper parse only HTML documents 2018-01-03 03:32:01 +01:00			`return "", errors.New("scraper: unable to download web page")`
			`}`

Add Prometheus exporter 2020-09-28 01:01:06 +02:00			`if !isAllowedContentType(response.ContentType) {`
Make sure the scraper parse only HTML documents 2018-01-03 03:32:01 +01:00			`return "", fmt.Errorf("scraper: this resource is not a HTML document (%s)", response.ContentType)`
Add readability package to fetch original content 2017-12-11 04:01:38 +01:00			`}`

Simplify feed parser and format detection - Avoid doing multiple buffer copies - Move parser and format detection logic to its own package 2018-10-14 20:46:41 +02:00			`if err = response.EnsureUnicodeBody(); err != nil {`
Add readability package to fetch original content 2017-12-11 04:01:38 +01:00			`return "", err`
			`}`

Make sure the scraper parse only HTML documents 2018-01-03 03:32:01 +01:00			`// The entry URL could redirect somewhere else.`
Rename internal url package to avoid overlap with net/url 2023-08-14 04:09:01 +02:00			`sameSite := urllib.Domain(websiteURL) == urllib.Domain(response.EffectiveURL)`
Improve content scraper 2017-12-14 06:30:40 +01:00			`websiteURL = response.EffectiveURL`

Add scraper rules 2017-12-11 05:51:04 +01:00			`if rules == "" {`
			`rules = getPredefinedScraperRules(websiteURL)`
			`}`

Add the possibility to enable crawler for feeds 2017-12-13 04:19:36 +01:00			`var content string`
scraper follow the only link * in some cases, what the scraper got is only a landing page, user can use scraper rules to extract the link of the landing page and follow it * it also fix the wrong scrape rule apply when the server redirects it to another host 2021-12-08 09:46:33 +01:00			`if sameSite && rules != "" {`
Simplify feed parser and format detection - Avoid doing multiple buffer copies - Move parser and format detection logic to its own package 2018-10-14 20:46:41 +02:00			logger.Debug(`[Scraper] Using rules %q for %q`, rules, websiteURL)
			`content, err = scrapContent(response.Body, rules)`
Add scraper rules 2017-12-11 05:51:04 +01:00			`} else {`
Refactor manual entry scraper 2018-12-03 05:51:06 +01:00			logger.Debug(`[Scraper] Using readability for %q`, websiteURL)
Simplify feed parser and format detection - Avoid doing multiple buffer copies - Move parser and format detection logic to its own package 2018-10-14 20:46:41 +02:00			`content, err = readability.ExtractContent(response.Body)`
Add scraper rules 2017-12-11 05:51:04 +01:00			`}`

Add readability package to fetch original content 2017-12-11 04:01:38 +01:00			`if err != nil {`
			`return "", err`
			`}`

Add the possibility to enable crawler for feeds 2017-12-13 04:19:36 +01:00			`return content, nil`
Add readability package to fetch original content 2017-12-11 04:01:38 +01:00			`}`
Add scraper rules 2017-12-11 05:51:04 +01:00
			`func scrapContent(page io.Reader, rules string) (string, error) {`
			`document, err := goquery.NewDocumentFromReader(page)`
			`if err != nil {`
			`return "", err`
			`}`

			`contents := ""`
			`document.Find(rules).Each(func(i int, s *goquery.Selection) {`
			`var content string`

Return outer HTML when scraping elements 2019-12-22 06:18:31 +01:00			`content, _ = goquery.OuterHtml(s)`
Add scraper rules 2017-12-11 05:51:04 +01:00			`contents += content`
			`})`

			`return contents, nil`
			`}`

			`func getPredefinedScraperRules(websiteURL string) string {`
Rename internal url package to avoid overlap with net/url 2023-08-14 04:09:01 +02:00			`urlDomain := urllib.Domain(websiteURL)`
Add scraper rules 2017-12-11 05:51:04 +01:00
			`for domain, rules := range predefinedRules {`
			`if strings.Contains(urlDomain, domain) {`
			`return rules`
			`}`
			`}`

			`return ""`
			`}`
Allow the scraper to parse XHTML documents Only "text/html" was authorized before. 2018-11-03 21:44:13 +01:00
Add Prometheus exporter 2020-09-28 01:01:06 +02:00			`func isAllowedContentType(contentType string) bool {`
Allow the scraper to parse XHTML documents Only "text/html" was authorized before. 2018-11-03 21:44:13 +01:00			`contentType = strings.ToLower(contentType)`
			`return strings.HasPrefix(contentType, "text/html") \|\|`
			`strings.HasPrefix(contentType, "application/xhtml+xml")`
			`}`