miniflux/reader/scraper/scraper.go

// Copyright 2017 Frédéric Guillot. All rights reserved.
// Use of this source code is governed by the Apache 2.0
// license that can be found in the LICENSE file.

package scraper // import "miniflux.app/reader/scraper"

import (
	"errors"
	"fmt"
	"io"
	"strings"

	"miniflux.app/http/client"
	"miniflux.app/logger"
	"miniflux.app/reader/readability"
	"miniflux.app/url"

	"github.com/PuerkitoBio/goquery"
)

// Fetch downloads a web page and returns its relevant contents.
//
// websiteURL is the address of the page to download. rules is a selector
// handed to scrapContent; when it is empty, the predefined rules for the
// domain of the effective (post-redirect) URL are looked up, and if none
// exist the readability extractor is used instead.
//
// An error is returned when the download fails, when the response reports a
// server failure, when the resource is not an HTML document, when the body
// encoding cannot be normalized, or when content extraction fails.
func Fetch(websiteURL, rules string) (string, error) {
	clt := client.New(websiteURL)
	response, err := clt.Get()
	if err != nil {
		return "", err
	}

	if response.HasServerFailure() {
		return "", errors.New("scraper: unable to download web page")
	}

	// Media types are case-insensitive, so compare against a lowercased copy
	// while still reporting the original header value in the error.
	if !strings.Contains(strings.ToLower(response.ContentType), "text/html") {
		return "", fmt.Errorf("scraper: this resource is not a HTML document (%s)", response.ContentType)
	}

	page, err := response.NormalizeBodyEncoding()
	if err != nil {
		return "", err
	}

	// The entry URL could redirect somewhere else.
	websiteURL = response.EffectiveURL

	if rules == "" {
		rules = getPredefinedScraperRules(websiteURL)
	}

	var content string
	if rules != "" {
		logger.Debug(`[Scraper] Using rules "%s" for "%s"`, rules, websiteURL)
		content, err = scrapContent(page, rules)
	} else {
		logger.Debug(`[Scraper] Using readability for "%s"`, websiteURL)
		content, err = readability.ExtractContent(page)
	}

	// Never hand back partial content alongside an error.
	if err != nil {
		return "", err
	}

	return content, nil
}

// scrapContent parses page as an HTML document and concatenates the inner
// HTML of every element matched by the selector in rules, in the order the
// matches are visited.
//
// For matched <img> and <iframe> elements the inner HTML of the parent
// element is used instead, because the inner HTML of those elements
// themselves turns up blank.
//
// An error is returned only when the document cannot be parsed.
func scrapContent(page io.Reader, rules string) (string, error) {
	document, err := goquery.NewDocumentFromReader(page)
	if err != nil {
		return "", err
	}

	// A builder avoids re-copying the accumulated output on every match.
	var contents strings.Builder
	document.Find(rules).Each(func(i int, s *goquery.Selection) {
		target := s

		// For some inline elements, we get the parent.
		if s.Is("img, iframe") {
			target = s.Parent()
		}

		// Rendering errors are deliberately ignored: extraction is best
		// effort, and a failed fragment contributes nothing to the output.
		content, _ := target.Html()
		contents.WriteString(content)
	})

	return contents.String(), nil
}

// getPredefinedScraperRules returns the predefined scraper rules whose key is
// contained in the domain of websiteURL, or an empty string when no key
// matches.
//
// Map iteration order is not specified, so returning the first match would
// make the result vary between calls whenever several keys match. To keep
// the choice deterministic, the longest matching key wins, and keys of equal
// length are ordered lexicographically.
func getPredefinedScraperRules(websiteURL string) string {
	urlDomain := url.Domain(websiteURL)

	var (
		bestDomain string
		bestRules  string
		found      bool
	)
	for domain, rules := range predefinedRules {
		if !strings.Contains(urlDomain, domain) {
			continue
		}

		if !found ||
			len(domain) > len(bestDomain) ||
			(len(domain) == len(bestDomain) && domain < bestDomain) {
			bestDomain, bestRules, found = domain, rules, true
		}
	}

	return bestRules
}
Add readability package to fetch original content 2017-12-11 04:01:38 +01:00			`// Copyright 2017 Frédéric Guillot. All rights reserved.`
			`// Use of this source code is governed by the Apache 2.0`
			`// license that can be found in the LICENSE file.`

Use canonical imports 2018-08-25 06:51:50 +02:00			`package scraper // import "miniflux.app/reader/scraper"`
Add readability package to fetch original content 2017-12-11 04:01:38 +01:00
			`import (`
			`"errors"`
Make sure the scraper parse only HTML documents 2018-01-03 03:32:01 +01:00			`"fmt"`
Add scraper rules 2017-12-11 05:51:04 +01:00			`"io"`
			`"strings"`
Add readability package to fetch original content 2017-12-11 04:01:38 +01:00
Use canonical imports 2018-08-25 06:51:50 +02:00			`"miniflux.app/http/client"`
			`"miniflux.app/logger"`
			`"miniflux.app/reader/readability"`
			`"miniflux.app/url"`

Add scraper rules 2017-12-11 05:51:04 +01:00			`"github.com/PuerkitoBio/goquery"`
Add readability package to fetch original content 2017-12-11 04:01:38 +01:00			`)`

Make sure the scraper parse only HTML documents 2018-01-03 03:32:01 +01:00			`// Fetch downloads a web page a returns relevant contents.`
Add scraper rules 2017-12-11 05:51:04 +01:00			`func Fetch(websiteURL, rules string) (string, error) {`
Move HTTP client to its own package 2018-04-28 19:51:07 +02:00			`clt := client.New(websiteURL)`
			`response, err := clt.Get()`
Add readability package to fetch original content 2017-12-11 04:01:38 +01:00			`if err != nil {`
			`return "", err`
			`}`

			`if response.HasServerFailure() {`
Make sure the scraper parse only HTML documents 2018-01-03 03:32:01 +01:00			`return "", errors.New("scraper: unable to download web page")`
			`}`

			`if !strings.Contains(response.ContentType, "text/html") {`
			`return "", fmt.Errorf("scraper: this resource is not a HTML document (%s)", response.ContentType)`
Add readability package to fetch original content 2017-12-11 04:01:38 +01:00			`}`

			`page, err := response.NormalizeBodyEncoding()`
			`if err != nil {`
			`return "", err`
			`}`

Make sure the scraper parse only HTML documents 2018-01-03 03:32:01 +01:00			`// The entry URL could redirect somewhere else.`
Improve content scraper 2017-12-14 06:30:40 +01:00			`websiteURL = response.EffectiveURL`

Add scraper rules 2017-12-11 05:51:04 +01:00			`if rules == "" {`
			`rules = getPredefinedScraperRules(websiteURL)`
			`}`

Add the possibility to enable crawler for feeds 2017-12-13 04:19:36 +01:00			`var content string`
Add scraper rules 2017-12-11 05:51:04 +01:00			`if rules != "" {`
Add logger 2017-12-16 03:55:57 +01:00			logger.Debug(`[Scraper] Using rules "%s" for "%s"`, rules, websiteURL)
Add scraper rules 2017-12-11 05:51:04 +01:00			`content, err = scrapContent(page, rules)`
			`} else {`
Add logger 2017-12-16 03:55:57 +01:00			logger.Debug(`[Scraper] Using readability for "%s"`, websiteURL)
Add scraper rules 2017-12-11 05:51:04 +01:00			`content, err = readability.ExtractContent(page)`
			`}`

Add readability package to fetch original content 2017-12-11 04:01:38 +01:00			`if err != nil {`
			`return "", err`
			`}`

Add the possibility to enable crawler for feeds 2017-12-13 04:19:36 +01:00			`return content, nil`
Add readability package to fetch original content 2017-12-11 04:01:38 +01:00			`}`
Add scraper rules 2017-12-11 05:51:04 +01:00
			`func scrapContent(page io.Reader, rules string) (string, error) {`
			`document, err := goquery.NewDocumentFromReader(page)`
			`if err != nil {`
			`return "", err`
			`}`

			`contents := ""`
			`document.Find(rules).Each(func(i int, s *goquery.Selection) {`
			`var content string`

			`// For some inline elements, we get the parent.`
Scrape parent element for iframe Current behavior: if you have an `iframe` scraper rule, `scrapContent` tries to return the inner HTML of the `iframe`, which turns up blank. New behavior: like `img` elements, if an `iframe` is matched by a scraper rule, the parent element's inner HTML (i.e. the `iframe` is returned). 2018-04-26 22:51:07 +02:00			`if s.Is("img") \|\| s.Is("iframe") {`
Add scraper rules 2017-12-11 05:51:04 +01:00			`content, _ = s.Parent().Html()`
			`} else {`
			`content, _ = s.Html()`
			`}`

			`contents += content`
			`})`

			`return contents, nil`
			`}`

			`func getPredefinedScraperRules(websiteURL string) string {`
			`urlDomain := url.Domain(websiteURL)`

			`for domain, rules := range predefinedRules {`
			`if strings.Contains(urlDomain, domain) {`
			`return rules`
			`}`
			`}`

			`return ""`
			`}`