miniflux/internal/reader/scraper/scraper_test.go

// SPDX-FileCopyrightText: Copyright The Miniflux Authors. All rights reserved.
// SPDX-License-Identifier: Apache-2.0

package scraper // import "miniflux.app/v2/internal/reader/scraper"

import (
	"bytes"
	"os"
	"strings"
	"testing"
)

// TestGetPredefinedRules checks that getPredefinedScraperRules yields a
// non-empty rule for the phoronix.com and linux.com URLs, and an empty
// string for example.org, which is expected to have no predefined rule.
func TestGetPredefinedRules(t *testing.T) {
	checks := []struct {
		websiteURL string
		wantRule   bool
		failure    string
	}{
		{"http://www.phoronix.com/", true, "Unable to find rule for phoronix.com"},
		{"https://www.linux.com/", true, "Unable to find rule for linux.com"},
		{"https://example.org/", false, "A rule not defined should not return anything"},
	}

	for _, check := range checks {
		// A rule is considered present when the lookup returns a non-empty string.
		hasRule := getPredefinedScraperRules(check.websiteURL) != ""
		if hasRule != check.wantRule {
			t.Error(check.failure)
		}
	}
}

// TestWhitelistedContentTypes verifies the verdict of isAllowedContentType
// for a set of Content-Type values: HTML and XHTML media types (in mixed
// case and with a charset parameter) are expected to be accepted, while
// CSS, JavaScript, PNG and PDF are expected to be rejected.
func TestWhitelistedContentTypes(t *testing.T) {
	scenarios := map[string]bool{
		"text/html":                            true,
		"TeXt/hTmL":                            true,
		"application/xhtml+xml":                true,
		"text/html; charset=utf-8":             true,
		"application/xhtml+xml; charset=utf-8": true,
		"text/css":                             false,
		"application/javascript":               false,
		"image/png":                            false,
		"application/pdf":                      false,
	}

	for inputValue, expectedResult := range scenarios {
		actualResult := isAllowedContentType(inputValue)
		if actualResult != expectedResult {
			// Map iteration order is random, so the message must name the
			// content type under test for a failure to be traceable.
			t.Errorf(`Unexpected result for content type %q, got %t instead of %t`, inputValue, actualResult, expectedResult)
		}
	}
}

// TestSelectorRules runs findContentUsingCustomRules against HTML fixtures
// stored in testdata/ and compares the output with the matching
// "<fixture>-result" file, ignoring leading and trailing whitespace in the
// expected file.
//
// Each fixture runs as its own subtest so that a fatal error on one fixture
// (for example a missing file) does not prevent the others from running.
func TestSelectorRules(t *testing.T) {
	// Maps a fixture file name to the selector rule applied to it.
	ruleTestCases := map[string]string{
		"img.html":    "article > img",
		"iframe.html": "article > iframe",
		"p.html":      "article > p",
	}

	for filename, rule := range ruleTestCases {
		t.Run(filename, func(t *testing.T) {
			html, err := os.ReadFile("testdata/" + filename)
			if err != nil {
				t.Fatalf(`Unable to read file %q: %v`, filename, err)
			}

			actualResult, err := findContentUsingCustomRules(bytes.NewReader(html), rule)
			if err != nil {
				t.Fatalf(`Scraping error for %q - %q: %v`, filename, rule, err)
			}

			resultFilename := filename + "-result"
			rawExpected, err := os.ReadFile("testdata/" + resultFilename)
			if err != nil {
				// Report the result file, which is the one that failed to load.
				t.Fatalf(`Unable to read file %q: %v`, resultFilename, err)
			}

			// Compare and report the same trimmed value so the failure
			// message reflects what was actually checked.
			expectedResult := strings.TrimSpace(string(rawExpected))
			if actualResult != expectedResult {
				t.Errorf(`Unexpected result for %q, got "%s" instead of "%s"`, rule, actualResult, expectedResult)
			}
		})
	}
}
Replace copyright header with SPDX identifier 2023-06-19 17:42:47 -04:00			`// SPDX-FileCopyrightText: Copyright The Miniflux Authors. All rights reserved.`
			`// SPDX-License-Identifier: Apache-2.0`
Add scraper rules 2017-12-10 23:51:04 -05:00
Move internal packages to an internal folder For reference: https://go.dev/doc/go1.4#internalpackages 2023-08-10 22:46:45 -04:00			`package scraper // import "miniflux.app/v2/internal/reader/scraper"`
Add scraper rules 2017-12-10 23:51:04 -05:00
Return outer HTML when scraping elements 2019-12-22 00:18:31 -05:00			`import (`
			`"bytes"`
Remove deprecated io/ioutil package Miniflux now requires at least Go 1.16 and io/util is deprecated. https://golang.org/doc/go1.16#ioutil 2021-02-17 00:19:03 -05:00			`"os"`
Return outer HTML when scraping elements 2019-12-22 00:18:31 -05:00			`"strings"`
			`"testing"`
			`)`
Add scraper rules 2017-12-10 23:51:04 -05:00
			`func TestGetPredefinedRules(t *testing.T) {`
			`if getPredefinedScraperRules("http://www.phoronix.com/") == "" {`
			`t.Error("Unable to find rule for phoronix.com")`
			`}`

			`if getPredefinedScraperRules("https://www.linux.com/") == "" {`
			`t.Error("Unable to find rule for linux.com")`
			`}`

			`if getPredefinedScraperRules("https://example.org/") != "" {`
			`t.Error("A rule not defined should not return anything")`
			`}`
			`}`
Allow the scraper to parse XHTML documents Only "text/html" was authorized before. 2018-11-03 16:44:13 -04:00
			`func TestWhitelistedContentTypes(t *testing.T) {`
			`scenarios := map[string]bool{`
			`"text/html": true,`
			`"TeXt/hTmL": true,`
			`"application/xhtml+xml": true,`
			`"text/html; charset=utf-8": true,`
			`"application/xhtml+xml; charset=utf-8": true,`
			`"text/css": false,`
			`"application/javascript": false,`
			`"image/png": false,`
			`"application/pdf": false,`
			`}`

			`for inputValue, expectedResult := range scenarios {`
Add Prometheus exporter 2020-09-27 19:01:06 -04:00			`actualResult := isAllowedContentType(inputValue)`
Allow the scraper to parse XHTML documents Only "text/html" was authorized before. 2018-11-03 16:44:13 -04:00			`if actualResult != expectedResult {`
			t.Errorf(`Unexpected result for content type whitelist, got "%v" instead of "%v"`, actualResult, expectedResult)
			`}`
			`}`
			`}`
Return outer HTML when scraping elements 2019-12-22 00:18:31 -05:00
			`func TestSelectorRules(t *testing.T) {`
Add Prometheus exporter 2020-09-27 19:01:06 -04:00			`var ruleTestCases = map[string]string{`
			`"img.html": "article > img",`
			`"iframe.html": "article > iframe",`
			`"p.html": "article > p",`
Return outer HTML when scraping elements 2019-12-22 00:18:31 -05:00			`}`

			`for filename, rule := range ruleTestCases {`
Remove deprecated io/ioutil package Miniflux now requires at least Go 1.16 and io/util is deprecated. https://golang.org/doc/go1.16#ioutil 2021-02-17 00:19:03 -05:00			`html, err := os.ReadFile("testdata/" + filename)`
Return outer HTML when scraping elements 2019-12-22 00:18:31 -05:00			`if err != nil {`
			t.Fatalf(`Unable to read file %q: %v`, filename, err)
			`}`

Refactor HTTP Client and LocalizedError packages 2023-10-21 22:50:29 -04:00			`actualResult, err := findContentUsingCustomRules(bytes.NewReader(html), rule)`
Return outer HTML when scraping elements 2019-12-22 00:18:31 -05:00			`if err != nil {`
			t.Fatalf(`Scraping error for %q - %q: %v`, filename, rule, err)
			`}`

Remove deprecated io/ioutil package Miniflux now requires at least Go 1.16 and io/util is deprecated. https://golang.org/doc/go1.16#ioutil 2021-02-17 00:19:03 -05:00			`expectedResult, err := os.ReadFile("testdata/" + filename + "-result")`
Return outer HTML when scraping elements 2019-12-22 00:18:31 -05:00			`if err != nil {`
			t.Fatalf(`Unable to read file %q: %v`, filename, err)
			`}`

			`if actualResult != strings.TrimSpace(string(expectedResult)) {`
			t.Errorf(`Unexpected result for %q, got "%s" instead of "%s"`, rule, actualResult, expectedResult)
			`}`
			`}`
			`}`