Return outer HTML when scraping elements
This commit is contained in:
parent
30f22fbd78
commit
8e1ed8bef3
8 changed files with 73 additions and 8 deletions
|
@ -75,13 +75,7 @@ func scrapContent(page io.Reader, rules string) (string, error) {
|
|||
document.Find(rules).Each(func(i int, s *goquery.Selection) {
|
||||
var content string
|
||||
|
||||
// For some inline elements, we get the parent.
|
||||
if s.Is("img") || s.Is("iframe") {
|
||||
content, _ = s.Parent().Html()
|
||||
} else {
|
||||
content, _ = s.Html()
|
||||
}
|
||||
|
||||
content, _ = goquery.OuterHtml(s)
|
||||
contents += content
|
||||
})
|
||||
|
||||
|
|
|
@ -4,7 +4,12 @@
|
|||
|
||||
package scraper // import "miniflux.app/reader/scraper"
|
||||
|
||||
import "testing"
|
||||
import (
|
||||
"bytes"
|
||||
"io/ioutil"
|
||||
"strings"
|
||||
"testing"
|
||||
)
|
||||
|
||||
func TestGetPredefinedRules(t *testing.T) {
|
||||
if getPredefinedScraperRules("http://www.phoronix.com/") == "" {
|
||||
|
@ -40,3 +45,32 @@ func TestWhitelistedContentTypes(t *testing.T) {
|
|||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestSelectorRules(t *testing.T) {
|
||||
var ruleTestCases = map[string]string {
|
||||
"img.html": "article > img",
|
||||
"iframe.html": "article > iframe",
|
||||
"p.html": "article > p",
|
||||
}
|
||||
|
||||
for filename, rule := range ruleTestCases {
|
||||
html, err := ioutil.ReadFile("testdata/" + filename)
|
||||
if err != nil {
|
||||
t.Fatalf(`Unable to read file %q: %v`, filename, err)
|
||||
}
|
||||
|
||||
actualResult, err := scrapContent(bytes.NewReader(html), rule)
|
||||
if err != nil {
|
||||
t.Fatalf(`Scraping error for %q - %q: %v`, filename, rule, err)
|
||||
}
|
||||
|
||||
expectedResult, err := ioutil.ReadFile("testdata/" + filename + "-result")
|
||||
if err != nil {
|
||||
t.Fatalf(`Unable to read file %q: %v`, filename, err)
|
||||
}
|
||||
|
||||
if actualResult != strings.TrimSpace(string(expectedResult)) {
|
||||
t.Errorf(`Unexpected result for %q, got "%s" instead of "%s"`, rule, actualResult, expectedResult)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
12
reader/scraper/testdata/iframe.html
vendored
Normal file
12
reader/scraper/testdata/iframe.html
vendored
Normal file
|
@ -0,0 +1,12 @@
|
|||
<!DOCTYPE html>
|
||||
<html lang="en-US">
|
||||
<body>
|
||||
<article>
|
||||
<iframe id="1" src="about:blank"></iframe>
|
||||
<iframe id="2" src="about:blank"></iframe>
|
||||
<iframe id="3" src="about:blank"></iframe>
|
||||
<iframe id="4" src="about:blank"></iframe>
|
||||
<iframe id="5" src="about:blank"></iframe>
|
||||
</article>
|
||||
</body>
|
||||
</html>
|
1
reader/scraper/testdata/iframe.html-result
vendored
Normal file
1
reader/scraper/testdata/iframe.html-result
vendored
Normal file
|
@ -0,0 +1 @@
|
|||
<iframe id="1" src="about:blank"></iframe><iframe id="2" src="about:blank"></iframe><iframe id="3" src="about:blank"></iframe><iframe id="4" src="about:blank"></iframe><iframe id="5" src="about:blank"></iframe>
|
12
reader/scraper/testdata/img.html
vendored
Normal file
12
reader/scraper/testdata/img.html
vendored
Normal file
|
@ -0,0 +1,12 @@
|
|||
<!DOCTYPE html>
|
||||
<html lang="en-US">
|
||||
<body>
|
||||
<article>
|
||||
<img id="1" src="#" alt="" />
|
||||
<img id="2" src="#" alt="" />
|
||||
<img id="3" src="#" alt="" />
|
||||
<img id="4" src="#" alt="" />
|
||||
<img id="5" src="#" alt="" />
|
||||
</article>
|
||||
</body>
|
||||
</html>
|
1
reader/scraper/testdata/img.html-result
vendored
Normal file
1
reader/scraper/testdata/img.html-result
vendored
Normal file
|
@ -0,0 +1 @@
|
|||
<img id="1" src="#" alt=""/><img id="2" src="#" alt=""/><img id="3" src="#" alt=""/><img id="4" src="#" alt=""/><img id="5" src="#" alt=""/>
|
10
reader/scraper/testdata/p.html
vendored
Normal file
10
reader/scraper/testdata/p.html
vendored
Normal file
|
@ -0,0 +1,10 @@
|
|||
<!DOCTYPE html>
|
||||
<html lang="en-US">
|
||||
<body>
|
||||
<article>
|
||||
<p>Lorem ipsum dolor sit amet, consectetuer adipiscing ept.</p>
|
||||
<p>Apquam tincidunt mauris eu risus.</p>
|
||||
<p>Vestibulum auctor dapibus neque.</p>
|
||||
</article>
|
||||
</body>
|
||||
</html>
|
1
reader/scraper/testdata/p.html-result
vendored
Normal file
1
reader/scraper/testdata/p.html-result
vendored
Normal file
|
@ -0,0 +1 @@
|
|||
<p>Lorem ipsum dolor sit amet, consectetuer adipiscing ept.</p><p>Apquam tincidunt mauris eu risus.</p><p>Vestibulum auctor dapibus neque.</p>
|
Loading…
Add table
Reference in a new issue