Improve content scraper
This commit is contained in:
parent
827683ab59
commit
c6d9eb3614
7 changed files with 39 additions and 3 deletions
|
@ -5,6 +5,7 @@
|
|||
package rewrite
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"regexp"
|
||||
"strings"
|
||||
|
||||
|
@ -38,3 +39,10 @@ func addYoutubeVideo(entryURL, entryContent string) string {
|
|||
}
|
||||
return entryContent
|
||||
}
|
||||
|
||||
// addPDFLink prepends a direct download link to the entry content when the
// entry URL points at a PDF document; other URLs are returned untouched.
func addPDFLink(entryURL, entryContent string) string {
	if !strings.HasSuffix(entryURL, ".pdf") {
		return entryContent
	}

	return fmt.Sprintf(`<a href="%s">PDF</a><br>%s`, entryURL, entryContent)
}
|
||||
|
|
|
@ -18,12 +18,16 @@ func Rewriter(entryURL, entryContent, customRewriteRules string) string {
|
|||
}
|
||||
|
||||
rules := strings.Split(rulesList, ",")
|
||||
rules = append(rules, "add_pdf_download_link")
|
||||
|
||||
for _, rule := range rules {
|
||||
switch strings.TrimSpace(rule) {
|
||||
case "add_image_title":
|
||||
entryContent = addImageTitle(entryURL, entryContent)
|
||||
case "add_youtube_video":
|
||||
entryContent = addYoutubeVideo(entryURL, entryContent)
|
||||
case "add_pdf_download_link":
|
||||
entryContent = addPDFLink(entryURL, entryContent)
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -58,3 +58,13 @@ func TestRewriteWithXkcdAndNoImage(t *testing.T) {
|
|||
t.Errorf(`Not expected output: got "%s" instead of "%s"`, output, expected)
|
||||
}
|
||||
}
|
||||
|
||||
func TestRewriteWithPDFLink(t *testing.T) {
|
||||
description := "test"
|
||||
output := Rewriter("https://example.org/document.pdf", description, ``)
|
||||
expected := `<a href="https://example.org/document.pdf">PDF</a><br>test`
|
||||
|
||||
if expected != output {
|
||||
t.Errorf(`Not expected output: got "%s" instead of "%s"`, output, expected)
|
||||
}
|
||||
}
|
||||
|
|
|
@ -7,10 +7,16 @@ package scraper
|
|||
// List of predefined scraper rules (alphabetically sorted).
// predefinedRules maps a website domain to the CSS selectors that locate the
// main article content on that site; looked up when an entry has no
// user-supplied scraper rules.
// domain => CSS selectors
var predefinedRules = map[string]string{
	"github.com":        "article.entry-content",
	"igen.fr":           "section.corps",
	"lemonde.fr":        "div#articleBody",
	"lesjoiesducode.fr": ".blog-post-content img",
	"linux.com":         "div.content, div[property]",
	"medium.com":        ".section-content",
	"opensource.com":    "div[property]",
	"osnews.com":        "div.newscontent1",
	"phoronix.com":      "div.content",
	"techcrunch.com":    "div.article-entry",
	"theregister.co.uk": "#body",
	"wired.com":         "main figure, article",
}
|
||||
|
|
|
@ -33,6 +33,9 @@ func Fetch(websiteURL, rules string) (string, error) {
|
|||
return "", err
|
||||
}
|
||||
|
||||
// The entry URL could be a redirect somewhere else.
|
||||
websiteURL = response.EffectiveURL
|
||||
|
||||
if rules == "" {
|
||||
rules = getPredefinedScraperRules(websiteURL)
|
||||
}
|
||||
|
|
File diff suppressed because one or more lines are too long
|
@ -568,6 +568,11 @@ a.button {
|
|||
max-width: 100%;
|
||||
}
|
||||
|
||||
/* Give figures embedded in entry content some vertical breathing room. */
.entry-content figure {
    margin-top: 15px;
    margin-bottom: 15px;
}

/* Outline images that are wrapped in a figure element. */
.entry-content figure img {
    border: 1px solid #000;
}
|
||||
|
|
Loading…
Reference in a new issue