From 31435ef83e8fc1fd90126bd8eed03073ee600753 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fr=C3=A9d=C3=A9ric=20Guillot?= Date: Tue, 29 Sep 2020 22:22:25 -0700 Subject: [PATCH] Add rewrite rule to fix Medium.com images --- reader/readability/readability.go | 2 +- reader/rewrite/rewrite_functions.go | 15 ++++++ reader/rewrite/rewriter.go | 2 + reader/rewrite/rewriter_test.go | 34 ++++++++++++- reader/rewrite/rules.go | 1 + reader/scraper/rules.go | 75 ++++++++++++++--------------- 6 files changed, 89 insertions(+), 40 deletions(-) diff --git a/reader/readability/readability.go b/reader/readability/readability.go index d58ff221..b9891a19 100644 --- a/reader/readability/readability.go +++ b/reader/readability/readability.go @@ -76,7 +76,7 @@ func ExtractContent(page io.Reader) (string, error) { return "", err } - document.Find("script,style,noscript").Each(func(i int, s *goquery.Selection) { + document.Find("script,style").Each(func(i int, s *goquery.Selection) { removeNodes(s) }) diff --git a/reader/rewrite/rewrite_functions.go b/reader/rewrite/rewrite_functions.go index 55016a71..ea162af5 100644 --- a/reader/rewrite/rewrite_functions.go +++ b/reader/rewrite/rewrite_functions.go @@ -139,6 +139,21 @@ func addDynamicImage(entryURL, entryContent string) string { return entryContent } +func fixMediumImages(entryURL, entryContent string) string { + doc, err := goquery.NewDocumentFromReader(strings.NewReader(entryContent)) + if err != nil { + return entryContent + } + + doc.Find("figure.paragraph-image").Each(func(i int, paragraphImage *goquery.Selection) { + noscriptElement := paragraphImage.Find("noscript") + paragraphImage.ReplaceWithHtml(noscriptElement.Text()) + }) + + output, _ := doc.Find("body").First().Html() + return output +} + func addYoutubeVideo(entryURL, entryContent string) string { matches := youtubeRegex.FindStringSubmatch(entryURL) diff --git a/reader/rewrite/rewriter.go b/reader/rewrite/rewriter.go index a34b0adf..8c26719c 100644 --- a/reader/rewrite/rewriter.go +++ b/reader/rewrite/rewriter.go @@ -43,6 +43,8 @@ func Rewriter(entryURL, entryContent, customRewriteRules string) string { entryContent = replaceLineFeeds(entryContent) case "convert_text_link", "convert_text_links": entryContent = replaceTextLinks(entryContent) + case "fix_medium_images": + entryContent = fixMediumImages(entryURL, entryContent) } } diff --git a/reader/rewrite/rewriter_test.go b/reader/rewrite/rewriter_test.go index 8c0db6cf..04f4c657 100644 --- a/reader/rewrite/rewriter_test.go +++ b/reader/rewrite/rewriter_test.go @@ -4,7 +4,10 @@ package rewrite // import "miniflux.app/reader/rewrite" -import "testing" +import ( + "strings" + "testing" +) func TestReplaceTextLinks(t *testing.T) { scenarios := map[string]string{ @@ -176,3 +179,32 @@ func TestConvertTextLinkRewriteRule(t *testing.T) { t.Errorf(`Not expected output: got %q instead of %q`, output, expected) } } + +func TestMediumImage(t *testing.T) { + content := ` +
+
+
+
+
+
+ Image for post +
+ Image for post + +
+
+
+
+
+ ` + expected := `Image for post` + output := Rewriter("https://example.org/article", content, "fix_medium_images") + output = strings.TrimSpace(output) + + if expected != output { + t.Errorf(`Not expected output: %s`, output) + } +} diff --git a/reader/rewrite/rules.go b/reader/rewrite/rules.go index fb644a48..b954b765 100644 --- a/reader/rewrite/rules.go +++ b/reader/rewrite/rules.go @@ -30,4 +30,5 @@ var predefinedRules = map[string]string{ "invidio.us": "add_invidious_video", "xkcd.com": "add_image_title", "framatube.org": "nl2br,convert_text_link", + "medium.com": "fix_medium_images", } diff --git a/reader/scraper/rules.go b/reader/scraper/rules.go index a8dda7d8..e0f2f582 100644 --- a/reader/scraper/rules.go +++ b/reader/scraper/rules.go @@ -7,43 +7,42 @@ package scraper // import "miniflux.app/reader/scraper" // List of predefined scraper rules (alphabetically sorted) // domain => CSS selectors var predefinedRules = map[string]string{ - "bbc.co.uk": "div.vxp-column--single, div.story-body__inner, ul.gallery-images__list", - "cbc.ca": ".story-content", - "darkreading.com": "#article-main:not(header)", - "developpez.com": "div[itemprop=articleBody]", - "dilbert.com": "span.comic-title-name, img.img-comic", + "bbc.co.uk": "div.vxp-column--single, div.story-body__inner, ul.gallery-images__list", + "cbc.ca": ".story-content", + "darkreading.com": "#article-main:not(header)", + "developpez.com": "div[itemprop=articleBody]", + "dilbert.com": "span.comic-title-name, img.img-comic", "financialsamurai.com": "article", - "francetvinfo.fr": ".text", - "github.com": "article.entry-content", - "heise.de": "header .article-content__lead, header .article-image, div.article-layout__content.article-content", - "igen.fr": "section.corps", - "ing.dk": "section.body", - "lapresse.ca": ".amorce, .entry", - "lemonde.fr": "article", - "lepoint.fr": ".art-text", - "lesjoiesducode.fr": ".blog-post-content img", - "lesnumeriques.com": ".text", - "linux.com": "div.content, div[property]", - "medium.com": ".section-content", - "mac4ever.com": "div[itemprop=articleBody]", - "monwindows.com": ".blog-post-body", - "npr.org": "#storytext", - "oneindia.com": ".io-article-body", - "opensource.com": "div[property]", - "osnews.com": "div.newscontent1", - "phoronix.com": "div.content", - "pseudo-sciences.org": "#art_main", - "raywenderlich.com": "article", - "slate.fr": ".field-items", - "techcrunch.com": "div.article-entry", - "theoatmeal.com": "div#comic", - "theregister.co.uk": "#body", - "turnoff.us": "article.post-content", - "universfreebox.com": "#corps_corps", - "version2.dk": "section.body", - "wdwnt.com": "div.entry-content", - "wired.com": "main figure, article", - "zeit.de": ".summary, .article-body", - "zdnet.com": "div.storyBody", - "openingsource.org": "article.suxing-popup-gallery", + "francetvinfo.fr": ".text", + "github.com": "article.entry-content", + "heise.de": "header .article-content__lead, header .article-image, div.article-layout__content.article-content", + "igen.fr": "section.corps", + "ing.dk": "section.body", + "lapresse.ca": ".amorce, .entry", + "lemonde.fr": "article", + "lepoint.fr": ".art-text", + "lesjoiesducode.fr": ".blog-post-content img", + "lesnumeriques.com": ".text", + "linux.com": "div.content, div[property]", + "mac4ever.com": "div[itemprop=articleBody]", + "monwindows.com": ".blog-post-body", + "npr.org": "#storytext", + "oneindia.com": ".io-article-body", + "opensource.com": "div[property]", + "osnews.com": "div.newscontent1", + "phoronix.com": "div.content", + "pseudo-sciences.org": "#art_main", + "raywenderlich.com": "article", + "slate.fr": ".field-items", + "techcrunch.com": "div.article-entry", + "theoatmeal.com": "div#comic", + "theregister.co.uk": "#body", + "turnoff.us": "article.post-content", + "universfreebox.com": "#corps_corps", + "version2.dk": "section.body", + "wdwnt.com": "div.entry-content", + "wired.com": "main figure, article", + "zeit.de": ".summary, .article-body", + "zdnet.com": "div.storyBody", + "openingsource.org": "article.suxing-popup-gallery", }