Add rewrite rule to fix Medium.com images
This commit is contained in:
parent
d75ff0c5ab
commit
31435ef83e
6 changed files with 89 additions and 40 deletions
|
@ -76,7 +76,7 @@ func ExtractContent(page io.Reader) (string, error) {
|
|||
return "", err
|
||||
}
|
||||
|
||||
document.Find("script,style,noscript").Each(func(i int, s *goquery.Selection) {
|
||||
document.Find("script,style").Each(func(i int, s *goquery.Selection) {
|
||||
removeNodes(s)
|
||||
})
|
||||
|
||||
|
|
|
@ -139,6 +139,21 @@ func addDynamicImage(entryURL, entryContent string) string {
|
|||
return entryContent
|
||||
}
|
||||
|
||||
func fixMediumImages(entryURL, entryContent string) string {
|
||||
doc, err := goquery.NewDocumentFromReader(strings.NewReader(entryContent))
|
||||
if err != nil {
|
||||
return entryContent
|
||||
}
|
||||
|
||||
doc.Find("figure.paragraph-image").Each(func(i int, paragraphImage *goquery.Selection) {
|
||||
noscriptElement := paragraphImage.Find("noscript")
|
||||
paragraphImage.ReplaceWithHtml(noscriptElement.Text())
|
||||
})
|
||||
|
||||
output, _ := doc.Find("body").First().Html()
|
||||
return output
|
||||
}
|
||||
|
||||
func addYoutubeVideo(entryURL, entryContent string) string {
|
||||
matches := youtubeRegex.FindStringSubmatch(entryURL)
|
||||
|
||||
|
|
|
@ -43,6 +43,8 @@ func Rewriter(entryURL, entryContent, customRewriteRules string) string {
|
|||
entryContent = replaceLineFeeds(entryContent)
|
||||
case "convert_text_link", "convert_text_links":
|
||||
entryContent = replaceTextLinks(entryContent)
|
||||
case "fix_medium_images":
|
||||
entryContent = fixMediumImages(entryURL, entryContent)
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -4,7 +4,10 @@
|
|||
|
||||
package rewrite // import "miniflux.app/reader/rewrite"
|
||||
|
||||
import "testing"
|
||||
import (
|
||||
"strings"
|
||||
"testing"
|
||||
)
|
||||
|
||||
func TestReplaceTextLinks(t *testing.T) {
|
||||
scenarios := map[string]string{
|
||||
|
@ -176,3 +179,32 @@ func TestConvertTextLinkRewriteRule(t *testing.T) {
|
|||
t.Errorf(`Not expected output: got %q instead of %q`, output, expected)
|
||||
}
|
||||
}
|
||||
|
||||
func TestMediumImage(t *testing.T) {
|
||||
content := `
|
||||
<figure class="ht hu hv hw hx hy cy cz paragraph-image">
|
||||
<div class="hz ia ib ic aj">
|
||||
<div class="cy cz hs">
|
||||
<div class="ii s ib ij">
|
||||
<div class="ik il s">
|
||||
<div class="id ie t u v if aj bk ig ih">
|
||||
<img alt="Image for post" class="t u v if aj im in io" src="https://miro.medium.com/max/60/1*ephLSqSzQYLvb7faDwzRbw.jpeg?q=20" width="1280" height="720"/>
|
||||
</div>
|
||||
<img alt="Image for post" class="id ie t u v if aj c" width="1280" height="720"/>
|
||||
<noscript>
|
||||
<img alt="Image for post" class="t u v if aj" src="https://miro.medium.com/max/2560/1*ephLSqSzQYLvb7faDwzRbw.jpeg" width="1280" height="720" srcSet="https://miro.medium.com/max/552/1*ephLSqSzQYLvb7faDwzRbw.jpeg 276w, https://miro.medium.com/max/1104/1*ephLSqSzQYLvb7faDwzRbw.jpeg 552w, https://miro.medium.com/max/1280/1*ephLSqSzQYLvb7faDwzRbw.jpeg 640w, https://miro.medium.com/max/1400/1*ephLSqSzQYLvb7faDwzRbw.jpeg 700w" sizes="700px"/>
|
||||
</noscript>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</figure>
|
||||
`
|
||||
expected := `<img alt="Image for post" class="t u v if aj" src="https://miro.medium.com/max/2560/1*ephLSqSzQYLvb7faDwzRbw.jpeg" width="1280" height="720" srcset="https://miro.medium.com/max/552/1*ephLSqSzQYLvb7faDwzRbw.jpeg 276w, https://miro.medium.com/max/1104/1*ephLSqSzQYLvb7faDwzRbw.jpeg 552w, https://miro.medium.com/max/1280/1*ephLSqSzQYLvb7faDwzRbw.jpeg 640w, https://miro.medium.com/max/1400/1*ephLSqSzQYLvb7faDwzRbw.jpeg 700w" sizes="700px"/>`
|
||||
output := Rewriter("https://example.org/article", content, "fix_medium_images")
|
||||
output = strings.TrimSpace(output)
|
||||
|
||||
if expected != output {
|
||||
t.Errorf(`Not expected output: %s`, output)
|
||||
}
|
||||
}
|
||||
|
|
|
@ -30,4 +30,5 @@ var predefinedRules = map[string]string{
|
|||
"invidio.us": "add_invidious_video",
|
||||
"xkcd.com": "add_image_title",
|
||||
"framatube.org": "nl2br,convert_text_link",
|
||||
"medium.com": "fix_medium_images",
|
||||
}
|
||||
|
|
|
@ -7,43 +7,42 @@ package scraper // import "miniflux.app/reader/scraper"
|
|||
// List of predefined scraper rules (alphabetically sorted)
|
||||
// domain => CSS selectors
|
||||
var predefinedRules = map[string]string{
|
||||
"bbc.co.uk": "div.vxp-column--single, div.story-body__inner, ul.gallery-images__list",
|
||||
"cbc.ca": ".story-content",
|
||||
"darkreading.com": "#article-main:not(header)",
|
||||
"developpez.com": "div[itemprop=articleBody]",
|
||||
"dilbert.com": "span.comic-title-name, img.img-comic",
|
||||
"bbc.co.uk": "div.vxp-column--single, div.story-body__inner, ul.gallery-images__list",
|
||||
"cbc.ca": ".story-content",
|
||||
"darkreading.com": "#article-main:not(header)",
|
||||
"developpez.com": "div[itemprop=articleBody]",
|
||||
"dilbert.com": "span.comic-title-name, img.img-comic",
|
||||
"financialsamurai.com": "article",
|
||||
"francetvinfo.fr": ".text",
|
||||
"github.com": "article.entry-content",
|
||||
"heise.de": "header .article-content__lead, header .article-image, div.article-layout__content.article-content",
|
||||
"igen.fr": "section.corps",
|
||||
"ing.dk": "section.body",
|
||||
"lapresse.ca": ".amorce, .entry",
|
||||
"lemonde.fr": "article",
|
||||
"lepoint.fr": ".art-text",
|
||||
"lesjoiesducode.fr": ".blog-post-content img",
|
||||
"lesnumeriques.com": ".text",
|
||||
"linux.com": "div.content, div[property]",
|
||||
"medium.com": ".section-content",
|
||||
"mac4ever.com": "div[itemprop=articleBody]",
|
||||
"monwindows.com": ".blog-post-body",
|
||||
"npr.org": "#storytext",
|
||||
"oneindia.com": ".io-article-body",
|
||||
"opensource.com": "div[property]",
|
||||
"osnews.com": "div.newscontent1",
|
||||
"phoronix.com": "div.content",
|
||||
"pseudo-sciences.org": "#art_main",
|
||||
"raywenderlich.com": "article",
|
||||
"slate.fr": ".field-items",
|
||||
"techcrunch.com": "div.article-entry",
|
||||
"theoatmeal.com": "div#comic",
|
||||
"theregister.co.uk": "#body",
|
||||
"turnoff.us": "article.post-content",
|
||||
"universfreebox.com": "#corps_corps",
|
||||
"version2.dk": "section.body",
|
||||
"wdwnt.com": "div.entry-content",
|
||||
"wired.com": "main figure, article",
|
||||
"zeit.de": ".summary, .article-body",
|
||||
"zdnet.com": "div.storyBody",
|
||||
"openingsource.org": "article.suxing-popup-gallery",
|
||||
"francetvinfo.fr": ".text",
|
||||
"github.com": "article.entry-content",
|
||||
"heise.de": "header .article-content__lead, header .article-image, div.article-layout__content.article-content",
|
||||
"igen.fr": "section.corps",
|
||||
"ing.dk": "section.body",
|
||||
"lapresse.ca": ".amorce, .entry",
|
||||
"lemonde.fr": "article",
|
||||
"lepoint.fr": ".art-text",
|
||||
"lesjoiesducode.fr": ".blog-post-content img",
|
||||
"lesnumeriques.com": ".text",
|
||||
"linux.com": "div.content, div[property]",
|
||||
"mac4ever.com": "div[itemprop=articleBody]",
|
||||
"monwindows.com": ".blog-post-body",
|
||||
"npr.org": "#storytext",
|
||||
"oneindia.com": ".io-article-body",
|
||||
"opensource.com": "div[property]",
|
||||
"osnews.com": "div.newscontent1",
|
||||
"phoronix.com": "div.content",
|
||||
"pseudo-sciences.org": "#art_main",
|
||||
"raywenderlich.com": "article",
|
||||
"slate.fr": ".field-items",
|
||||
"techcrunch.com": "div.article-entry",
|
||||
"theoatmeal.com": "div#comic",
|
||||
"theregister.co.uk": "#body",
|
||||
"turnoff.us": "article.post-content",
|
||||
"universfreebox.com": "#corps_corps",
|
||||
"version2.dk": "section.body",
|
||||
"wdwnt.com": "div.entry-content",
|
||||
"wired.com": "main figure, article",
|
||||
"zeit.de": ".summary, .article-body",
|
||||
"zdnet.com": "div.storyBody",
|
||||
"openingsource.org": "article.suxing-popup-gallery",
|
||||
}
|
||||
|
|
Loading…
Reference in a new issue