From 48aa0d07ef98fe5ebc1a762a876f140cb9528a6a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fr=C3=A9d=C3=A9ric=20Guillot?= Date: Thu, 4 Jan 2018 19:32:24 -0800 Subject: [PATCH] Add more scraper rules --- reader/scraper/rules.go | 46 +++++++++++++++++++++++++++-------------- 1 file changed, 30 insertions(+), 16 deletions(-) diff --git a/reader/scraper/rules.go b/reader/scraper/rules.go index 6fddd1e7..f988c244 100644 --- a/reader/scraper/rules.go +++ b/reader/scraper/rules.go @@ -7,20 +7,34 @@ package scraper // List of predefined scraper rules (alphabetically sorted) // domain => CSS selectors var predefinedRules = map[string]string{ - "cbc.ca": ".story-content", - "github.com": "article.entry-content", - "igen.fr": "section.corps", - "ing.dk": "section.body", - "lapresse.ca": ".amorce, .entry", - "lemonde.fr": "div#articleBody", - "lesjoiesducode.fr": ".blog-post-content img", - "linux.com": "div.content, div[property]", - "medium.com": ".section-content", - "opensource.com": "div[property]", - "osnews.com": "div.newscontent1", - "phoronix.com": "div.content", - "techcrunch.com": "div.article-entry", - "theregister.co.uk": "#body", - "version2.dk": "section.body", - "wired.com": "main figure, article", + "cbc.ca": ".story-content", + "developpez.com": "div[itemprop=articleBody]", + "francetvinfo.fr": ".text", + "github.com": "article.entry-content", + "heise.de": "div.article-content", + "igen.fr": "section.corps", + "ing.dk": "section.body", + "lapresse.ca": ".amorce, .entry", + "lemonde.fr": "div#articleBody", + "lepoint.fr": ".art-text", + "lesjoiesducode.fr": ".blog-post-content img", + "lesnumeriques.com": ".text", + "linux.com": "div.content, div[property]", + "medium.com": ".section-content", + "mac4ever.com": "div[itemprop=articleBody]", + "monwindows.com": ".blog-post-body", + "npr.org": "#storytext", + "oneindia.com": ".io-article-body", + "opensource.com": "div[property]", + "osnews.com": "div.newscontent1", + "phoronix.com": "div.content", + "pseudo-sciences.org": "#art_main", + "slate.fr": ".field-items", + "techcrunch.com": "div.article-entry", + "theregister.co.uk": "#body", + "universfreebox.com": "#corps_corps", + "version2.dk": "section.body", + "wired.com": "main figure, article", + "zeit.de": ".summary, .article-body", + "zdnet.com": "div.storyBody", }