From 311a133ab87423958d850a91dea0b49543b97c2a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Fr=C3=A9d=C3=A9ric=20Guillot?=
Date: Sun, 2 Dec 2018 20:51:06 -0800
Subject: [PATCH] Refactor manual entry scraper

---
 reader/feed/handler.go                 |  6 +++---
 reader/{filter => processor}/doc.go    |  4 ++--
 reader/{filter => processor}/filter.go | 27 +++++++++++++++++++++-----
 reader/rewrite/rewriter.go             |  3 +++
 reader/scraper/scraper.go              |  2 +-
 ui/entry_scraper.go                    | 11 +++--------
 6 files changed, 34 insertions(+), 19 deletions(-)
 rename reader/{filter => processor}/doc.go (57%)
 rename reader/{filter => processor}/filter.go (56%)

diff --git a/reader/feed/handler.go b/reader/feed/handler.go
index 401048bb..587f76da 100644
--- a/reader/feed/handler.go
+++ b/reader/feed/handler.go
@@ -14,9 +14,9 @@ import (
 	"miniflux.app/logger"
 	"miniflux.app/model"
 	"miniflux.app/reader/browser"
-	"miniflux.app/reader/filter"
 	"miniflux.app/reader/icon"
 	"miniflux.app/reader/parser"
+	"miniflux.app/reader/processor"
 	"miniflux.app/storage"
 	"miniflux.app/timer"
 )
@@ -63,7 +63,7 @@ func (h *Handler) CreateFeed(userID, categoryID int64, url string, crawler bool,
 	subscription.WithClientResponse(response)
 	subscription.CheckedNow()
 
-	filter.Apply(h.store, subscription)
+	processor.ProcessFeedEntries(h.store, subscription)
 
 	if storeErr := h.store.CreateFeed(subscription); storeErr != nil {
 		return nil, storeErr
@@ -114,7 +114,7 @@ func (h *Handler) RefreshFeed(userID, feedID int64) error {
 		}
 
 		originalFeed.Entries = updatedFeed.Entries
-		filter.Apply(h.store, originalFeed)
+		processor.ProcessFeedEntries(h.store, originalFeed)
 
 		// We don't update existing entries when the crawler is enabled (we crawl only inexisting entries).
 		if storeErr := h.store.UpdateEntries(originalFeed.UserID, originalFeed.ID, originalFeed.Entries, !originalFeed.Crawler); storeErr != nil {
diff --git a/reader/filter/doc.go b/reader/processor/doc.go
similarity index 57%
rename from reader/filter/doc.go
rename to reader/processor/doc.go
index 92c18c01..df57e980 100644
--- a/reader/filter/doc.go
+++ b/reader/processor/doc.go
@@ -4,7 +4,7 @@
 
 /*
 
-Package filter applies a set of filters to feed entries.
+Package processor applies rules and sanitizes content for feed entries.
 
 */
-package filter // import "miniflux.app/reader/filter"
+package processor // import "miniflux.app/reader/processor"
diff --git a/reader/filter/filter.go b/reader/processor/filter.go
similarity index 56%
rename from reader/filter/filter.go
rename to reader/processor/filter.go
index a0b94298..faceccc4 100644
--- a/reader/filter/filter.go
+++ b/reader/processor/filter.go
@@ -2,7 +2,7 @@
 // Use of this source code is governed by the Apache 2.0
 // license that can be found in the LICENSE file.
 
-package filter
+package processor
 
 import (
 	"miniflux.app/logger"
@@ -13,15 +13,15 @@ import (
 	"miniflux.app/storage"
 )
 
-// Apply executes all entry filters.
-func Apply(store *storage.Storage, feed *model.Feed) {
+// ProcessFeedEntries downloads the original web page for entries and applies filters.
+func ProcessFeedEntries(store *storage.Storage, feed *model.Feed) {
 	for _, entry := range feed.Entries {
 		if feed.Crawler {
 			if !store.EntryURLExists(feed.UserID, entry.URL) {
 				content, err := scraper.Fetch(entry.URL, feed.ScraperRules, feed.UserAgent)
 				if err != nil {
-					logger.Error("Unable to crawl this entry: %q => %v", entry.URL, err)
-				} else {
+					logger.Error(`[Filter] Unable to crawl this entry: %q => %v`, entry.URL, err)
+				} else if content != "" {
 					// We replace the entry content only if the scraper doesn't return any error.
 					entry.Content = content
 				}
@@ -34,3 +34,20 @@ func Apply(store *storage.Storage, feed *model.Feed) {
 		entry.Content = sanitizer.Sanitize(entry.URL, entry.Content)
 	}
 }
+
+// ProcessEntryWebPage downloads the entry web page and applies rewrite rules.
+func ProcessEntryWebPage(entry *model.Entry) error {
+	content, err := scraper.Fetch(entry.URL, entry.Feed.ScraperRules, entry.Feed.UserAgent)
+	if err != nil {
+		return err
+	}
+
+	content = rewrite.Rewriter(entry.URL, content, entry.Feed.RewriteRules)
+	content = sanitizer.Sanitize(entry.URL, content)
+
+	if content != "" {
+		entry.Content = content
+	}
+
+	return nil
+}
diff --git a/reader/rewrite/rewriter.go b/reader/rewrite/rewriter.go
index e9b1fae1..25c742d5 100644
--- a/reader/rewrite/rewriter.go
+++ b/reader/rewrite/rewriter.go
@@ -7,6 +7,7 @@ package rewrite // import "miniflux.app/reader/rewrite"
 import (
 	"strings"
 
+	"miniflux.app/logger"
 	"miniflux.app/url"
 )
 
@@ -20,6 +21,8 @@ func Rewriter(entryURL, entryContent, customRewriteRules string) string {
 	rules := strings.Split(rulesList, ",")
 	rules = append(rules, "add_pdf_download_link")
 
+	logger.Debug(`[Rewrite] Applying rules %v for %q`, rules, entryURL)
+
 	for _, rule := range rules {
 		switch strings.TrimSpace(rule) {
 		case "add_image_title":
diff --git a/reader/scraper/scraper.go b/reader/scraper/scraper.go
index 58f37d5a..fd5ba337 100644
--- a/reader/scraper/scraper.go
+++ b/reader/scraper/scraper.go
@@ -54,7 +54,7 @@ func Fetch(websiteURL, rules, userAgent string) (string, error) {
 		logger.Debug(`[Scraper] Using rules %q for %q`, rules, websiteURL)
 		content, err = scrapContent(response.Body, rules)
 	} else {
-		logger.Debug(`[Scraper] Using readability for "%q`, websiteURL)
+		logger.Debug(`[Scraper] Using readability for %q`, websiteURL)
 		content, err = readability.ExtractContent(response.Body)
 	}
 
diff --git a/ui/entry_scraper.go b/ui/entry_scraper.go
index 48fb7dbb..0e7904a2 100644
--- a/ui/entry_scraper.go
+++ b/ui/entry_scraper.go
@@ -6,12 +6,11 @@ package ui // import "miniflux.app/ui"
 
 import (
 	"net/http"
 
+	"miniflux.app/http/request"
 	"miniflux.app/http/response/json"
 	"miniflux.app/model"
-	"miniflux.app/reader/rewrite"
-	"miniflux.app/reader/sanitizer"
-	"miniflux.app/reader/scraper"
+	"miniflux.app/reader/processor"
 )
 
 func (h *handler) fetchContent(w http.ResponseWriter, r *http.Request) {
@@ -31,15 +30,11 @@ func (h *handler) fetchContent(w http.ResponseWriter, r *http.Request) {
 		return
 	}
 
-	content, err := scraper.Fetch(entry.URL, entry.Feed.ScraperRules, entry.Feed.UserAgent)
-	if err != nil {
+	if err := processor.ProcessEntryWebPage(entry); err != nil {
 		json.ServerError(w, r, err)
 		return
 	}
-
-	content = rewrite.Rewriter(entry.URL, content, entry.Feed.RewriteRules)
-	entry.Content = sanitizer.Sanitize(entry.URL, content)
 
 	h.store.UpdateEntryContent(entry)
 	json.OK(w, r, map[string]string{"content": entry.Content})
 }
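
Reviewer note: below is a minimal, hypothetical sketch (not part of the patch) showing how the new processor.ProcessEntryWebPage entry point can be driven on its own. The entry URL and feed rules are placeholders, and it assumes the entry's Feed relation is populated, as the ui handler above guarantees by loading the entry through the storage layer.

package main

import (
	"fmt"

	"miniflux.app/model"
	"miniflux.app/reader/processor"
)

func main() {
	// Hypothetical entry; in the real handler this comes from the storage
	// layer with its Feed relation already loaded.
	entry := &model.Entry{
		URL: "https://example.org/article",
		Feed: &model.Feed{
			ScraperRules: "", // empty: fall back to readability extraction
			RewriteRules: "", // empty: fall back to predefined rewrite rules
			UserAgent:    "", // empty: use the default user agent
		},
	}

	// ProcessEntryWebPage fetches the page, applies rewrite rules, and
	// sanitizes the result; entry.Content is replaced only when the
	// scraped content is non-empty.
	if err := processor.ProcessEntryWebPage(entry); err != nil {
		fmt.Printf("unable to scrape entry %q: %v\n", entry.URL, err)
		return
	}

	fmt.Println(entry.Content)
}

The design point of the refactor is visible here: callers no longer wire scraper, rewrite, and sanitizer together themselves; the processor package owns that pipeline for both feed refreshes (ProcessFeedEntries) and manual scraping (ProcessEntryWebPage).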