From 93596c1218e18e3350e4a41b3c6b1f9b681df2b9 Mon Sep 17 00:00:00 2001 From: Lukas Dietrich Date: Wed, 1 Sep 2021 23:42:23 +0200 Subject: [PATCH] Add rewrite rule to remove dom elements --- reader/rewrite/rewrite_functions.go | 12 +++ reader/rewrite/rewriter.go | 110 ++++++++++++++++++---------- reader/rewrite/rewriter_test.go | 28 ++++++- 3 files changed, 112 insertions(+), 38 deletions(-) diff --git a/reader/rewrite/rewrite_functions.go b/reader/rewrite/rewrite_functions.go index bd257c90..c01545c9 100644 --- a/reader/rewrite/rewrite_functions.go +++ b/reader/rewrite/rewrite_functions.go @@ -229,3 +229,15 @@ func replaceCustom(entryContent string, searchTerm string, replaceTerm string) s } return entryContent } + +func removeCustom(entryContent string, selector string) string { + doc, err := goquery.NewDocumentFromReader(strings.NewReader(entryContent)) + if err != nil { + return entryContent + } + + doc.Find(selector).Remove() + + output, _ := doc.Find("body").First().Html() + return output +} diff --git a/reader/rewrite/rewriter.go b/reader/rewrite/rewriter.go index 77ac147d..27058b55 100644 --- a/reader/rewrite/rewriter.go +++ b/reader/rewrite/rewriter.go @@ -5,14 +5,18 @@ package rewrite // import "miniflux.app/reader/rewrite" import ( - "regexp" + "strconv" "strings" + "text/scanner" "miniflux.app/logger" "miniflux.app/url" ) -var customReplaceRuleRegex = regexp.MustCompile(`replace\("(.*)"\|"(.*)"\)`) +type rule struct { + name string + args []string +} // Rewriter modify item contents with a set of rewriting rules. func Rewriter(entryURL, entryContent, customRewriteRules string) string { @@ -21,46 +25,78 @@ func Rewriter(entryURL, entryContent, customRewriteRules string) string { rulesList = customRewriteRules } - rules := strings.Split(rulesList, ",") - rules = append(rules, "add_pdf_download_link") + rules := parseRules(rulesList) + rules = append(rules, rule{name: "add_pdf_download_link"}) logger.Debug(`[Rewrite] Applying rules %v for %q`, rules, entryURL) for _, rule := range rules { - rule := strings.TrimSpace(rule) - switch rule { - case "add_image_title": - entryContent = addImageTitle(entryURL, entryContent) - case "add_mailto_subject": - entryContent = addMailtoSubject(entryURL, entryContent) - case "add_dynamic_image": - entryContent = addDynamicImage(entryURL, entryContent) - case "add_youtube_video": - entryContent = addYoutubeVideo(entryURL, entryContent) - case "add_invidious_video": - entryContent = addInvidiousVideo(entryURL, entryContent) - case "add_youtube_video_using_invidious_player": - entryContent = addYoutubeVideoUsingInvidiousPlayer(entryURL, entryContent) - case "add_pdf_download_link": - entryContent = addPDFLink(entryURL, entryContent) - case "nl2br": - entryContent = replaceLineFeeds(entryContent) - case "convert_text_link", "convert_text_links": - entryContent = replaceTextLinks(entryContent) - case "fix_medium_images": - entryContent = fixMediumImages(entryURL, entryContent) - case "use_noscript_figure_images": - entryContent = useNoScriptImages(entryURL, entryContent) - default: - if strings.Contains(rule, "replace") { - // Format: replace("search-term"|"replace-term") - args := customReplaceRuleRegex.FindStringSubmatch(rule) - if len(args) >= 3 { - entryContent = replaceCustom(entryContent, args[1], args[2]) - } else { - logger.Debug("[Rewrite] Cannot find search and replace terms for replace rule %s", rule) - } + entryContent = applyRule(entryURL, entryContent, rule) + } + + return entryContent +} + +func parseRules(rulesText string) (rules []rule) { + scan := scanner.Scanner{Mode: scanner.ScanIdents | scanner.ScanStrings} + scan.Init(strings.NewReader(rulesText)) + + for { + switch scan.Scan() { + case scanner.Ident: + rules = append(rules, rule{name: scan.TokenText()}) + + case scanner.String: + if l := len(rules) - 1; l >= 0 { + text := scan.TokenText() + text, _ = strconv.Unquote(text) + + rules[l].args = append(rules[l].args, text) } + + case scanner.EOF: + return + } + } +} + +func applyRule(entryURL, entryContent string, rule rule) string { + switch rule.name { + case "add_image_title": + entryContent = addImageTitle(entryURL, entryContent) + case "add_mailto_subject": + entryContent = addMailtoSubject(entryURL, entryContent) + case "add_dynamic_image": + entryContent = addDynamicImage(entryURL, entryContent) + case "add_youtube_video": + entryContent = addYoutubeVideo(entryURL, entryContent) + case "add_invidious_video": + entryContent = addInvidiousVideo(entryURL, entryContent) + case "add_youtube_video_using_invidious_player": + entryContent = addYoutubeVideoUsingInvidiousPlayer(entryURL, entryContent) + case "add_pdf_download_link": + entryContent = addPDFLink(entryURL, entryContent) + case "nl2br": + entryContent = replaceLineFeeds(entryContent) + case "convert_text_link", "convert_text_links": + entryContent = replaceTextLinks(entryContent) + case "fix_medium_images": + entryContent = fixMediumImages(entryURL, entryContent) + case "use_noscript_figure_images": + entryContent = useNoScriptImages(entryURL, entryContent) + case "replace": + // Format: replace("search-term"|"replace-term") + if len(rule.args) >= 2 { + entryContent = replaceCustom(entryContent, rule.args[0], rule.args[1]) + } else { + logger.Debug("[Rewrite] Cannot find search and replace terms for replace rule %s", rule) + } + case "remove": + // Format: remove("#selector > .element, .another") + if len(rule.args) >= 1 { + entryContent = removeCustom(entryContent, rule.args[0]) + } else { + logger.Debug("[Rewrite] Cannot find selector for remove rule %s", rule) } } diff --git a/reader/rewrite/rewriter_test.go b/reader/rewrite/rewriter_test.go index aebaf9f9..7f63473e 100644 --- a/reader/rewrite/rewriter_test.go +++ b/reader/rewrite/rewriter_test.go @@ -5,10 +5,26 @@ package rewrite // import "miniflux.app/reader/rewrite" import ( + "reflect" "strings" "testing" ) +func TestParseRules(t *testing.T) { + rulesText := `add_dynamic_image,replace("article/(.*).svg"|"article/$1.png"),remove(".spam, .ads:not(.keep)")` + expected := []rule{ + {name: "add_dynamic_image"}, + {name: "replace", args: []string{"article/(.*).svg", "article/$1.png"}}, + {name: "remove", args: []string{".spam, .ads:not(.keep)"}}, + } + + actual := parseRules(rulesText) + + if !reflect.DeepEqual(expected, actual) { + t.Errorf(`Parsed rules do not match expected rules: got %v instead of %v`, actual, expected) + } +} + func TestReplaceTextLinks(t *testing.T) { scenarios := map[string]string{ `This is a link to example.org`: `This is a link to example.org`, @@ -234,7 +250,17 @@ func TestRewriteNoScriptImageWithNoScriptTag(t *testing.T) { func TestRewriteReplaceCustom(t *testing.T) { content := `` expected := `` - output := Rewriter("https://example.org/artcle", content, `replace("article/(.*).svg"|"article/$1.png")`) + output := Rewriter("https://example.org/article", content, `replace("article/(.*).svg"|"article/$1.png")`) + + if expected != output { + t.Errorf(`Not expected output: %s`, output) + } +} + +func TestRewriteRemoveCustom(t *testing.T) { + content := `
Lorem Ipsum I dont want to see thisSuper important info
` + expected := `
Lorem Ipsum Super important info
` + output := Rewriter("https://example.org/article", content, `remove(".spam, .ads:not(.keep)")`) if expected != output { t.Errorf(`Not expected output: %s`, output)