dive into mark

From 1eb01b39e718439072fd8e81d98387a5db260ad6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fr=C3=A9d=C3=A9ric=20Guillot?= Date: Fri, 4 Mar 2022 16:49:44 -0800 Subject: [PATCH] Use truncated entry description as title if unavailable --- reader/atom/atom_03.go | 4 ++ reader/atom/atom_03_test.go | 58 +++++++++++++++++++++++++- reader/atom/atom_10.go | 5 +++ reader/atom/atom_10_test.go | 67 ++++++++++++++++++++++++++++++- reader/json/json.go | 22 ++++------ reader/json/parser_test.go | 64 ++++++++++++++++++++++++++--- reader/rss/parser_test.go | 26 +++++++++++- reader/rss/rss.go | 4 ++ reader/sanitizer/truncate.go | 23 +++++++++++ reader/sanitizer/truncate_test.go | 65 ++++++++++++++++++++++++++++++ 10 files changed, 314 insertions(+), 24 deletions(-) create mode 100644 reader/sanitizer/truncate.go create mode 100644 reader/sanitizer/truncate_test.go diff --git a/reader/atom/atom_03.go b/reader/atom/atom_03.go index d10d5cc8..3e8dc6d0 100644 --- a/reader/atom/atom_03.go +++ b/reader/atom/atom_03.go @@ -60,6 +60,10 @@ func (a *atom03Feed) Transform(baseURL string) *model.Feed { item.Author = a.Author.String() } + if item.Title == "" { + item.Title = sanitizer.TruncateHTML(item.Content, 100) + } + if item.Title == "" { item.Title = item.URL } diff --git a/reader/atom/atom_03_test.go b/reader/atom/atom_03_test.go index 75083d93..f88424c3 100644 --- a/reader/atom/atom_03_test.go +++ b/reader/atom/atom_03_test.go @@ -98,7 +98,7 @@ func TestParseAtom03WithoutFeedTitle(t *testing.T) { } } -func TestParseAtom03WithoutEntryTitle(t *testing.T) { +func TestParseAtom03WithoutEntryTitleButWithLink(t *testing.T) { data := ` dive into mark @@ -125,6 +125,62 @@ func TestParseAtom03WithoutEntryTitle(t *testing.T) { } } +func TestParseAtom03WithoutEntryTitleButWithSummary(t *testing.T) { + data := ` + + dive into mark + + 2003-12-13T18:30:02Z + Mark Pilgrim + + + tag:diveintomark.org,2003:3.2397 +

It's a test

+ + ` + + feed, err := Parse("http://diveintomark.org/", bytes.NewBufferString(data)) + if err != nil { + t.Fatal(err) + } + + if len(feed.Entries) != 1 { + t.Errorf("Incorrect number of entries, got: %d", len(feed.Entries)) + } + + if feed.Entries[0].Title != "It's a test" { + t.Errorf("Incorrect entry title, got: %s", feed.Entries[0].Title) + } +} + +func TestParseAtom03WithoutEntryTitleButWithXMLContent(t *testing.T) { + data := ` + + dive into mark + + 2003-12-13T18:30:02Z + Mark Pilgrim + + + tag:diveintomark.org,2003:3.2397 +

Some text.

+ + ` + + feed, err := Parse("http://diveintomark.org/", bytes.NewBufferString(data)) + if err != nil { + t.Fatal(err) + } + + if len(feed.Entries) != 1 { + t.Errorf("Incorrect number of entries, got: %d", len(feed.Entries)) + } + + if feed.Entries[0].Title != "Some text." { + t.Errorf("Incorrect entry title, got: %s", feed.Entries[0].Title) + } +} + func TestParseAtom03WithSummaryOnly(t *testing.T) { data := ` diff --git a/reader/atom/atom_10.go b/reader/atom/atom_10.go index 4b45603b..441f8a51 100644 --- a/reader/atom/atom_10.go +++ b/reader/atom/atom_10.go @@ -16,6 +16,7 @@ import ( "miniflux.app/model" "miniflux.app/reader/date" "miniflux.app/reader/media" + "miniflux.app/reader/sanitizer" "miniflux.app/url" ) @@ -64,6 +65,10 @@ func (a *atom10Feed) Transform(baseURL string) *model.Feed { item.Author = a.Authors.String() } + if item.Title == "" { + item.Title = sanitizer.TruncateHTML(item.Content, 100) + } + if item.Title == "" { item.Title = item.URL } diff --git a/reader/atom/atom_10_test.go b/reader/atom/atom_10_test.go index 51381765..a0ee192b 100644 --- a/reader/atom/atom_10_test.go +++ b/reader/atom/atom_10_test.go @@ -100,7 +100,37 @@ func TestParseFeedWithoutTitle(t *testing.T) { } } -func TestParseEntryWithoutTitle(t *testing.T) { +func TestParseEntryWithoutTitleButWithURL(t *testing.T) { + data := ` + + + Example Feed + + 2003-12-13T18:30:02Z + + John Doe + + urn:uuid:60a76c80-d399-11d9-b93C-0003939e0af6 + + + + urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a + 2003-12-13T18:30:02Z + + + ` + + feed, err := Parse("https://example.org/", bytes.NewBufferString(data)) + if err != nil { + t.Fatal(err) + } + + if feed.Entries[0].Title != "http://example.org/2003/12/13/atom03" { + t.Errorf("Incorrect entry title, got: %s", feed.Entries[0].Title) + } +} + +func TestParseEntryWithoutTitleButWithSummary(t *testing.T) { data := ` @@ -126,7 +156,40 @@ func TestParseEntryWithoutTitle(t *testing.T) { t.Fatal(err) } - if feed.Entries[0].Title != "http://example.org/2003/12/13/atom03" { + if feed.Entries[0].Title != "Some text." { + t.Errorf("Incorrect entry title, got: %s", feed.Entries[0].Title) + } +} + +func TestParseEntryWithoutTitleButWithXHTMLContent(t *testing.T) { + data := ` + + + Example Feed + + 2003-12-13T18:30:02Z + + John Doe + + urn:uuid:60a76c80-d399-11d9-b93C-0003939e0af6 + + + + urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a + 2003-12-13T18:30:02Z + +

AT&T bought by SBC!

+ + + + ` + + feed, err := Parse("https://example.org/", bytes.NewBufferString(data)) + if err != nil { + t.Fatal(err) + } + + if feed.Entries[0].Title != "AT&T bought by SBC!" { t.Errorf("Incorrect entry title, got: %s", feed.Entries[0].Title) } } diff --git a/reader/json/json.go b/reader/json/json.go index 31ba961c..589d7cb0 100644 --- a/reader/json/json.go +++ b/reader/json/json.go @@ -12,6 +12,7 @@ import ( "miniflux.app/logger" "miniflux.app/model" "miniflux.app/reader/date" + "miniflux.app/reader/sanitizer" "miniflux.app/url" ) @@ -130,9 +131,13 @@ func (j *jsonItem) GetHash() string { } func (j *jsonItem) GetTitle() string { - for _, value := range []string{j.Title, j.Summary, j.Text, j.URL} { + if j.Title != "" { + return j.Title + } + + for _, value := range []string{j.Summary, j.Text, j.HTML} { if value != "" { - return truncate(value) + return sanitizer.TruncateHTML(value, 100) } } @@ -186,16 +191,3 @@ func getAuthor(author jsonAuthor) string { return "" } - -func truncate(str string) string { - max := 100 - str = strings.TrimSpace(str) - - // Convert to runes to be safe with unicode - runes := []rune(str) - if len(runes) > max { - return string(runes[:max]) + "…" - } - - return str -} diff --git a/reader/json/parser_test.go b/reader/json/parser_test.go index 0bd6e6c7..5ba82d45 100644 --- a/reader/json/parser_test.go +++ b/reader/json/parser_test.go @@ -76,7 +76,7 @@ func TestParseJsonFeed(t *testing.T) { t.Errorf("Incorrect entry URL, got: %s", feed.Entries[1].URL) } - if feed.Entries[1].Title != "https://example.org/initial-post" { + if feed.Entries[1].Title != "Hello, world!" { t.Errorf(`Incorrect entry title, got: "%s"`, feed.Entries[1].Title) } @@ -398,7 +398,7 @@ func TestParseFeedItemWithoutID(t *testing.T) { } } -func TestParseFeedItemWithoutTitle(t *testing.T) { +func TestParseFeedItemWithoutTitleButWithURL(t *testing.T) { data := `{ "version": "https://jsonfeed.org/version/1", "title": "My Example Feed", @@ -425,7 +425,7 @@ func TestParseFeedItemWithoutTitle(t *testing.T) { } } -func TestParseTruncateItemTitle(t *testing.T) { +func TestParseFeedItemWithoutTitleButWithSummary(t *testing.T) { data := `{ "version": "https://jsonfeed.org/version/1", "title": "My Example Feed", @@ -433,7 +433,61 @@ func TestParseTruncateItemTitle(t *testing.T) { "feed_url": "https://example.org/feed.json", "items": [ { - "title": "` + strings.Repeat("a", 200) + `" + "summary": "This is some text content." + } + ] + }` + + feed, err := Parse("https://example.org/feed.json", bytes.NewBufferString(data)) + if err != nil { + t.Fatal(err) + } + + if len(feed.Entries) != 1 { + t.Errorf("Incorrect number of entries, got: %d", len(feed.Entries)) + } + + if feed.Entries[0].Title != "This is some text content." { + t.Errorf("Incorrect entry title, got: %s", feed.Entries[0].Title) + } +} + +func TestParseFeedItemWithoutTitleButWithHTMLContent(t *testing.T) { + data := `{ + "version": "https://jsonfeed.org/version/1", + "title": "My Example Feed", + "home_page_url": "https://example.org/", + "feed_url": "https://example.org/feed.json", + "items": [ + { + "content_html": "This is HTML." + } + ] + }` + + feed, err := Parse("https://example.org/feed.json", bytes.NewBufferString(data)) + if err != nil { + t.Fatal(err) + } + + if len(feed.Entries) != 1 { + t.Errorf("Incorrect number of entries, got: %d", len(feed.Entries)) + } + + if feed.Entries[0].Title != "This is HTML." { + t.Errorf("Incorrect entry title, got: %s", feed.Entries[0].Title) + } +} + +func TestParseFeedItemWithoutTitleButWithTextContent(t *testing.T) { + data := `{ + "version": "https://jsonfeed.org/version/1", + "title": "My Example Feed", + "home_page_url": "https://example.org/", + "feed_url": "https://example.org/feed.json", + "items": [ + { + "content_text": "` + strings.Repeat("a", 200) + `" } ] }` @@ -448,7 +502,7 @@ func TestParseTruncateItemTitle(t *testing.T) { } if len(feed.Entries[0].Title) != 103 { - t.Errorf("Incorrect entry title, got: %s", feed.Entries[0].Title) + t.Errorf("Incorrect entry title, got: %d", len(feed.Entries[0].Title)) } if len([]rune(feed.Entries[0].Title)) != 101 { diff --git a/reader/rss/parser_test.go b/reader/rss/parser_test.go index 197994c7..9be293b4 100644 --- a/reader/rss/parser_test.go +++ b/reader/rss/parser_test.go @@ -115,7 +115,7 @@ func TestParseFeedWithoutTitle(t *testing.T) { } } -func TestParseEntryWithoutTitle(t *testing.T) { +func TestParseEntryWithoutTitleAndDescription(t *testing.T) { data := ` @@ -136,6 +136,30 @@ func TestParseEntryWithoutTitle(t *testing.T) { } } +func TestParseEntryWithoutTitleButWithDescription(t *testing.T) { + data := ` + + + https://example.org/ + + https://example.org/item + + This is the description + + + + ` + + feed, err := Parse("https://example.org/", bytes.NewBufferString(data)) + if err != nil { + t.Fatal(err) + } + + if feed.Entries[0].Title != "This is the description" { + t.Errorf("Incorrect entry title, got: %s", feed.Entries[0].Title) + } +} + func TestParseEntryWithMediaTitle(t *testing.T) { data := ` diff --git a/reader/rss/rss.go b/reader/rss/rss.go index db082393..fb042632 100644 --- a/reader/rss/rss.go +++ b/reader/rss/rss.go @@ -73,6 +73,10 @@ func (r *rssFeed) Transform(baseURL string) *model.Feed { } } + if entry.Title == "" { + entry.Title = sanitizer.TruncateHTML(entry.Content, 100) + } + if entry.Title == "" { entry.Title = entry.URL } diff --git a/reader/sanitizer/truncate.go b/reader/sanitizer/truncate.go new file mode 100644 index 00000000..04acc1d6 --- /dev/null +++ b/reader/sanitizer/truncate.go @@ -0,0 +1,23 @@ +// Copyright 2022 Frédéric Guillot. All rights reserved. +// Use of this source code is governed by the Apache 2.0 +// license that can be found in the LICENSE file. + +package sanitizer + +import "strings" + +func TruncateHTML(input string, max int) string { + text := StripTags(input) + text = strings.ReplaceAll(text, "\n", " ") + text = strings.ReplaceAll(text, "\t", " ") + text = strings.ReplaceAll(text, " ", " ") + text = strings.TrimSpace(text) + + // Convert to runes to be safe with unicode + runes := []rune(text) + if len(runes) > max { + return strings.TrimSpace(string(runes[:max])) + "…" + } + + return text +} diff --git a/reader/sanitizer/truncate_test.go b/reader/sanitizer/truncate_test.go new file mode 100644 index 00000000..2c7e87b6 --- /dev/null +++ b/reader/sanitizer/truncate_test.go @@ -0,0 +1,65 @@ +// Copyright 2022 Frédéric Guillot. All rights reserved. +// Use of this source code is governed by the Apache 2.0 +// license that can be found in the LICENSE file. + +package sanitizer + +import "testing" + +func TestTruncateHTMWithTextLowerThanLimitL(t *testing.T) { + input := `This is a bug 🐛.` + expected := `This is a bug 🐛.` + output := TruncateHTML(input, 50) + + if expected != output { + t.Errorf(`Wrong output: %q != %q`, expected, output) + } +} + +func TestTruncateHTMLWithTextAboveLimit(t *testing.T) { + input := `This is HTML.` + expected := `This…` + output := TruncateHTML(input, 4) + + if expected != output { + t.Errorf(`Wrong output: %q != %q`, expected, output) + } +} + +func TestTruncateHTMLWithUnicodeTextAboveLimit(t *testing.T) { + input := `This is a bike 🚲.` + expected := `This…` + output := TruncateHTML(input, 4) + + if expected != output { + t.Errorf(`Wrong output: %q != %q`, expected, output) + } +} + +func TestTruncateHTMLWithMultilineTextAboveLimit(t *testing.T) { + input := ` + This is a bike + 🚲. + + ` + expected := `This is a bike…` + output := TruncateHTML(input, 15) + + if expected != output { + t.Errorf(`Wrong output: %q != %q`, expected, output) + } +} + +func TestTruncateHTMLWithMultilineTextLowerThanLimit(t *testing.T) { + input := ` + This is a bike + 🚲. + + ` + expected := `This is a bike 🚲.` + output := TruncateHTML(input, 20) + + if expected != output { + t.Errorf(`Wrong output: %q != %q`, expected, output) + } +}