More robust Atom text handling

Miniflux couldn't deal with XHTML Summary elements. - Make Summary an 'atomContent' field - Define an atomContentToString function rather than inlining it three times - Also properly escape special characters in plain text fields.
2019-01-01 22:01:19 +01:00 · 2019-01-01 22:01:19 +01:00 · 0cdcec10ca
commit 0cdcec10ca
parent 15505ee4a2
2 changed files with 103 additions and 16 deletions
--- a/reader/atom/atom.go
+++ b/reader/atom/atom.go
@ -6,6 +6,7 @@ package atom // import "miniflux.app/reader/atom"

 import (
 	"encoding/xml"
+	"html"
 	"strconv"
 	"strings"
 	"time"
@ -33,7 +34,7 @@ type atomEntry struct {
 	Published  string         `xml:"published"`
 	Updated    string         `xml:"updated"`
 	Links      []atomLink     `xml:"link"`
-	Summary    string         `xml:"summary"`
+	Summary    atomContent    `xml:"summary"`
 	Content    atomContent    `xml:"content"`
 	MediaGroup atomMediaGroup `xml:"http://search.yahoo.com/mrss/ group"`
 	Author     atomAuthor     `xml:"author"`
@ -147,17 +148,31 @@ func getDate(a *atomEntry) time.Time {
 	return time.Now()
 }

+func atomContentToString(c atomContent) string {
+	if c.Type == "xhtml" {
+		return c.XML
+	}
+
+	if c.Type == "html" {
+		return c.Data
+	}
+
+	if c.Type == "text" || c.Type == "" {
+		return html.EscapeString(c.Data)
+	}
+
+	return ""
+}
+
 func getContent(a *atomEntry) string {
-	if a.Content.Type == "html" || a.Content.Type == "text" {
-		return a.Content.Data
+	r := atomContentToString(a.Content)
+	if r != "" {
+		return r
 	}

-	if a.Content.Type == "xhtml" {
-		return a.Content.XML
-	}
-
-	if a.Summary != "" {
-		return a.Summary
+	r = atomContentToString(a.Summary)
+	if r != "" {
+		return r
 	}

 	if a.MediaGroup.Description != "" {
@ -168,13 +183,7 @@ func getContent(a *atomEntry) string {
 }

 func getTitle(a *atomEntry) string {
-	title := ""
-	if a.Title.Type == "xhtml" {
-		title = a.Title.XML
-	} else {
-		title = a.Title.Data
-	}
-
+	title := atomContentToString(a.Title)
 	return strings.TrimSpace(sanitizer.StripTags(title))
 }

--- a/reader/atom/parser_test.go
+++ b/reader/atom/parser_test.go
@ -282,6 +282,84 @@ func TestParseEntryTitleWithXHTML(t *testing.T) {
 	}
 }

+func TestParseEntrySummaryWithXHTML(t *testing.T) {
+	data := `<?xml version="1.0" encoding="utf-8"?>
+	<feed xmlns="http://www.w3.org/2005/Atom">
+	  <title>Example Feed</title>
+	  <link href="http://example.org/"/>
+
+	  <entry>
+		<title type="xhtml"><code>Test</code> Test</title>
+		<link href="http://example.org/2003/12/13/atom03"/>
+		<id>urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a</id>
+		<updated>2003-12-13T18:30:02Z</updated>
+		<summary type="xhtml"><p>Some text.</p></summary>
+	  </entry>
+
+	</feed>`
+
+	feed, err := Parse(bytes.NewBufferString(data))
+	if err != nil {
+		t.Error(err)
+	}
+
+	if feed.Entries[0].Content != "<p>Some text.</p>" {
+		t.Errorf("Incorrect entry content, got: %s", feed.Entries[0].Content)
+	}
+}
+
+func TestParseEntrySummaryWithHTML(t *testing.T) {
+	data := `<?xml version="1.0" encoding="utf-8"?>
+	<feed xmlns="http://www.w3.org/2005/Atom">
+	  <title>Example Feed</title>
+	  <link href="http://example.org/"/>
+
+	  <entry>
+		<title type="html">&lt;code&gt;Test&lt;/code&gt; Test</title>
+		<link href="http://example.org/2003/12/13/atom03"/>
+		<id>urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a</id>
+		<updated>2003-12-13T18:30:02Z</updated>
+		<summary type="html"><![CDATA[<p>Some text.</p>]]></summary>
+	  </entry>
+
+	</feed>`
+
+	feed, err := Parse(bytes.NewBufferString(data))
+	if err != nil {
+		t.Error(err)
+	}
+
+	if feed.Entries[0].Content != "<p>Some text.</p>" {
+		t.Errorf("Incorrect entry content, got: %s", feed.Entries[0].Content)
+	}
+}
+
+func TestParseEntrySummaryWithPlainText(t *testing.T) {
+	data := `<?xml version="1.0" encoding="utf-8"?>
+	<feed xmlns="http://www.w3.org/2005/Atom">
+	  <title>Example Feed</title>
+	  <link href="http://example.org/"/>
+
+	  <entry>
+		<title type="html">&lt;code&gt;Test&lt;/code&gt; Test</title>
+		<link href="http://example.org/2003/12/13/atom03"/>
+		<id>urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a</id>
+		<updated>2003-12-13T18:30:02Z</updated>
+		<summary type="text"><![CDATA[<Some text.>]]></summary>
+	  </entry>
+
+	</feed>`
+
+	feed, err := Parse(bytes.NewBufferString(data))
+	if err != nil {
+		t.Error(err)
+	}
+
+	if feed.Entries[0].Content != "&lt;Some text.&gt;" {
+		t.Errorf("Incorrect entry content, got: %s", feed.Entries[0].Content)
+	}
+}
+
 func TestParseEntryWithAuthorName(t *testing.T) {
 	data := `<?xml version="1.0" encoding="utf-8"?>
 	<feed xmlns="http://www.w3.org/2005/Atom">