Simplify feed parser and format detection

- Avoid doing multiple buffer copies - Move parser and format detection logic to its own package
2018-10-14 11:46:41 -07:00 · 2018-10-14 11:46:41 -07:00 · 5870f04260
commit 5870f04260
parent d5ff4191b6
11 changed files with 229 additions and 221 deletions
--- a/http/client/response.go
+++ b/http/client/response.go
@ -6,6 +6,7 @@ package client // import "miniflux.app/http/client"

 import (
 	"io"
+	"io/ioutil"
 	"mime"
 	"strings"

@ -56,23 +57,32 @@ func (r *Response) IsModified(etag, lastModified string) bool {
 	return true
 }

-// NormalizeBodyEncoding make sure the body is encoded in UTF-8.
+// EnsureUnicodeBody makes sure the body is encoded in UTF-8.
 //
 // If a charset other than UTF-8 is detected, we convert the document to UTF-8.
 // This is used by the scraper and feed readers.
 //
 // Do not forget edge cases:
 // - Some non-utf8 feeds specify encoding only in Content-Type, not in XML document.
-func (r *Response) NormalizeBodyEncoding() (io.Reader, error) {
+func (r *Response) EnsureUnicodeBody() error {
 	_, params, err := mime.ParseMediaType(r.ContentType)
 	if err == nil {
 		if enc, found := params["charset"]; found {
 			enc = strings.ToLower(enc)
 			if enc != "utf-8" && enc != "utf8" && enc != "" {
-				logger.Debug("[NormalizeBodyEncoding] Convert body to UTF-8 from %s", enc)
-				return charset.NewReader(r.Body, r.ContentType)
+				logger.Debug("[EnsureUnicodeBody] Convert body to utf-8 from %s", enc)
+				r.Body, err = charset.NewReader(r.Body, r.ContentType)
+				if err != nil {
+					return err
 				}
 			}
 		}
-	return r.Body, nil
+	}
+	return nil
+}
+
+// String returns the response body as string.
+func (r *Response) String() string {
+	bytes, _ := ioutil.ReadAll(r.Body)
+	return string(bytes)
 }
--- a/reader/feed/doc.go
+++ b/reader/feed/doc.go
@ -4,7 +4,7 @@

 /*

-Package feed provides a generic feed parser that abstracts all different formats.
+Package feed handles feed updates and creation.

 */
 package feed // import "miniflux.app/reader/feed"
--- a/reader/feed/handler.go
+++ b/reader/feed/handler.go
@ -14,6 +14,7 @@ import (
 	"miniflux.app/logger"
 	"miniflux.app/model"
 	"miniflux.app/reader/icon"
+	"miniflux.app/reader/parser"
 	"miniflux.app/reader/processor"
 	"miniflux.app/storage"
 	"miniflux.app/timer"
@ -67,12 +68,11 @@ func (h *Handler) CreateFeed(userID, categoryID int64, url string, crawler bool,
 		return nil, errors.NewLocalizedError(errDuplicate, response.EffectiveURL)
 	}

-	body, err := response.NormalizeBodyEncoding()
-	if err != nil {
+	if err := response.EnsureUnicodeBody(); err != nil {
 		return nil, errors.NewLocalizedError(errEncoding, err)
 	}

-	subscription, feedErr := parseFeed(body)
+	subscription, feedErr := parser.ParseFeed(response.String())
 	if feedErr != nil {
 		return nil, feedErr
 	}
@ -183,12 +183,11 @@ func (h *Handler) RefreshFeed(userID, feedID int64) error {
 			return err
 		}

-		body, err := response.NormalizeBodyEncoding()
-		if err != nil {
+		if err := response.EnsureUnicodeBody(); err != nil {
 			return errors.NewLocalizedError(errEncoding, err)
 		}

-		subscription, parseErr := parseFeed(body)
+		subscription, parseErr := parser.ParseFeed(response.String())
 		if parseErr != nil {
 			originalFeed.ParsingErrorCount++
 			originalFeed.ParsingErrorMsg = parseErr.Localize(printer)
--- a/reader/feed/parser.go
+++ b/reader/feed/parser.go
@ -1,118 +0,0 @@
-// Copyright 2017 Frédéric Guillot. All rights reserved.
-// Use of this source code is governed by the Apache 2.0
-// license that can be found in the LICENSE file.
-
-package feed // import "miniflux.app/reader/feed"
-
-import (
-	"bytes"
-	"encoding/xml"
-	"io"
-	"strings"
-	"time"
-
-	"miniflux.app/errors"
-	"miniflux.app/logger"
-	"miniflux.app/model"
-	"miniflux.app/reader/atom"
-	"miniflux.app/reader/encoding"
-	"miniflux.app/reader/json"
-	"miniflux.app/reader/rdf"
-	"miniflux.app/reader/rss"
-	"miniflux.app/timer"
-)
-
-// List of feed formats.
-const (
-	FormatRDF     = "rdf"
-	FormatRSS     = "rss"
-	FormatAtom    = "atom"
-	FormatJSON    = "json"
-	FormatUnknown = "unknown"
-)
-
-// DetectFeedFormat detect feed format from input data.
-func DetectFeedFormat(r io.Reader) string {
-	defer timer.ExecutionTime(time.Now(), "[Feed:DetectFeedFormat]")
-
-	var buffer bytes.Buffer
-	tee := io.TeeReader(r, &buffer)
-
-	decoder := xml.NewDecoder(tee)
-	decoder.CharsetReader = encoding.CharsetReader
-
-	for {
-		token, _ := decoder.Token()
-		if token == nil {
-			break
-		}
-
-		if element, ok := token.(xml.StartElement); ok {
-			switch element.Name.Local {
-			case "rss":
-				return FormatRSS
-			case "feed":
-				return FormatAtom
-			case "RDF":
-				return FormatRDF
-			}
-		}
-	}
-
-	if strings.HasPrefix(strings.TrimSpace(buffer.String()), "{") {
-		return FormatJSON
-	}
-
-	return FormatUnknown
-}
-
-func parseFeed(r io.Reader) (*model.Feed, *errors.LocalizedError) {
-	defer timer.ExecutionTime(time.Now(), "[Feed:ParseFeed]")
-
-	var buffer bytes.Buffer
-	size, _ := io.Copy(&buffer, r)
-	if size == 0 {
-		return nil, errors.NewLocalizedError(errEmptyFeed)
-	}
-
-	str := stripInvalidXMLCharacters(buffer.String())
-	reader := strings.NewReader(str)
-	format := DetectFeedFormat(reader)
-	reader.Seek(0, io.SeekStart)
-
-	switch format {
-	case FormatAtom:
-		return atom.Parse(reader)
-	case FormatRSS:
-		return rss.Parse(reader)
-	case FormatJSON:
-		return json.Parse(reader)
-	case FormatRDF:
-		return rdf.Parse(reader)
-	default:
-		return nil, errors.NewLocalizedError("Unsupported feed format")
-	}
-}
-
-func stripInvalidXMLCharacters(input string) string {
-	return strings.Map(func(r rune) rune {
-		if isInCharacterRange(r) {
-			return r
-		}
-
-		logger.Debug("Strip invalid XML characters: %U", r)
-		return -1
-	}, input)
-}
-
-// Decide whether the given rune is in the XML Character Range, per
-// the Char production of http://www.xml.com/axml/testaxml.htm,
-// Section 2.2 Characters.
-func isInCharacterRange(r rune) (inrange bool) {
-	return r == 0x09 ||
-		r == 0x0A ||
-		r == 0x0D ||
-		r >= 0x20 && r <= 0xDF77 ||
-		r >= 0xE000 && r <= 0xFFFD ||
-		r >= 0x10000 && r <= 0x10FFFF
-}
--- a/reader/parser/doc.go
+++ b/reader/parser/doc.go
@ -0,0 +1,10 @@
+// Copyright 2018 Frédéric Guillot. All rights reserved.
+// Use of this source code is governed by the Apache 2.0
+// license that can be found in the LICENSE file.
+
+/*
+
+Package parser provides a generic feed parser that abstract all different formats.
+
+*/
+package parser // import "miniflux.app/reader/parser"
--- a/reader/parser/format.go
+++ b/reader/parser/format.go
@ -0,0 +1,51 @@
+// Copyright 2018 Frédéric Guillot. All rights reserved.
+// Use of this source code is governed by the Apache 2.0
+// license that can be found in the LICENSE file.
+
+package parser // import "miniflux.app/reader/parser"
+
+import (
+	"encoding/xml"
+	"strings"
+
+	"miniflux.app/reader/encoding"
+)
+
+// List of feed formats.
+const (
+	FormatRDF     = "rdf"
+	FormatRSS     = "rss"
+	FormatAtom    = "atom"
+	FormatJSON    = "json"
+	FormatUnknown = "unknown"
+)
+
+// DetectFeedFormat tries to guess the feed format from input data.
+func DetectFeedFormat(data string) string {
+	if strings.HasPrefix(strings.TrimSpace(data), "{") {
+		return FormatJSON
+	}
+
+	decoder := xml.NewDecoder(strings.NewReader(data))
+	decoder.CharsetReader = encoding.CharsetReader
+
+	for {
+		token, _ := decoder.Token()
+		if token == nil {
+			break
+		}
+
+		if element, ok := token.(xml.StartElement); ok {
+			switch element.Name.Local {
+			case "rss":
+				return FormatRSS
+			case "feed":
+				return FormatAtom
+			case "RDF":
+				return FormatRDF
+			}
+		}
+	}
+
+	return FormatUnknown
+}
--- a/reader/parser/format_test.go
+++ b/reader/parser/format_test.go
@ -0,0 +1,70 @@
+// Copyright 2018 Frédéric Guillot. All rights reserved.
+// Use of this source code is governed by the Apache 2.0
+// license that can be found in the LICENSE file.
+
+package parser // import "miniflux.app/reader/parser"
+
+import (
+	"testing"
+)
+
+func TestDetectRDF(t *testing.T) {
+	data := `<?xml version="1.0"?><rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns="http://my.netscape.com/rdf/simple/0.9/"></rdf:RDF>`
+	format := DetectFeedFormat(data)
+
+	if format != FormatRDF {
+		t.Errorf(`Wrong format detected: %q instead of %q`, format, FormatRDF)
+	}
+}
+
+func TestDetectRSS(t *testing.T) {
+	data := `<?xml version="1.0"?><rss version="2.0"><channel></channel></rss>`
+	format := DetectFeedFormat(data)
+
+	if format != FormatRSS {
+		t.Errorf(`Wrong format detected: %q instead of %q`, format, FormatRSS)
+	}
+}
+
+func TestDetectAtom(t *testing.T) {
+	data := `<?xml version="1.0" encoding="utf-8"?><feed xmlns="http://www.w3.org/2005/Atom"></feed>`
+	format := DetectFeedFormat(data)
+
+	if format != FormatAtom {
+		t.Errorf(`Wrong format detected: %q instead of %q`, format, FormatAtom)
+	}
+}
+
+func TestDetectAtomWithISOCharset(t *testing.T) {
+	data := `<?xml version="1.0" encoding="ISO-8859-15"?><feed xmlns="http://www.w3.org/2005/Atom"></feed>`
+	format := DetectFeedFormat(data)
+
+	if format != FormatAtom {
+		t.Errorf(`Wrong format detected: %q instead of %q`, format, FormatAtom)
+	}
+}
+
+func TestDetectJSON(t *testing.T) {
+	data := `
+	{
+		"version" : "https://jsonfeed.org/version/1",
+		"title" : "Example"
+	}
+	`
+	format := DetectFeedFormat(data)
+
+	if format != FormatJSON {
+		t.Errorf(`Wrong format detected: %q instead of %q`, format, FormatJSON)
+	}
+}
+
+func TestDetectUnknown(t *testing.T) {
+	data := `
+	<!DOCTYPE html> <html> </html>
+	`
+	format := DetectFeedFormat(data)
+
+	if format != FormatUnknown {
+		t.Errorf(`Wrong format detected: %q instead of %q`, format, FormatUnknown)
+	}
+}
--- a/reader/parser/parser.go
+++ b/reader/parser/parser.go
@ -0,0 +1,58 @@
+// Copyright 2018 Frédéric Guillot. All rights reserved.
+// Use of this source code is governed by the Apache 2.0
+// license that can be found in the LICENSE file.
+
+package parser // import "miniflux.app/reader/parser"
+
+import (
+	"strings"
+
+	"miniflux.app/errors"
+	"miniflux.app/logger"
+	"miniflux.app/model"
+	"miniflux.app/reader/atom"
+	"miniflux.app/reader/json"
+	"miniflux.app/reader/rdf"
+	"miniflux.app/reader/rss"
+)
+
+// ParseFeed analyzes the input data and returns a normalized feed object.
+func ParseFeed(data string) (*model.Feed, *errors.LocalizedError) {
+	data = stripInvalidXMLCharacters(data)
+
+	switch DetectFeedFormat(data) {
+	case FormatAtom:
+		return atom.Parse(strings.NewReader(data))
+	case FormatRSS:
+		return rss.Parse(strings.NewReader(data))
+	case FormatJSON:
+		return json.Parse(strings.NewReader(data))
+	case FormatRDF:
+		return rdf.Parse(strings.NewReader(data))
+	default:
+		return nil, errors.NewLocalizedError("Unsupported feed format")
+	}
+}
+
+func stripInvalidXMLCharacters(input string) string {
+	return strings.Map(func(r rune) rune {
+		if isInCharacterRange(r) {
+			return r
+		}
+
+		logger.Debug("Strip invalid XML characters: %U", r)
+		return -1
+	}, input)
+}
+
+// Decide whether the given rune is in the XML Character Range, per
+// the Char production of http://www.xml.com/axml/testaxml.htm,
+// Section 2.2 Characters.
+func isInCharacterRange(r rune) (inrange bool) {
+	return r == 0x09 ||
+		r == 0x0A ||
+		r == 0x0D ||
+		r >= 0x20 && r <= 0xDF77 ||
+		r >= 0xE000 && r <= 0xFFFD ||
+		r >= 0x10000 && r <= 0x10FFFF
+}
--- a/reader/parser/parser_test.go
+++ b/reader/parser/parser_test.go
@ -2,74 +2,12 @@
 // Use of this source code is governed by the Apache 2.0
 // license that can be found in the LICENSE file.

-package feed // import "miniflux.app/reader/feed"
+package parser // import "miniflux.app/reader/parser"

 import (
-	"bytes"
 	"testing"
 )

-func TestDetectRDF(t *testing.T) {
-	data := `<?xml version="1.0"?><rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns="http://my.netscape.com/rdf/simple/0.9/"></rdf:RDF>`
-	format := DetectFeedFormat(bytes.NewBufferString(data))
-
-	if format != FormatRDF {
-		t.Errorf("Wrong format detected: %s instead of %s", format, FormatRDF)
-	}
-}
-
-func TestDetectRSS(t *testing.T) {
-	data := `<?xml version="1.0"?><rss version="2.0"><channel></channel></rss>`
-	format := DetectFeedFormat(bytes.NewBufferString(data))
-
-	if format != FormatRSS {
-		t.Errorf("Wrong format detected: %s instead of %s", format, FormatRSS)
-	}
-}
-
-func TestDetectAtom(t *testing.T) {
-	data := `<?xml version="1.0" encoding="utf-8"?><feed xmlns="http://www.w3.org/2005/Atom"></feed>`
-	format := DetectFeedFormat(bytes.NewBufferString(data))
-
-	if format != FormatAtom {
-		t.Errorf("Wrong format detected: %s instead of %s", format, FormatAtom)
-	}
-}
-
-func TestDetectAtomWithISOCharset(t *testing.T) {
-	data := `<?xml version="1.0" encoding="ISO-8859-15"?><feed xmlns="http://www.w3.org/2005/Atom"></feed>`
-	format := DetectFeedFormat(bytes.NewBufferString(data))
-
-	if format != FormatAtom {
-		t.Errorf("Wrong format detected: %s instead of %s", format, FormatAtom)
-	}
-}
-
-func TestDetectJSON(t *testing.T) {
-	data := `
-	{
-		"version" : "https://jsonfeed.org/version/1",
-		"title" : "Example"
-	}
-	`
-	format := DetectFeedFormat(bytes.NewBufferString(data))
-
-	if format != FormatJSON {
-		t.Errorf("Wrong format detected: %s instead of %s", format, FormatJSON)
-	}
-}
-
-func TestDetectUnknown(t *testing.T) {
-	data := `
-	<!DOCTYPE html> <html> </html>
-	`
-	format := DetectFeedFormat(bytes.NewBufferString(data))
-
-	if format != FormatUnknown {
-		t.Errorf("Wrong format detected: %s instead of %s", format, FormatUnknown)
-	}
-}
-
 func TestParseAtom(t *testing.T) {
 	data := `<?xml version="1.0" encoding="utf-8"?>
 	<feed xmlns="http://www.w3.org/2005/Atom">
@ -92,7 +30,7 @@ func TestParseAtom(t *testing.T) {

 	</feed>`

-	feed, err := parseFeed(bytes.NewBufferString(data))
+	feed, err := ParseFeed(data)
 	if err != nil {
 		t.Error(err)
 	}
@ -118,7 +56,7 @@ func TestParseRSS(t *testing.T) {
 	</channel>
 	</rss>`

-	feed, err := parseFeed(bytes.NewBufferString(data))
+	feed, err := ParseFeed(data)
 	if err != nil {
 		t.Error(err)
 	}
@ -147,7 +85,7 @@ func TestParseRDF(t *testing.T) {
 		  </item>
 		</rdf:RDF>`

-	feed, err := parseFeed(bytes.NewBufferString(data))
+	feed, err := ParseFeed(data)
 	if err != nil {
 		t.Error(err)
 	}
@ -177,7 +115,7 @@ func TestParseJson(t *testing.T) {
 		]
 	}`

-	feed, err := parseFeed(bytes.NewBufferString(data))
+	feed, err := ParseFeed(data)
 	if err != nil {
 		t.Error(err)
 	}
@ -200,14 +138,14 @@ func TestParseUnknownFeed(t *testing.T) {
 		</html>
 	`

-	_, err := parseFeed(bytes.NewBufferString(data))
+	_, err := ParseFeed(data)
 	if err == nil {
 		t.Error("ParseFeed must returns an error")
 	}
 }

 func TestParseEmptyFeed(t *testing.T) {
-	_, err := parseFeed(bytes.NewBufferString(""))
+	_, err := ParseFeed("")
 	if err == nil {
 		t.Error("ParseFeed must returns an error")
 	}
--- a/reader/scraper/scraper.go
+++ b/reader/scraper/scraper.go
@ -18,7 +18,7 @@ import (
 	"github.com/PuerkitoBio/goquery"
 )

-// Fetch downloads a web page a returns relevant contents.
+// Fetch downloads a web page and returns relevant contents.
 func Fetch(websiteURL, rules, userAgent string) (string, error) {
 	clt := client.New(websiteURL)
 	if userAgent != "" {
@ -38,8 +38,7 @@ func Fetch(websiteURL, rules, userAgent string) (string, error) {
 		return "", fmt.Errorf("scraper: this resource is not a HTML document (%s)", response.ContentType)
 	}

-	page, err := response.NormalizeBodyEncoding()
-	if err != nil {
+	if err = response.EnsureUnicodeBody(); err != nil {
 		return "", err
 	}

@ -52,11 +51,11 @@ func Fetch(websiteURL, rules, userAgent string) (string, error) {

 	var content string
 	if rules != "" {
-		logger.Debug(`[Scraper] Using rules "%s" for "%s"`, rules, websiteURL)
-		content, err = scrapContent(page, rules)
+		logger.Debug(`[Scraper] Using rules %q for %q`, rules, websiteURL)
+		content, err = scrapContent(response.Body, rules)
 	} else {
-		logger.Debug(`[Scraper] Using readability for "%s"`, websiteURL)
-		content, err = readability.ExtractContent(page)
+		logger.Debug(`[Scraper] Using readability for "%q`, websiteURL)
+		content, err = readability.ExtractContent(response.Body)
 	}

 	if err != nil {
--- a/reader/subscription/finder.go
+++ b/reader/subscription/finder.go
@ -5,15 +5,15 @@
 package subscription // import "miniflux.app/reader/subscription"

 import (
-	"bytes"
 	"fmt"
 	"io"
+	"strings"
 	"time"

 	"miniflux.app/errors"
 	"miniflux.app/http/client"
 	"miniflux.app/logger"
-	"miniflux.app/reader/feed"
+	"miniflux.app/reader/parser"
 	"miniflux.app/timer"
 	"miniflux.app/url"

@ -56,20 +56,12 @@ func FindSubscriptions(websiteURL, userAgent, username, password string) (Subscr
 		return nil, errors.NewLocalizedError(errEmptyBody)
 	}

-	body, err := response.NormalizeBodyEncoding()
-	if err != nil {
+	if err := response.EnsureUnicodeBody(); err != nil {
 		return nil, err
 	}

-	var buffer bytes.Buffer
-	size, _ := io.Copy(&buffer, body)
-	if size == 0 {
-		return nil, errors.NewLocalizedError(errEmptyBody)
-	}
-
-	reader := bytes.NewReader(buffer.Bytes())
-
-	if format := feed.DetectFeedFormat(reader); format != feed.FormatUnknown {
+	body := response.String()
+	if format := parser.DetectFeedFormat(body); format != parser.FormatUnknown {
 		var subscriptions Subscriptions
 		subscriptions = append(subscriptions, &Subscription{
 			Title: response.EffectiveURL,
@ -80,8 +72,7 @@ func FindSubscriptions(websiteURL, userAgent, username, password string) (Subscr
 		return subscriptions, nil
 	}

-	reader.Seek(0, io.SeekStart)
-	return parseDocument(response.EffectiveURL, bytes.NewReader(buffer.Bytes()))
+	return parseDocument(response.EffectiveURL, strings.NewReader(body))
 }

 func parseDocument(websiteURL string, data io.Reader) (Subscriptions, error) {