From 5870f0426002c8e26a9ff472b23e15d7bf1235f7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fr=C3=A9d=C3=A9ric=20Guillot?= Date: Sun, 14 Oct 2018 11:46:41 -0700 Subject: [PATCH] Simplify feed parser and format detection - Avoid doing multiple buffer copies - Move parser and format detection logic to its own package --- http/client/response.go | 20 +++-- reader/feed/doc.go | 2 +- reader/feed/handler.go | 11 ++- reader/feed/parser.go | 118 ------------------------- reader/parser/doc.go | 10 +++ reader/parser/format.go | 51 +++++++++++ reader/parser/format_test.go | 70 +++++++++++++++ reader/parser/parser.go | 58 ++++++++++++ reader/{feed => parser}/parser_test.go | 76 ++-------------- reader/scraper/scraper.go | 13 ++- reader/subscription/finder.go | 21 ++--- 11 files changed, 229 insertions(+), 221 deletions(-) delete mode 100644 reader/feed/parser.go create mode 100644 reader/parser/doc.go create mode 100644 reader/parser/format.go create mode 100644 reader/parser/format_test.go create mode 100644 reader/parser/parser.go rename reader/{feed => parser}/parser_test.go (61%) diff --git a/http/client/response.go b/http/client/response.go index 0efd8e24..c084824e 100644 --- a/http/client/response.go +++ b/http/client/response.go @@ -6,6 +6,7 @@ package client // import "miniflux.app/http/client" import ( "io" + "io/ioutil" "mime" "strings" @@ -56,23 +57,32 @@ func (r *Response) IsModified(etag, lastModified string) bool { return true } -// NormalizeBodyEncoding make sure the body is encoded in UTF-8. +// EnsureUnicodeBody makes sure the body is encoded in UTF-8. // // If a charset other than UTF-8 is detected, we convert the document to UTF-8. // This is used by the scraper and feed readers. // // Do not forget edge cases: // - Some non-utf8 feeds specify encoding only in Content-Type, not in XML document. -func (r *Response) NormalizeBodyEncoding() (io.Reader, error) { +func (r *Response) EnsureUnicodeBody() error { _, params, err := mime.ParseMediaType(r.ContentType) if err == nil { if enc, found := params["charset"]; found { enc = strings.ToLower(enc) if enc != "utf-8" && enc != "utf8" && enc != "" { - logger.Debug("[NormalizeBodyEncoding] Convert body to UTF-8 from %s", enc) - return charset.NewReader(r.Body, r.ContentType) + logger.Debug("[EnsureUnicodeBody] Convert body to utf-8 from %s", enc) + r.Body, err = charset.NewReader(r.Body, r.ContentType) + if err != nil { + return err + } } } } - return r.Body, nil + return nil +} + +// String returns the response body as string. +func (r *Response) String() string { + bytes, _ := ioutil.ReadAll(r.Body) + return string(bytes) } diff --git a/reader/feed/doc.go b/reader/feed/doc.go index de2e5ab7..0b101fda 100644 --- a/reader/feed/doc.go +++ b/reader/feed/doc.go @@ -4,7 +4,7 @@ /* -Package feed provides a generic feed parser that abstracts all different formats. +Package feed handles feed updates and creation. */ package feed // import "miniflux.app/reader/feed" diff --git a/reader/feed/handler.go b/reader/feed/handler.go index 0b81c678..0945948b 100644 --- a/reader/feed/handler.go +++ b/reader/feed/handler.go @@ -14,6 +14,7 @@ import ( "miniflux.app/logger" "miniflux.app/model" "miniflux.app/reader/icon" + "miniflux.app/reader/parser" "miniflux.app/reader/processor" "miniflux.app/storage" "miniflux.app/timer" @@ -67,12 +68,11 @@ func (h *Handler) CreateFeed(userID, categoryID int64, url string, crawler bool, return nil, errors.NewLocalizedError(errDuplicate, response.EffectiveURL) } - body, err := response.NormalizeBodyEncoding() - if err != nil { + if err := response.EnsureUnicodeBody(); err != nil { return nil, errors.NewLocalizedError(errEncoding, err) } - subscription, feedErr := parseFeed(body) + subscription, feedErr := parser.ParseFeed(response.String()) if feedErr != nil { return nil, feedErr } @@ -183,12 +183,11 @@ func (h *Handler) RefreshFeed(userID, feedID int64) error { return err } - body, err := response.NormalizeBodyEncoding() - if err != nil { + if err := response.EnsureUnicodeBody(); err != nil { return errors.NewLocalizedError(errEncoding, err) } - subscription, parseErr := parseFeed(body) + subscription, parseErr := parser.ParseFeed(response.String()) if parseErr != nil { originalFeed.ParsingErrorCount++ originalFeed.ParsingErrorMsg = parseErr.Localize(printer) diff --git a/reader/feed/parser.go b/reader/feed/parser.go deleted file mode 100644 index 0c7f51c0..00000000 --- a/reader/feed/parser.go +++ /dev/null @@ -1,118 +0,0 @@ -// Copyright 2017 Frédéric Guillot. All rights reserved. -// Use of this source code is governed by the Apache 2.0 -// license that can be found in the LICENSE file. - -package feed // import "miniflux.app/reader/feed" - -import ( - "bytes" - "encoding/xml" - "io" - "strings" - "time" - - "miniflux.app/errors" - "miniflux.app/logger" - "miniflux.app/model" - "miniflux.app/reader/atom" - "miniflux.app/reader/encoding" - "miniflux.app/reader/json" - "miniflux.app/reader/rdf" - "miniflux.app/reader/rss" - "miniflux.app/timer" -) - -// List of feed formats. -const ( - FormatRDF = "rdf" - FormatRSS = "rss" - FormatAtom = "atom" - FormatJSON = "json" - FormatUnknown = "unknown" -) - -// DetectFeedFormat detect feed format from input data. -func DetectFeedFormat(r io.Reader) string { - defer timer.ExecutionTime(time.Now(), "[Feed:DetectFeedFormat]") - - var buffer bytes.Buffer - tee := io.TeeReader(r, &buffer) - - decoder := xml.NewDecoder(tee) - decoder.CharsetReader = encoding.CharsetReader - - for { - token, _ := decoder.Token() - if token == nil { - break - } - - if element, ok := token.(xml.StartElement); ok { - switch element.Name.Local { - case "rss": - return FormatRSS - case "feed": - return FormatAtom - case "RDF": - return FormatRDF - } - } - } - - if strings.HasPrefix(strings.TrimSpace(buffer.String()), "{") { - return FormatJSON - } - - return FormatUnknown -} - -func parseFeed(r io.Reader) (*model.Feed, *errors.LocalizedError) { - defer timer.ExecutionTime(time.Now(), "[Feed:ParseFeed]") - - var buffer bytes.Buffer - size, _ := io.Copy(&buffer, r) - if size == 0 { - return nil, errors.NewLocalizedError(errEmptyFeed) - } - - str := stripInvalidXMLCharacters(buffer.String()) - reader := strings.NewReader(str) - format := DetectFeedFormat(reader) - reader.Seek(0, io.SeekStart) - - switch format { - case FormatAtom: - return atom.Parse(reader) - case FormatRSS: - return rss.Parse(reader) - case FormatJSON: - return json.Parse(reader) - case FormatRDF: - return rdf.Parse(reader) - default: - return nil, errors.NewLocalizedError("Unsupported feed format") - } -} - -func stripInvalidXMLCharacters(input string) string { - return strings.Map(func(r rune) rune { - if isInCharacterRange(r) { - return r - } - - logger.Debug("Strip invalid XML characters: %U", r) - return -1 - }, input) -} - -// Decide whether the given rune is in the XML Character Range, per -// the Char production of http://www.xml.com/axml/testaxml.htm, -// Section 2.2 Characters. -func isInCharacterRange(r rune) (inrange bool) { - return r == 0x09 || - r == 0x0A || - r == 0x0D || - r >= 0x20 && r <= 0xDF77 || - r >= 0xE000 && r <= 0xFFFD || - r >= 0x10000 && r <= 0x10FFFF -} diff --git a/reader/parser/doc.go b/reader/parser/doc.go new file mode 100644 index 00000000..11670fad --- /dev/null +++ b/reader/parser/doc.go @@ -0,0 +1,10 @@ +// Copyright 2018 Frédéric Guillot. All rights reserved. +// Use of this source code is governed by the Apache 2.0 +// license that can be found in the LICENSE file. + +/* + +Package parser provides a generic feed parser that abstract all different formats. + +*/ +package parser // import "miniflux.app/reader/parser" diff --git a/reader/parser/format.go b/reader/parser/format.go new file mode 100644 index 00000000..87963d3b --- /dev/null +++ b/reader/parser/format.go @@ -0,0 +1,51 @@ +// Copyright 2018 Frédéric Guillot. All rights reserved. +// Use of this source code is governed by the Apache 2.0 +// license that can be found in the LICENSE file. + +package parser // import "miniflux.app/reader/parser" + +import ( + "encoding/xml" + "strings" + + "miniflux.app/reader/encoding" +) + +// List of feed formats. +const ( + FormatRDF = "rdf" + FormatRSS = "rss" + FormatAtom = "atom" + FormatJSON = "json" + FormatUnknown = "unknown" +) + +// DetectFeedFormat tries to guess the feed format from input data. +func DetectFeedFormat(data string) string { + if strings.HasPrefix(strings.TrimSpace(data), "{") { + return FormatJSON + } + + decoder := xml.NewDecoder(strings.NewReader(data)) + decoder.CharsetReader = encoding.CharsetReader + + for { + token, _ := decoder.Token() + if token == nil { + break + } + + if element, ok := token.(xml.StartElement); ok { + switch element.Name.Local { + case "rss": + return FormatRSS + case "feed": + return FormatAtom + case "RDF": + return FormatRDF + } + } + } + + return FormatUnknown +} diff --git a/reader/parser/format_test.go b/reader/parser/format_test.go new file mode 100644 index 00000000..3795541b --- /dev/null +++ b/reader/parser/format_test.go @@ -0,0 +1,70 @@ +// Copyright 2018 Frédéric Guillot. All rights reserved. +// Use of this source code is governed by the Apache 2.0 +// license that can be found in the LICENSE file. + +package parser // import "miniflux.app/reader/parser" + +import ( + "testing" +) + +func TestDetectRDF(t *testing.T) { + data := `` + format := DetectFeedFormat(data) + + if format != FormatRDF { + t.Errorf(`Wrong format detected: %q instead of %q`, format, FormatRDF) + } +} + +func TestDetectRSS(t *testing.T) { + data := `` + format := DetectFeedFormat(data) + + if format != FormatRSS { + t.Errorf(`Wrong format detected: %q instead of %q`, format, FormatRSS) + } +} + +func TestDetectAtom(t *testing.T) { + data := `` + format := DetectFeedFormat(data) + + if format != FormatAtom { + t.Errorf(`Wrong format detected: %q instead of %q`, format, FormatAtom) + } +} + +func TestDetectAtomWithISOCharset(t *testing.T) { + data := `` + format := DetectFeedFormat(data) + + if format != FormatAtom { + t.Errorf(`Wrong format detected: %q instead of %q`, format, FormatAtom) + } +} + +func TestDetectJSON(t *testing.T) { + data := ` + { + "version" : "https://jsonfeed.org/version/1", + "title" : "Example" + } + ` + format := DetectFeedFormat(data) + + if format != FormatJSON { + t.Errorf(`Wrong format detected: %q instead of %q`, format, FormatJSON) + } +} + +func TestDetectUnknown(t *testing.T) { + data := ` + + ` + format := DetectFeedFormat(data) + + if format != FormatUnknown { + t.Errorf(`Wrong format detected: %q instead of %q`, format, FormatUnknown) + } +} diff --git a/reader/parser/parser.go b/reader/parser/parser.go new file mode 100644 index 00000000..30fc6034 --- /dev/null +++ b/reader/parser/parser.go @@ -0,0 +1,58 @@ +// Copyright 2018 Frédéric Guillot. All rights reserved. +// Use of this source code is governed by the Apache 2.0 +// license that can be found in the LICENSE file. + +package parser // import "miniflux.app/reader/parser" + +import ( + "strings" + + "miniflux.app/errors" + "miniflux.app/logger" + "miniflux.app/model" + "miniflux.app/reader/atom" + "miniflux.app/reader/json" + "miniflux.app/reader/rdf" + "miniflux.app/reader/rss" +) + +// ParseFeed analyzes the input data and returns a normalized feed object. +func ParseFeed(data string) (*model.Feed, *errors.LocalizedError) { + data = stripInvalidXMLCharacters(data) + + switch DetectFeedFormat(data) { + case FormatAtom: + return atom.Parse(strings.NewReader(data)) + case FormatRSS: + return rss.Parse(strings.NewReader(data)) + case FormatJSON: + return json.Parse(strings.NewReader(data)) + case FormatRDF: + return rdf.Parse(strings.NewReader(data)) + default: + return nil, errors.NewLocalizedError("Unsupported feed format") + } +} + +func stripInvalidXMLCharacters(input string) string { + return strings.Map(func(r rune) rune { + if isInCharacterRange(r) { + return r + } + + logger.Debug("Strip invalid XML characters: %U", r) + return -1 + }, input) +} + +// Decide whether the given rune is in the XML Character Range, per +// the Char production of http://www.xml.com/axml/testaxml.htm, +// Section 2.2 Characters. +func isInCharacterRange(r rune) (inrange bool) { + return r == 0x09 || + r == 0x0A || + r == 0x0D || + r >= 0x20 && r <= 0xDF77 || + r >= 0xE000 && r <= 0xFFFD || + r >= 0x10000 && r <= 0x10FFFF +} diff --git a/reader/feed/parser_test.go b/reader/parser/parser_test.go similarity index 61% rename from reader/feed/parser_test.go rename to reader/parser/parser_test.go index 46dc34eb..7e3f900f 100644 --- a/reader/feed/parser_test.go +++ b/reader/parser/parser_test.go @@ -2,74 +2,12 @@ // Use of this source code is governed by the Apache 2.0 // license that can be found in the LICENSE file. -package feed // import "miniflux.app/reader/feed" +package parser // import "miniflux.app/reader/parser" import ( - "bytes" "testing" ) -func TestDetectRDF(t *testing.T) { - data := `` - format := DetectFeedFormat(bytes.NewBufferString(data)) - - if format != FormatRDF { - t.Errorf("Wrong format detected: %s instead of %s", format, FormatRDF) - } -} - -func TestDetectRSS(t *testing.T) { - data := `` - format := DetectFeedFormat(bytes.NewBufferString(data)) - - if format != FormatRSS { - t.Errorf("Wrong format detected: %s instead of %s", format, FormatRSS) - } -} - -func TestDetectAtom(t *testing.T) { - data := `` - format := DetectFeedFormat(bytes.NewBufferString(data)) - - if format != FormatAtom { - t.Errorf("Wrong format detected: %s instead of %s", format, FormatAtom) - } -} - -func TestDetectAtomWithISOCharset(t *testing.T) { - data := `` - format := DetectFeedFormat(bytes.NewBufferString(data)) - - if format != FormatAtom { - t.Errorf("Wrong format detected: %s instead of %s", format, FormatAtom) - } -} - -func TestDetectJSON(t *testing.T) { - data := ` - { - "version" : "https://jsonfeed.org/version/1", - "title" : "Example" - } - ` - format := DetectFeedFormat(bytes.NewBufferString(data)) - - if format != FormatJSON { - t.Errorf("Wrong format detected: %s instead of %s", format, FormatJSON) - } -} - -func TestDetectUnknown(t *testing.T) { - data := ` - - ` - format := DetectFeedFormat(bytes.NewBufferString(data)) - - if format != FormatUnknown { - t.Errorf("Wrong format detected: %s instead of %s", format, FormatUnknown) - } -} - func TestParseAtom(t *testing.T) { data := ` @@ -92,7 +30,7 @@ func TestParseAtom(t *testing.T) { ` - feed, err := parseFeed(bytes.NewBufferString(data)) + feed, err := ParseFeed(data) if err != nil { t.Error(err) } @@ -118,7 +56,7 @@ func TestParseRSS(t *testing.T) { ` - feed, err := parseFeed(bytes.NewBufferString(data)) + feed, err := ParseFeed(data) if err != nil { t.Error(err) } @@ -147,7 +85,7 @@ func TestParseRDF(t *testing.T) { ` - feed, err := parseFeed(bytes.NewBufferString(data)) + feed, err := ParseFeed(data) if err != nil { t.Error(err) } @@ -177,7 +115,7 @@ func TestParseJson(t *testing.T) { ] }` - feed, err := parseFeed(bytes.NewBufferString(data)) + feed, err := ParseFeed(data) if err != nil { t.Error(err) } @@ -200,14 +138,14 @@ func TestParseUnknownFeed(t *testing.T) { ` - _, err := parseFeed(bytes.NewBufferString(data)) + _, err := ParseFeed(data) if err == nil { t.Error("ParseFeed must returns an error") } } func TestParseEmptyFeed(t *testing.T) { - _, err := parseFeed(bytes.NewBufferString("")) + _, err := ParseFeed("") if err == nil { t.Error("ParseFeed must returns an error") } diff --git a/reader/scraper/scraper.go b/reader/scraper/scraper.go index 7aa70841..b62d1ca6 100644 --- a/reader/scraper/scraper.go +++ b/reader/scraper/scraper.go @@ -18,7 +18,7 @@ import ( "github.com/PuerkitoBio/goquery" ) -// Fetch downloads a web page a returns relevant contents. +// Fetch downloads a web page and returns relevant contents. func Fetch(websiteURL, rules, userAgent string) (string, error) { clt := client.New(websiteURL) if userAgent != "" { @@ -38,8 +38,7 @@ func Fetch(websiteURL, rules, userAgent string) (string, error) { return "", fmt.Errorf("scraper: this resource is not a HTML document (%s)", response.ContentType) } - page, err := response.NormalizeBodyEncoding() - if err != nil { + if err = response.EnsureUnicodeBody(); err != nil { return "", err } @@ -52,11 +51,11 @@ func Fetch(websiteURL, rules, userAgent string) (string, error) { var content string if rules != "" { - logger.Debug(`[Scraper] Using rules "%s" for "%s"`, rules, websiteURL) - content, err = scrapContent(page, rules) + logger.Debug(`[Scraper] Using rules %q for %q`, rules, websiteURL) + content, err = scrapContent(response.Body, rules) } else { - logger.Debug(`[Scraper] Using readability for "%s"`, websiteURL) - content, err = readability.ExtractContent(page) + logger.Debug(`[Scraper] Using readability for "%q`, websiteURL) + content, err = readability.ExtractContent(response.Body) } if err != nil { diff --git a/reader/subscription/finder.go b/reader/subscription/finder.go index 027e8106..c59c0027 100644 --- a/reader/subscription/finder.go +++ b/reader/subscription/finder.go @@ -5,15 +5,15 @@ package subscription // import "miniflux.app/reader/subscription" import ( - "bytes" "fmt" "io" + "strings" "time" "miniflux.app/errors" "miniflux.app/http/client" "miniflux.app/logger" - "miniflux.app/reader/feed" + "miniflux.app/reader/parser" "miniflux.app/timer" "miniflux.app/url" @@ -56,20 +56,12 @@ func FindSubscriptions(websiteURL, userAgent, username, password string) (Subscr return nil, errors.NewLocalizedError(errEmptyBody) } - body, err := response.NormalizeBodyEncoding() - if err != nil { + if err := response.EnsureUnicodeBody(); err != nil { return nil, err } - var buffer bytes.Buffer - size, _ := io.Copy(&buffer, body) - if size == 0 { - return nil, errors.NewLocalizedError(errEmptyBody) - } - - reader := bytes.NewReader(buffer.Bytes()) - - if format := feed.DetectFeedFormat(reader); format != feed.FormatUnknown { + body := response.String() + if format := parser.DetectFeedFormat(body); format != parser.FormatUnknown { var subscriptions Subscriptions subscriptions = append(subscriptions, &Subscription{ Title: response.EffectiveURL, @@ -80,8 +72,7 @@ func FindSubscriptions(websiteURL, userAgent, username, password string) (Subscr return subscriptions, nil } - reader.Seek(0, io.SeekStart) - return parseDocument(response.EffectiveURL, bytes.NewReader(buffer.Bytes())) + return parseDocument(response.EffectiveURL, strings.NewReader(body)) } func parseDocument(websiteURL string, data io.Reader) (Subscriptions, error) {