Handle more encoding edge cases
- Feeds with charset specified only in Content-Type header and not in XML document
- Feeds with charset specified in both places
- Feeds with charset specified only in XML document and not in HTTP header
This commit is contained in:
parent
3b62f904d6
commit
713b38e34c
10 changed files with 87 additions and 21 deletions
|
@ -95,11 +95,12 @@ func (c *Client) executeRequest(request *http.Request) (*Response, error) {
|
||||||
ContentLength: resp.ContentLength,
|
ContentLength: resp.ContentLength,
|
||||||
}
|
}
|
||||||
|
|
||||||
logger.Debug("[HttpClient:%s] OriginalURL=%s, StatusCode=%d, ContentLength=%d, ETag=%s, LastModified=%s, EffectiveURL=%s",
|
logger.Debug("[HttpClient:%s] OriginalURL=%s, StatusCode=%d, ContentLength=%d, ContentType=%s, ETag=%s, LastModified=%s, EffectiveURL=%s",
|
||||||
request.Method,
|
request.Method,
|
||||||
c.url,
|
c.url,
|
||||||
response.StatusCode,
|
response.StatusCode,
|
||||||
resp.ContentLength,
|
resp.ContentLength,
|
||||||
|
response.ContentType,
|
||||||
response.ETag,
|
response.ETag,
|
||||||
response.LastModified,
|
response.LastModified,
|
||||||
response.EffectiveURL,
|
response.EffectiveURL,
|
||||||
|
|
|
@ -6,8 +6,10 @@ package http
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"io"
|
"io"
|
||||||
|
"mime"
|
||||||
"strings"
|
"strings"
|
||||||
|
|
||||||
|
"github.com/miniflux/miniflux/logger"
|
||||||
"golang.org/x/net/html/charset"
|
"golang.org/x/net/html/charset"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
@ -45,9 +47,22 @@ func (r *Response) IsModified(etag, lastModified string) bool {
|
||||||
}
|
}
|
||||||
|
|
||||||
// NormalizeBodyEncoding make sure the body is encoded in UTF-8.
|
// NormalizeBodyEncoding make sure the body is encoded in UTF-8.
|
||||||
|
//
|
||||||
|
// If a charset other than UTF-8 is detected, we convert the document to UTF-8.
|
||||||
|
// This is used by the scraper and feed readers.
|
||||||
|
//
|
||||||
|
// Do not forget edge cases:
|
||||||
|
// - Some non-utf8 feeds specify encoding only in Content-Type, not in XML document.
|
||||||
func (r *Response) NormalizeBodyEncoding() (io.Reader, error) {
|
func (r *Response) NormalizeBodyEncoding() (io.Reader, error) {
|
||||||
if strings.Contains(r.ContentType, "charset=") {
|
_, params, err := mime.ParseMediaType(r.ContentType)
|
||||||
return charset.NewReader(r.Body, r.ContentType)
|
if err == nil {
|
||||||
|
if enc, found := params["charset"]; found {
|
||||||
|
enc = strings.ToLower(enc)
|
||||||
|
if enc != "utf-8" && enc != "utf8" && enc != "" {
|
||||||
|
logger.Debug("[NormalizeBodyEncoding] Convert body to UTF-8 from %s", enc)
|
||||||
|
return charset.NewReader(r.Body, r.ContentType)
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
return r.Body, nil
|
return r.Body, nil
|
||||||
}
|
}
|
||||||
|
|
|
@ -10,15 +10,14 @@ import (
|
||||||
|
|
||||||
"github.com/miniflux/miniflux/errors"
|
"github.com/miniflux/miniflux/errors"
|
||||||
"github.com/miniflux/miniflux/model"
|
"github.com/miniflux/miniflux/model"
|
||||||
|
"github.com/miniflux/miniflux/reader/encoding"
|
||||||
"golang.org/x/net/html/charset"
|
|
||||||
)
|
)
|
||||||
|
|
||||||
// Parse returns a normalized feed struct from a Atom feed.
|
// Parse returns a normalized feed struct from a Atom feed.
|
||||||
func Parse(data io.Reader) (*model.Feed, error) {
|
func Parse(data io.Reader) (*model.Feed, error) {
|
||||||
atomFeed := new(atomFeed)
|
atomFeed := new(atomFeed)
|
||||||
decoder := xml.NewDecoder(data)
|
decoder := xml.NewDecoder(data)
|
||||||
decoder.CharsetReader = charset.NewReaderLabel
|
decoder.CharsetReader = encoding.CharsetReader
|
||||||
|
|
||||||
err := decoder.Decode(atomFeed)
|
err := decoder.Decode(atomFeed)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
|
|
10
reader/encoding/doc.go
Normal file
10
reader/encoding/doc.go
Normal file
|
@ -0,0 +1,10 @@
|
||||||
|
// Copyright 2018 Frédéric Guillot. All rights reserved.
|
||||||
|
// Use of this source code is governed by the Apache 2.0
|
||||||
|
// license that can be found in the LICENSE file.
|
||||||
|
|
||||||
|
/*
|
||||||
|
|
||||||
|
Package encoding handles workarounds to deal with encoding edge cases found in the wild.
|
||||||
|
|
||||||
|
*/
|
||||||
|
package encoding
|
38
reader/encoding/encoding.go
Normal file
38
reader/encoding/encoding.go
Normal file
|
@ -0,0 +1,38 @@
|
||||||
|
// Copyright 2018 Frédéric Guillot. All rights reserved.
|
||||||
|
// Use of this source code is governed by the Apache 2.0
|
||||||
|
// license that can be found in the LICENSE file.
|
||||||
|
|
||||||
|
package encoding
|
||||||
|
|
||||||
|
import (
|
||||||
|
"bytes"
|
||||||
|
"io"
|
||||||
|
"unicode/utf8"
|
||||||
|
|
||||||
|
"golang.org/x/net/html/charset"
|
||||||
|
)
|
||||||
|
|
||||||
|
// CharsetReader is used when the XML encoding is specified for the input document.
|
||||||
|
//
|
||||||
|
// The document is converted in UTF-8 only if a different encoding is specified
|
||||||
|
// and the document is not already UTF-8.
|
||||||
|
//
|
||||||
|
// Several edge cases could exists:
|
||||||
|
//
|
||||||
|
// - Feeds with charset specified only in Content-Type header and not in XML document
|
||||||
|
// - Feeds with charset specified in both places
|
||||||
|
// - Feeds with charset specified only in XML document and not in HTTP header
|
||||||
|
func CharsetReader(label string, input io.Reader) (io.Reader, error) {
|
||||||
|
var buf1, buf2 bytes.Buffer
|
||||||
|
w := io.MultiWriter(&buf1, &buf2)
|
||||||
|
io.Copy(w, input)
|
||||||
|
r := bytes.NewReader(buf2.Bytes())
|
||||||
|
|
||||||
|
if !utf8.Valid(buf1.Bytes()) {
|
||||||
|
// Transform document to UTF-8 from the specified XML encoding.
|
||||||
|
return charset.NewReaderLabel(label, r)
|
||||||
|
}
|
||||||
|
|
||||||
|
// The document is already UTF-8, do not do anything (avoid double-encoding)
|
||||||
|
return r, nil
|
||||||
|
}
|
|
@ -14,12 +14,11 @@ import (
|
||||||
|
|
||||||
"github.com/miniflux/miniflux/model"
|
"github.com/miniflux/miniflux/model"
|
||||||
"github.com/miniflux/miniflux/reader/atom"
|
"github.com/miniflux/miniflux/reader/atom"
|
||||||
|
"github.com/miniflux/miniflux/reader/encoding"
|
||||||
"github.com/miniflux/miniflux/reader/json"
|
"github.com/miniflux/miniflux/reader/json"
|
||||||
"github.com/miniflux/miniflux/reader/rdf"
|
"github.com/miniflux/miniflux/reader/rdf"
|
||||||
"github.com/miniflux/miniflux/reader/rss"
|
"github.com/miniflux/miniflux/reader/rss"
|
||||||
"github.com/miniflux/miniflux/timer"
|
"github.com/miniflux/miniflux/timer"
|
||||||
|
|
||||||
"golang.org/x/net/html/charset"
|
|
||||||
)
|
)
|
||||||
|
|
||||||
// List of feed formats.
|
// List of feed formats.
|
||||||
|
@ -32,14 +31,14 @@ const (
|
||||||
)
|
)
|
||||||
|
|
||||||
// DetectFeedFormat detect feed format from input data.
|
// DetectFeedFormat detect feed format from input data.
|
||||||
func DetectFeedFormat(data io.Reader) string {
|
func DetectFeedFormat(r io.Reader) string {
|
||||||
defer timer.ExecutionTime(time.Now(), "[Feed:DetectFeedFormat]")
|
defer timer.ExecutionTime(time.Now(), "[Feed:DetectFeedFormat]")
|
||||||
|
|
||||||
var buffer bytes.Buffer
|
var buffer bytes.Buffer
|
||||||
tee := io.TeeReader(data, &buffer)
|
tee := io.TeeReader(r, &buffer)
|
||||||
|
|
||||||
decoder := xml.NewDecoder(tee)
|
decoder := xml.NewDecoder(tee)
|
||||||
decoder.CharsetReader = charset.NewReaderLabel
|
decoder.CharsetReader = encoding.CharsetReader
|
||||||
|
|
||||||
for {
|
for {
|
||||||
token, _ := decoder.Token()
|
token, _ := decoder.Token()
|
||||||
|
@ -66,11 +65,11 @@ func DetectFeedFormat(data io.Reader) string {
|
||||||
return FormatUnknown
|
return FormatUnknown
|
||||||
}
|
}
|
||||||
|
|
||||||
func parseFeed(data io.Reader) (*model.Feed, error) {
|
func parseFeed(r io.Reader) (*model.Feed, error) {
|
||||||
defer timer.ExecutionTime(time.Now(), "[Feed:ParseFeed]")
|
defer timer.ExecutionTime(time.Now(), "[Feed:ParseFeed]")
|
||||||
|
|
||||||
var buffer bytes.Buffer
|
var buffer bytes.Buffer
|
||||||
io.Copy(&buffer, data)
|
io.Copy(&buffer, r)
|
||||||
|
|
||||||
reader := bytes.NewReader(buffer.Bytes())
|
reader := bytes.NewReader(buffer.Bytes())
|
||||||
format := DetectFeedFormat(reader)
|
format := DetectFeedFormat(reader)
|
||||||
|
|
|
@ -9,14 +9,14 @@ import (
|
||||||
"io"
|
"io"
|
||||||
|
|
||||||
"github.com/miniflux/miniflux/errors"
|
"github.com/miniflux/miniflux/errors"
|
||||||
"golang.org/x/net/html/charset"
|
"github.com/miniflux/miniflux/reader/encoding"
|
||||||
)
|
)
|
||||||
|
|
||||||
// Parse reads an OPML file and returns a SubcriptionList.
|
// Parse reads an OPML file and returns a SubcriptionList.
|
||||||
func Parse(data io.Reader) (SubcriptionList, error) {
|
func Parse(data io.Reader) (SubcriptionList, error) {
|
||||||
feeds := new(opml)
|
feeds := new(opml)
|
||||||
decoder := xml.NewDecoder(data)
|
decoder := xml.NewDecoder(data)
|
||||||
decoder.CharsetReader = charset.NewReaderLabel
|
decoder.CharsetReader = encoding.CharsetReader
|
||||||
|
|
||||||
err := decoder.Decode(feeds)
|
err := decoder.Decode(feeds)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
|
|
|
@ -10,14 +10,14 @@ import (
|
||||||
|
|
||||||
"github.com/miniflux/miniflux/errors"
|
"github.com/miniflux/miniflux/errors"
|
||||||
"github.com/miniflux/miniflux/model"
|
"github.com/miniflux/miniflux/model"
|
||||||
"golang.org/x/net/html/charset"
|
"github.com/miniflux/miniflux/reader/encoding"
|
||||||
)
|
)
|
||||||
|
|
||||||
// Parse returns a normalized feed struct from a RDF feed.
|
// Parse returns a normalized feed struct from a RDF feed.
|
||||||
func Parse(data io.Reader) (*model.Feed, error) {
|
func Parse(data io.Reader) (*model.Feed, error) {
|
||||||
feed := new(rdfFeed)
|
feed := new(rdfFeed)
|
||||||
decoder := xml.NewDecoder(data)
|
decoder := xml.NewDecoder(data)
|
||||||
decoder.CharsetReader = charset.NewReaderLabel
|
decoder.CharsetReader = encoding.CharsetReader
|
||||||
|
|
||||||
err := decoder.Decode(feed)
|
err := decoder.Decode(feed)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
|
|
|
@ -10,15 +10,14 @@ import (
|
||||||
|
|
||||||
"github.com/miniflux/miniflux/errors"
|
"github.com/miniflux/miniflux/errors"
|
||||||
"github.com/miniflux/miniflux/model"
|
"github.com/miniflux/miniflux/model"
|
||||||
|
"github.com/miniflux/miniflux/reader/encoding"
|
||||||
"golang.org/x/net/html/charset"
|
|
||||||
)
|
)
|
||||||
|
|
||||||
// Parse returns a normalized feed struct from a RSS feed.
|
// Parse returns a normalized feed struct from a RSS feed.
|
||||||
func Parse(data io.Reader) (*model.Feed, error) {
|
func Parse(data io.Reader) (*model.Feed, error) {
|
||||||
feed := new(rssFeed)
|
feed := new(rssFeed)
|
||||||
decoder := xml.NewDecoder(data)
|
decoder := xml.NewDecoder(data)
|
||||||
decoder.CharsetReader = charset.NewReaderLabel
|
decoder.CharsetReader = encoding.CharsetReader
|
||||||
|
|
||||||
err := decoder.Decode(feed)
|
err := decoder.Decode(feed)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
|
|
|
@ -35,8 +35,13 @@ func FindSubscriptions(websiteURL string) (Subscriptions, error) {
|
||||||
return nil, errors.NewLocalizedError(errConnectionFailure, err)
|
return nil, errors.NewLocalizedError(errConnectionFailure, err)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
body, err := response.NormalizeBodyEncoding()
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
|
||||||
var buffer bytes.Buffer
|
var buffer bytes.Buffer
|
||||||
io.Copy(&buffer, response.Body)
|
io.Copy(&buffer, body)
|
||||||
reader := bytes.NewReader(buffer.Bytes())
|
reader := bytes.NewReader(buffer.Bytes())
|
||||||
|
|
||||||
if format := feed.DetectFeedFormat(reader); format != feed.FormatUnknown {
|
if format := feed.DetectFeedFormat(reader); format != feed.FormatUnknown {
|
||||||
|
|
Loading…
Add table
Reference in a new issue