2023-06-19 17:42:47 -04:00
|
|
|
// SPDX-FileCopyrightText: Copyright The Miniflux Authors. All rights reserved.
|
|
|
|
// SPDX-License-Identifier: Apache-2.0
|
2018-01-20 01:42:55 -05:00
|
|
|
|
2023-08-10 22:46:45 -04:00
|
|
|
package encoding // import "miniflux.app/v2/internal/reader/encoding"
|
2018-01-20 01:42:55 -05:00
|
|
|
|
|
|
|
import (
|
|
|
|
"bytes"
|
|
|
|
"io"
|
|
|
|
"unicode/utf8"
|
|
|
|
|
|
|
|
"golang.org/x/net/html/charset"
|
|
|
|
)
|
|
|
|
|
|
|
|
// CharsetReader is used when the XML encoding is specified for the input document.
|
|
|
|
//
|
|
|
|
// The document is converted in UTF-8 only if a different encoding is specified
|
|
|
|
// and the document is not already UTF-8.
|
|
|
|
//
|
|
|
|
// Several edge cases could exists:
|
|
|
|
//
|
2018-10-30 02:00:03 -04:00
|
|
|
// - Feeds with encoding specified only in Content-Type header and not in XML document
|
|
|
|
// - Feeds with encoding specified in both places
|
|
|
|
// - Feeds with encoding specified only in XML document and not in HTTP header
|
|
|
|
// - Feeds with wrong encoding defined and already in UTF-8
|
2023-12-01 19:27:18 -05:00
|
|
|
func CharsetReader(charsetLabel string, input io.Reader) (io.Reader, error) {
|
2021-02-17 00:19:03 -05:00
|
|
|
buffer, _ := io.ReadAll(input)
|
2018-10-30 02:00:03 -04:00
|
|
|
r := bytes.NewReader(buffer)
|
2018-01-20 01:42:55 -05:00
|
|
|
|
2018-10-30 02:00:03 -04:00
|
|
|
// The document is already UTF-8, do not do anything (avoid double-encoding).
|
|
|
|
// That means the specified encoding in XML prolog is wrong.
|
|
|
|
if utf8.Valid(buffer) {
|
|
|
|
return r, nil
|
2018-01-20 01:42:55 -05:00
|
|
|
}
|
|
|
|
|
2018-10-30 02:00:03 -04:00
|
|
|
// Transform document to UTF-8 from the specified encoding in XML prolog.
|
2023-12-01 19:27:18 -05:00
|
|
|
return charset.NewReaderLabel(charsetLabel, r)
|
|
|
|
}
|
|
|
|
|
|
|
|
// CharsetReaderFromContentType is used when the encoding is not specified for the input document.
|
|
|
|
func CharsetReaderFromContentType(contentType string, input io.Reader) (io.Reader, error) {
|
|
|
|
return charset.NewReader(input, contentType)
|
2018-01-20 01:42:55 -05:00
|
|
|
}
|