From 7b0bfd930839e6ebe4b64f37cbdc6efbf7dbd090 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Fr=C3=A9d=C3=A9ric=20Guillot?= <fred@miniflux.net>
Date: Wed, 7 Feb 2018 20:57:56 -0800
Subject: [PATCH] Strip invalid XML characters to avoid parsing errors

---
 reader/feed/parser.go | 27 ++++++++++++++++++++++++++-
 1 file changed, 26 insertions(+), 1 deletion(-)

diff --git a/reader/feed/parser.go b/reader/feed/parser.go
index c04836a6..70c81e3f 100644
--- a/reader/feed/parser.go
+++ b/reader/feed/parser.go
@@ -12,6 +12,7 @@ import (
 	"strings"
 	"time"
 
+	"github.com/miniflux/miniflux/logger"
 	"github.com/miniflux/miniflux/model"
 	"github.com/miniflux/miniflux/reader/atom"
 	"github.com/miniflux/miniflux/reader/encoding"
@@ -74,7 +75,8 @@ func parseFeed(r io.Reader) (*model.Feed, error) {
 		return nil, errors.New("This feed is empty")
 	}
 
-	reader := bytes.NewReader(buffer.Bytes())
+	str := stripInvalidXMLCharacters(buffer.String())
+	reader := strings.NewReader(str)
 	format := DetectFeedFormat(reader)
 	reader.Seek(0, io.SeekStart)
 
@@ -91,3 +93,26 @@ func parseFeed(r io.Reader) (*model.Feed, error) {
 		return nil, errors.New("Unsupported feed format")
 	}
 }
+
+// stripInvalidXMLCharacters drops runes rejected by isInCharacterRange, logging each one.
+func stripInvalidXMLCharacters(input string) string {
+	return strings.Map(func(c rune) rune {
+		if !isInCharacterRange(c) {
+			logger.Debug("Strip invalid XML characters: %U", c)
+			return -1 // a negative result tells strings.Map to omit the rune
+		}
+		return c
+	}, input)
+}
+
+// isInCharacterRange reports whether r is a legal XML character per the Char
+// production of http://www.xml.com/axml/testaxml.htm, Section 2.2 Characters.
+// The main range ends at 0xD7FF so the surrogates 0xD800-0xDFFF are rejected.
+func isInCharacterRange(r rune) (inrange bool) {
+	return r == 0x09 ||
+		r == 0x0A ||
+		r == 0x0D ||
+		r >= 0x20 && r <= 0xD7FF ||
+		r >= 0xE000 && r <= 0xFFFD ||
+		r >= 0x10000 && r <= 0x10FFFF
+}