1
0
Fork 0

Disable strict XML parsing

Setting Decoder.Strict to false should improve parsing of broken XML feeds, for example feeds whose URLs contain an unescaped "&".

See https://golang.org/pkg/encoding/xml/#Decoder
This commit is contained in:
Frédéric Guillot 2019-09-18 22:27:25 -07:00 committed by Frédéric Guillot
parent ca48f7612a
commit 36d7732234
8 changed files with 95 additions and 0 deletions

View file

@ -18,6 +18,7 @@ func Parse(data io.Reader) (*model.Feed, *errors.LocalizedError) {
atomFeed := new(atomFeed)
decoder := xml.NewDecoder(data)
decoder.Entity = xml.HTMLEntity
decoder.Strict = false
decoder.CharsetReader = encoding.CharsetReader
err := decoder.Decode(atomFeed)

View file

@ -577,3 +577,22 @@ func TestParseWithHTMLEntity(t *testing.T) {
t.Errorf(`Incorrect title, got: %q`, feed.Title)
}
}
// TestParseWithInvalidCharacterEntity checks that an Atom document whose
// link href contains a bare "&" (not a valid character entity) is parsed
// without error and that the URL is kept exactly as written.
func TestParseWithInvalidCharacterEntity(t *testing.T) {
	const wantSiteURL = "http://example.org/a&b"

	payload := `
<?xml version="1.0" encoding="utf-8"?>
<feed xmlns="http://www.w3.org/2005/Atom">
<title>Example Feed</title>
<link href="http://example.org/a&b"/>
</feed>
`

	feed, parseErr := Parse(bytes.NewBufferString(payload))
	if parseErr != nil {
		t.Fatal(parseErr)
	}

	if got := feed.SiteURL; got != wantSiteURL {
		t.Errorf(`Incorrect URL, got: %q`, got)
	}
}

View file

@ -17,6 +17,7 @@ func Parse(data io.Reader) (SubcriptionList, *errors.LocalizedError) {
feeds := new(opml)
decoder := xml.NewDecoder(data)
decoder.Entity = xml.HTMLEntity
decoder.Strict = false
decoder.CharsetReader = encoding.CharsetReader
err := decoder.Decode(feeds)

View file

@ -193,6 +193,40 @@ func TestParseOpmlVersion1WithoutOuterOutline(t *testing.T) {
}
}
}
// TestParseOpmlWithInvalidCharacterEntity checks that an OPML document whose
// xmlUrl and htmlUrl attributes contain a bare "&" (not a valid character
// entity) is parsed without error and yields the expected subscription.
func TestParseOpmlWithInvalidCharacterEntity(t *testing.T) {
	data := `<?xml version="1.0"?>
<opml version="1.0">
<head>
<title>mySubscriptions.opml</title>
</head>
<body>
<outline title="Feed 1">
<outline type="rss" title="Feed 1" xmlUrl="http://example.org/feed1/a&b" htmlUrl="http://example.org/c&d"></outline>
</outline>
</body>
</opml>
`

	var expected SubcriptionList
	expected = append(expected, &Subcription{Title: "Feed 1", FeedURL: "http://example.org/feed1/a&b", SiteURL: "http://example.org/c&d", CategoryName: ""})

	subscriptions, err := Parse(bytes.NewBufferString(data))
	if err != nil {
		// Nothing below is meaningful once parsing has failed.
		t.Fatal(err)
	}

	// Stop here on a length mismatch: the loop below indexes expected by the
	// position in subscriptions and would panic if there were extra entries.
	if len(subscriptions) != len(expected) {
		t.Fatalf("Wrong number of subscriptions: %d instead of %d", len(subscriptions), len(expected))
	}

	for i := range subscriptions {
		if !subscriptions[i].Equals(expected[i]) {
			t.Errorf(`Subscription are different: "%v" vs "%v"`, subscriptions[i], expected[i])
		}
	}
}
func TestParseInvalidXML(t *testing.T) {
data := `garbage`
_, err := Parse(bytes.NewBufferString(data))

View file

@ -18,6 +18,7 @@ func Parse(data io.Reader) (*model.Feed, *errors.LocalizedError) {
feed := new(rdfFeed)
decoder := xml.NewDecoder(data)
decoder.Entity = xml.HTMLEntity
decoder.Strict = false
decoder.CharsetReader = encoding.CharsetReader
err := decoder.Decode(feed)

View file

@ -403,3 +403,22 @@ func TestParseFeedWithHTMLEntity(t *testing.T) {
t.Errorf(`Incorrect title, got: %q`, feed.Title)
}
}
// TestParseFeedWithInvalidCharacterEntity checks that an RDF document whose
// channel link contains a bare "&" (not a valid character entity) is parsed
// without error and that the URL is kept exactly as written.
func TestParseFeedWithInvalidCharacterEntity(t *testing.T) {
	const wantSiteURL = "http://example.org/a&b"

	payload := `<?xml version="1.0" encoding="utf-8"?>
<rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns="http://purl.org/rss/1.0/">
<channel>
<title>Example Feed</title>
<link>http://example.org/a&b</link>
</channel>
</rdf:RDF>`

	feed, parseErr := Parse(bytes.NewBufferString(payload))
	if parseErr != nil {
		t.Fatal(parseErr)
	}

	if got := feed.SiteURL; got != wantSiteURL {
		t.Errorf(`Incorrect URL, got: %q`, got)
	}
}

View file

@ -18,6 +18,7 @@ func Parse(data io.Reader) (*model.Feed, *errors.LocalizedError) {
feed := new(rssFeed)
decoder := xml.NewDecoder(data)
decoder.Entity = xml.HTMLEntity
decoder.Strict = false
decoder.CharsetReader = encoding.CharsetReader
err := decoder.Decode(feed)

View file

@ -633,3 +633,22 @@ func TestParseWithHTMLEntity(t *testing.T) {
t.Errorf(`Incorrect title, got: %q`, feed.Title)
}
}
// TestParseWithInvalidCharacterEntity checks that an RSS 2.0 document whose
// channel link contains a bare "&" (not a valid character entity) is parsed
// without error and that the URL is kept exactly as written.
func TestParseWithInvalidCharacterEntity(t *testing.T) {
	const wantSiteURL = "https://example.org/a&b"

	payload := `<?xml version="1.0" encoding="utf-8"?>
<rss version="2.0" xmlns:slash="http://purl.org/rss/1.0/modules/slash/">
<channel>
<link>https://example.org/a&b</link>
<title>Example Feed</title>
</channel>
</rss>`

	feed, parseErr := Parse(bytes.NewBufferString(payload))
	if parseErr != nil {
		t.Fatal(parseErr)
	}

	if got := feed.SiteURL; got != wantSiteURL {
		t.Errorf(`Incorrect url, got: %q`, got)
	}
}