Deduplicate feed URLs when parsing HTML document during discovery process
Fixes #2232
This commit is contained in:
parent
bfa83cbf99
commit
5de0714256
2 changed files with 41 additions and 2 deletions
|
@@ -152,6 +152,7 @@ func (f *SubscriptionFinder) FindSubscriptionsFromWebPage(websiteURL string, bod
|
|||
}
|
||||
|
||||
var subscriptions Subscriptions
|
||||
subscriptionURLs := make(map[string]bool)
|
||||
for query, kind := range queries {
|
||||
doc.Find(query).Each(func(i int, s *goquery.Selection) {
|
||||
subscription := new(Subscription)
|
||||
|
@@ -163,7 +164,10 @@ func (f *SubscriptionFinder) FindSubscriptionsFromWebPage(websiteURL string, bod
|
|||
|
||||
if feedURL, exists := s.Attr("href"); exists {
|
||||
if feedURL != "" {
|
||||
subscription.URL, _ = urllib.AbsoluteURL(websiteURL, feedURL)
|
||||
subscription.URL, err = urllib.AbsoluteURL(websiteURL, feedURL)
|
||||
if err != nil {
|
||||
return
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
@@ -171,7 +175,8 @@ func (f *SubscriptionFinder) FindSubscriptionsFromWebPage(websiteURL string, bod
|
|||
subscription.Title = subscription.URL
|
||||
}
|
||||
|
||||
if subscription.URL != "" {
|
||||
if subscription.URL != "" && !subscriptionURLs[subscription.URL] {
|
||||
subscriptionURLs[subscription.URL] = true
|
||||
subscriptions = append(subscriptions, subscription)
|
||||
}
|
||||
})
|
||||
|
|
|
@@ -249,6 +249,40 @@ func TestParseWebPageWithMultipleFeeds(t *testing.T) {
|
|||
}
|
||||
}
|
||||
|
||||
func TestParseWebPageWithDuplicatedFeeds(t *testing.T) {
|
||||
htmlPage := `
|
||||
<!doctype html>
|
||||
<html>
|
||||
<head>
|
||||
<link href="http://example.org/feed.xml" rel="alternate" type="application/rss+xml" title="Feed A">
|
||||
<link href="http://example.org/feed.xml" rel="alternate" type="application/rss+xml" title="Feed B">
|
||||
</head>
|
||||
<body>
|
||||
</body>
|
||||
</html>`
|
||||
|
||||
subscriptions, err := NewSubscriptionFinder(nil).FindSubscriptionsFromWebPage("http://example.org/", strings.NewReader(htmlPage))
|
||||
if err != nil {
|
||||
t.Fatalf(`Parsing a correctly formatted HTML page should not return any error: %v`, err)
|
||||
}
|
||||
|
||||
if len(subscriptions) != 1 {
|
||||
t.Fatal(`Incorrect number of subscriptions returned`)
|
||||
}
|
||||
|
||||
if subscriptions[0].Title != "Feed A" {
|
||||
t.Errorf(`Incorrect subscription title: %q`, subscriptions[0].Title)
|
||||
}
|
||||
|
||||
if subscriptions[0].URL != "http://example.org/feed.xml" {
|
||||
t.Errorf(`Incorrect subscription URL: %q`, subscriptions[0].URL)
|
||||
}
|
||||
|
||||
if subscriptions[0].Type != "rss" {
|
||||
t.Errorf(`Incorrect subscription type: %q`, subscriptions[0].Type)
|
||||
}
|
||||
}
|
||||
|
||||
func TestParseWebPageWithEmptyFeedURL(t *testing.T) {
|
||||
htmlPage := `
|
||||
<!doctype html>
|
||||
|
|
Loading…
Reference in a new issue