// SPDX-FileCopyrightText: Copyright The Miniflux Authors. All rights reserved.
// SPDX-License-Identifier: Apache-2.0

package subscription // import "miniflux.app/v2/internal/reader/subscription"

import (
	"fmt"
	"io"
	"regexp"
	"strings"

	"miniflux.app/v2/internal/config"
	"miniflux.app/v2/internal/errors"
	"miniflux.app/v2/internal/http/client"
	"miniflux.app/v2/internal/reader/browser"
	"miniflux.app/v2/internal/reader/parser"
	"miniflux.app/v2/internal/urllib"

	"github.com/PuerkitoBio/goquery"
)

var (
	errUnreadableDoc    = "Unable to analyze this page: %v"
	youtubeChannelRegex = regexp.MustCompile(`youtube\.com/channel/(.*)`)
	youtubeVideoRegex   = regexp.MustCompile(`youtube\.com/watch\?v=(.*)`)
)

// FindSubscriptions downloads and tries to find one or more subscriptions from a URL.
func FindSubscriptions(websiteURL, userAgent, cookie, username, password string, fetchViaProxy, allowSelfSignedCertificates bool) (Subscriptions, *errors.LocalizedError) {
	websiteURL = findYoutubeChannelFeed(websiteURL)
	websiteURL = parseYoutubeVideoPage(websiteURL)

	clt := client.NewClientWithConfig(websiteURL, config.Opts)
	clt.WithCredentials(username, password)
	clt.WithUserAgent(userAgent)
	clt.WithCookie(cookie)
	clt.AllowSelfSignedCertificates = allowSelfSignedCertificates

	if fetchViaProxy {
		clt.WithProxy()
	}

	response, err := browser.Exec(clt)
	if err != nil {
		return nil, err
	}

	// If the URL already points to a feed, return it as a single subscription.
	body := response.BodyAsString()
	if format := parser.DetectFeedFormat(body); format != parser.FormatUnknown {
		var subscriptions Subscriptions
		subscriptions = append(subscriptions, &Subscription{
			Title: response.EffectiveURL,
			URL:   response.EffectiveURL,
			Type:  format,
		})

		return subscriptions, nil
	}

	// Otherwise, look for feed <link> elements in the HTML document.
	subscriptions, err := parseWebPage(response.EffectiveURL, strings.NewReader(body))
	if err != nil || subscriptions != nil {
		return subscriptions, err
	}

	// As a last resort, probe a few well-known feed locations.
	return tryWellKnownUrls(websiteURL, userAgent, cookie, username, password)
}
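
// parseWebPage scans an HTML document for feed <link> elements (RSS, Atom, and
// JSON Feed) and returns the discovered subscriptions.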
func parseWebPage(websiteURL string, data io.Reader) (Subscriptions, *errors.LocalizedError) {
	var subscriptions Subscriptions
	queries := map[string]string{
		"link[type='application/rss+xml']":   "rss",
		"link[type='application/atom+xml']":  "atom",
		"link[type='application/json']":      "json",
		"link[type='application/feed+json']": "json",
	}

	doc, err := goquery.NewDocumentFromReader(data)
	if err != nil {
		return nil, errors.NewLocalizedError(errUnreadableDoc, err)
	}

	for query, kind := range queries {
		doc.Find(query).Each(func(i int, s *goquery.Selection) {
			subscription := new(Subscription)
			subscription.Type = kind

			if title, exists := s.Attr("title"); exists {
				subscription.Title = title
			}

			if feedURL, exists := s.Attr("href"); exists {
				if feedURL != "" {
					subscription.URL, _ = urllib.AbsoluteURL(websiteURL, feedURL)
				}
			}

			if subscription.Title == "" {
				subscription.Title = subscription.URL
			}

			if subscription.URL != "" {
				subscriptions = append(subscriptions, subscription)
			}
		})
	}

	return subscriptions, nil
}
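
// findYoutubeChannelFeed rewrites a YouTube channel URL into its video feed URL.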
func findYoutubeChannelFeed(websiteURL string) string {
	matches := youtubeChannelRegex.FindStringSubmatch(websiteURL)

	if len(matches) == 2 {
		return fmt.Sprintf(`https://www.youtube.com/feeds/videos.xml?channel_id=%s`, matches[1])
	}
	return websiteURL
}
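
// parseYoutubeVideoPage resolves a YouTube video URL to its channel feed URL by
// fetching the page and reading the channelId meta tag. On any error, the
// original URL is returned unchanged.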
func parseYoutubeVideoPage(websiteURL string) string {
	if !youtubeVideoRegex.MatchString(websiteURL) {
		return websiteURL
	}

	clt := client.NewClientWithConfig(websiteURL, config.Opts)
	response, browserErr := browser.Exec(clt)
	if browserErr != nil {
		return websiteURL
	}

	doc, docErr := goquery.NewDocumentFromReader(response.Body)
	if docErr != nil {
		return websiteURL
	}

	if channelID, exists := doc.Find(`meta[itemprop="channelId"]`).First().Attr("content"); exists {
		return fmt.Sprintf(`https://www.youtube.com/feeds/videos.xml?channel_id=%s`, channelID)
	}

	return websiteURL
}
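
// tryWellKnownUrls probes a handful of common feed paths (atom.xml, feed.xml,
// feed/, rss.xml, rss/) at the site root and in the current subdirectory, and
// returns those that respond with HTTP 200.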
func tryWellKnownUrls(websiteURL, userAgent, cookie, username, password string) (Subscriptions, *errors.LocalizedError) {
	var subscriptions Subscriptions
	knownURLs := map[string]string{
		"atom.xml": "atom",
		"feed.xml": "atom",
		"feed/":    "atom",
		"rss.xml":  "rss",
		"rss/":     "rss",
	}

	websiteURLRoot := urllib.RootURL(websiteURL)
	baseURLs := []string{
		// Look for knownURLs in the root.
		websiteURLRoot,
	}
	// Look for knownURLs in the current subdirectory, such as 'example.com/blog/'.
	websiteURL, _ = urllib.AbsoluteURL(websiteURL, "./")
	if websiteURL != websiteURLRoot {
		baseURLs = append(baseURLs, websiteURL)
	}

	for _, baseURL := range baseURLs {
		for knownURL, kind := range knownURLs {
			fullURL, err := urllib.AbsoluteURL(baseURL, knownURL)
			if err != nil {
				continue
			}
			clt := client.NewClientWithConfig(fullURL, config.Opts)
			clt.WithCredentials(username, password)
			clt.WithUserAgent(userAgent)
			clt.WithCookie(cookie)

			// Some websites redirect unknown URLs to the home page.
			// As a result, every known URL would be returned as a valid subscription.
			// We don't want the user to choose between invalid feed URLs.
			clt.WithoutRedirects()

			response, err := clt.Get()
			if err != nil {
				continue
			}

			if response != nil && response.StatusCode == 200 {
				subscription := new(Subscription)
				subscription.Type = kind
				subscription.Title = fullURL
				subscription.URL = fullURL
				if subscription.URL != "" {
					subscriptions = append(subscriptions, subscription)
				}
			}
		}
	}

	return subscriptions, nil
}