From c3f871b49b7cae515ca43308262ea59a502f5e20 Mon Sep 17 00:00:00 2001 From: Ilya Mateyko Date: Wed, 27 Jan 2021 12:50:34 +0000 Subject: [PATCH] Use YouTube video duration as read time This feature works by scraping YouTube website. To enable it, set the FETCH_YOUTUBE_WATCH_TIME environment variable to 1. Resolves #972. --- config/config_test.go | 18 ++++++ config/options.go | 10 ++++ config/parser.go | 2 + miniflux.1 | 6 ++ reader/processor/processor.go | 88 +++++++++++++++++++++++++++++- reader/processor/processor_test.go | 30 ++++++++++ 6 files changed, 153 insertions(+), 1 deletion(-) diff --git a/config/config_test.go b/config/config_test.go index 6d970b27..3e3e260a 100644 --- a/config/config_test.go +++ b/config/config_test.go @@ -1413,3 +1413,21 @@ func TestAuthProxyUserCreationAdmin(t *testing.T) { t.Fatalf(`Unexpected AUTH_PROXY_USER_CREATION value, got %v instead of %v`, result, expected) } } + +func TestFetchYouTubeWatchTime(t *testing.T) { + os.Clearenv() + os.Setenv("FETCH_YOUTUBE_WATCH_TIME", "1") + + parser := NewParser() + opts, err := parser.ParseEnvironmentVariables() + if err != nil { + t.Fatalf(`Parsing failure: %v`, err) + } + + expected := true + result := opts.FetchYouTubeWatchTime() + + if result != expected { + t.Fatalf(`Unexpected FETCH_YOUTUBE_WATCH_TIME value, got %v instead of %v`, result, expected) + } +} diff --git a/config/options.go b/config/options.go index c423a3b0..e522e886 100644 --- a/config/options.go +++ b/config/options.go @@ -43,6 +43,7 @@ const ( defaultCleanupArchiveUnreadDays = 180 defaultCleanupRemoveSessionsDays = 30 defaultProxyImages = "http-only" + defaultFetchYouTubeWatchTime = false defaultCreateAdmin = false defaultAdminUsername = "" defaultAdminPassword = "" @@ -108,6 +109,7 @@ type Options struct { adminUsername string adminPassword string proxyImages string + fetchYouTubeWatchTime bool oauth2UserCreationAllowed bool oauth2ClientID string oauth2ClientSecret string @@ -162,6 +164,7 @@ func NewOptions() *Options 
{ workerPoolSize: defaultWorkerPoolSize, createAdmin: defaultCreateAdmin, proxyImages: defaultProxyImages, + fetchYouTubeWatchTime: defaultFetchYouTubeWatchTime, oauth2UserCreationAllowed: defaultOAuth2UserCreation, oauth2ClientID: defaultOAuth2ClientID, oauth2ClientSecret: defaultOAuth2ClientSecret, @@ -373,6 +376,12 @@ func (o *Options) AdminPassword() string { return o.adminPassword } +// FetchYouTubeWatchTime returns true if the YouTube video duration +// should be fetched and used as a reading time. +func (o *Options) FetchYouTubeWatchTime() bool { + return o.fetchYouTubeWatchTime +} + // ProxyImages returns "none" to never proxy, "http-only" to proxy non-HTTPS, "all" to always proxy. func (o *Options) ProxyImages() string { return o.proxyImages @@ -469,6 +478,7 @@ func (o *Options) SortedOptions() []*Option { "DATABASE_MIN_CONNS": o.databaseMinConns, "DATABASE_URL": o.databaseURL, "DEBUG": o.debug, + "FETCH_YOUTUBE_WATCH_TIME": o.fetchYouTubeWatchTime, "HSTS": o.hsts, "HTTPS": o.HTTPS, "HTTP_CLIENT_MAX_BODY_SIZE": o.httpClientMaxBodySize, diff --git a/config/parser.go b/config/parser.go index 57328bb3..64e2a699 100644 --- a/config/parser.go +++ b/config/parser.go @@ -187,6 +187,8 @@ func (p *Parser) parseLines(lines []string) (err error) { p.opts.metricsRefreshInterval = parseInt(value, defaultMetricsRefreshInterval) case "METRICS_ALLOWED_NETWORKS": p.opts.metricsAllowedNetworks = parseStringList(value, []string{defaultMetricsAllowedNetworks}) + case "FETCH_YOUTUBE_WATCH_TIME": + p.opts.fetchYouTubeWatchTime = parseBool(value, defaultFetchYouTubeWatchTime) } } diff --git a/miniflux.1 b/miniflux.1 index 7de208d2..04b87c47 100644 --- a/miniflux.1 +++ b/miniflux.1 @@ -107,6 +107,12 @@ Set the value to 1 to enable debug logs\&. .br Disabled by default\&. .TP +.B FETCH_YOUTUBE_WATCH_TIME +Set the value to 1 to scrape video duration from YouTube website and +use it as a reading time\&. +.br +Disabled by default\&. 
+.TP .B SERVER_TIMING_HEADER Set the value to 1 to enable server-timing headers\&. .br diff --git a/reader/processor/processor.go b/reader/processor/processor.go index 4b3dc429..90cb5712 100644 --- a/reader/processor/processor.go +++ b/reader/processor/processor.go @@ -5,24 +5,35 @@ package processor import ( + "errors" + "fmt" "math" "regexp" + "strconv" "strings" "time" "unicode/utf8" "miniflux.app/config" + "miniflux.app/http/client" "miniflux.app/logger" "miniflux.app/metric" "miniflux.app/model" + "miniflux.app/reader/browser" "miniflux.app/reader/rewrite" "miniflux.app/reader/sanitizer" "miniflux.app/reader/scraper" "miniflux.app/storage" + "github.com/PuerkitoBio/goquery" "github.com/rylans/getlang" ) +var ( + youtubeRegex = regexp.MustCompile(`youtube\.com/watch\?v=(.*)`) + iso8601Regex = regexp.MustCompile(`^P((?P<year>\d+)Y)?((?P<month>\d+)M)?((?P<week>\d+)W)?((?P<day>\d+)D)?(T((?P<hour>\d+)H)?((?P<minute>\d+)M)?((?P<second>\d+)S)?)?$`) +) + // ProcessFeedEntries downloads original web page for entries and apply filters. func ProcessFeedEntries(store *storage.Storage, feed *model.Feed) { var filteredEntries model.Entries @@ -63,7 +74,20 @@ func ProcessFeedEntries(store *storage.Storage, feed *model.Feed) { // The sanitizer should always run at the end of the process to make sure unsafe HTML is filtered. 
entry.Content = sanitizer.Sanitize(entry.URL, entry.Content) - entry.ReadingTime = calculateReadingTime(entry.Content) + if config.Opts.FetchYouTubeWatchTime() { + if matches := youtubeRegex.FindStringSubmatch(entry.URL); len(matches) == 2 { + watchTime, err := fetchYouTubeWatchTime(entry.URL) + if err != nil { + logger.Error("[Processor] Unable to fetch YouTube watch time: %q => %v", entry.URL, err) + } + entry.ReadingTime = watchTime + } + } + + if entry.ReadingTime == 0 { + entry.ReadingTime = calculateReadingTime(entry.Content) + } + filteredEntries = append(filteredEntries, entry) } @@ -120,6 +144,68 @@ func ProcessEntryWebPage(entry *model.Entry) error { return nil } +func fetchYouTubeWatchTime(url string) (int, error) { + clt := client.NewClientWithConfig(url, config.Opts) + response, browserErr := browser.Exec(clt) + if browserErr != nil { + return 0, browserErr + } + + doc, docErr := goquery.NewDocumentFromReader(response.Body) + if docErr != nil { + return 0, docErr + } + + durs, exists := doc.Find(`meta[itemprop="duration"]`).First().Attr("content") + if !exists { + return 0, errors.New("duration has not found") + } + + dur, err := parseISO8601(durs) + if err != nil { + return 0, fmt.Errorf("unable to parse duration %s: %v", durs, err) + } + + return int(dur.Minutes()), nil +} + +// parseISO8601 parses an ISO 8601 duration string. 
+func parseISO8601(from string) (time.Duration, error) { + var match []string + var d time.Duration + + if iso8601Regex.MatchString(from) { + match = iso8601Regex.FindStringSubmatch(from) + } else { + return 0, errors.New("could not parse duration string") + } + + for i, name := range iso8601Regex.SubexpNames() { + part := match[i] + if i == 0 || name == "" || part == "" { + continue + } + + val, err := strconv.ParseInt(part, 10, 64) + if err != nil { + return 0, err + } + + switch name { + case "hour": + d = d + (time.Duration(val) * time.Hour) + case "minute": + d = d + (time.Duration(val) * time.Minute) + case "second": + d = d + (time.Duration(val) * time.Second) + default: + return 0, fmt.Errorf("unknown field %s", name) + } + } + + return d, nil +} + func calculateReadingTime(content string) int { sanitizedContent := sanitizer.StripTags(content) languageInfo := getlang.FromString(sanitizedContent) diff --git a/reader/processor/processor_test.go b/reader/processor/processor_test.go index 5052fb6b..e91055b5 100644 --- a/reader/processor/processor_test.go +++ b/reader/processor/processor_test.go @@ -6,6 +6,7 @@ package processor // import "miniflux.app/reader/processor" import ( "testing" + "time" "miniflux.app/model" ) @@ -47,3 +48,32 @@ func TestAllowEntries(t *testing.T) { } } } + +func TestParseISO8601(t *testing.T) { + var scenarios = []struct { + duration string + expected time.Duration + }{ + // Live streams and radio. 
+ {"PT0M0S", 0}, + // https://www.youtube.com/watch?v=HLrqNhgdiC0 + {"PT6M20S", (6 * time.Minute) + (20 * time.Second)}, + // https://www.youtube.com/watch?v=LZa5KKfqHtA + {"PT5M41S", (5 * time.Minute) + (41 * time.Second)}, + // https://www.youtube.com/watch?v=yIxEEgEuhT4 + {"PT51M52S", (51 * time.Minute) + (52 * time.Second)}, + // https://www.youtube.com/watch?v=bpHf1XcoiFs + {"PT80M42S", (1 * time.Hour) + (20 * time.Minute) + (42 * time.Second)}, + } + + for _, tc := range scenarios { + result, err := parseISO8601(tc.duration) + if err != nil { + t.Errorf("Got an error when parsing %q: %v", tc.duration, err) + } + + if tc.expected != result { + t.Errorf(`Unexpected result, got %v for duration %q`, result, tc.duration) + } + } +}