// SPDX-FileCopyrightText: Copyright The Miniflux Authors. All rights reserved.
// SPDX-License-Identifier: Apache-2.0

package processor

import (
	"errors"
	"fmt"
	"math"
	"regexp"
	"strconv"
	"strings"
	"time"
	"unicode/utf8"

	"miniflux.app/v2/internal/config"
	"miniflux.app/v2/internal/http/client"
	"miniflux.app/v2/internal/integration"
	"miniflux.app/v2/internal/logger"
	"miniflux.app/v2/internal/metric"
	"miniflux.app/v2/internal/model"
	"miniflux.app/v2/internal/reader/browser"
	"miniflux.app/v2/internal/reader/rewrite"
	"miniflux.app/v2/internal/reader/sanitizer"
	"miniflux.app/v2/internal/reader/scraper"
	"miniflux.app/v2/internal/storage"

	"github.com/PuerkitoBio/goquery"
	"github.com/rylans/getlang"
)

var (
	youtubeRegex           = regexp.MustCompile(`youtube\.com/watch\?v=(.*)`)
	odyseeRegex            = regexp.MustCompile(`^https://odysee\.com`)
	iso8601Regex           = regexp.MustCompile(`^P((?P<year>\d+)Y)?((?P<month>\d+)M)?((?P<week>\d+)W)?((?P<day>\d+)D)?(T((?P<hour>\d+)H)?((?P<minute>\d+)M)?((?P<second>\d+)S)?)?$`)
	customReplaceRuleRegex = regexp.MustCompile(`rewrite\("(.*)"\|"(.*)"\)`)
)

// ProcessFeedEntries downloads the original web page for entries and applies filters.
func ProcessFeedEntries(store *storage.Storage, feed *model.Feed, user *model.User, forceRefresh bool) {
	var filteredEntries model.Entries

	// array used for bulk push
	entriesToPush := model.Entries{}

	// Process older entries first
	for i := len(feed.Entries) - 1; i >= 0; i-- {
		entry := feed.Entries[i]

		logger.Debug("[Processor] Processing entry %q from feed %q", entry.URL, feed.FeedURL)

		if isBlockedEntry(feed, entry) || !isAllowedEntry(feed, entry) {
			continue
		}

		url := getUrlFromEntry(feed, entry)
		entryIsNew := !store.EntryURLExists(feed.ID, entry.URL)
		if feed.Crawler && (entryIsNew || forceRefresh) {
			logger.Debug("[Processor] Crawling entry %q from feed %q", url, feed.FeedURL)

			startTime := time.Now()
			content, scraperErr := scraper.Fetch(
				url,
				feed.ScraperRules,
				feed.UserAgent,
				feed.Cookie,
				feed.AllowSelfSignedCertificates,
				feed.FetchViaProxy,
			)

			if config.Opts.HasMetricsCollector() {
				status := "success"
				if scraperErr != nil {
					status = "error"
				}
				metric.ScraperRequestDuration.WithLabelValues(status).Observe(time.Since(startTime).Seconds())
			}

			if scraperErr != nil {
				logger.Error(`[Processor] Unable to crawl this entry: %q => %v`, entry.URL, scraperErr)
			} else if content != "" {
				// We replace the entry content only if the scraper doesn't return any error.
				entry.Content = content
			}
		}

		rewrite.Rewriter(url, entry, feed.RewriteRules)

		// The sanitizer should always run at the end of the process to make sure unsafe HTML is filtered.
		entry.Content = sanitizer.Sanitize(url, entry.Content)

		if entryIsNew {
			intg, err := store.Integration(feed.UserID)
			if err != nil {
				logger.Error("[Processor] Get integrations for user %d failed: %v; the refresh process will go on, but no integrations will run this time.", feed.UserID, err)
			} else if intg != nil {
				localEntry := entry
				go func() {
					integration.PushEntry(localEntry, intg)
				}()
				entriesToPush = append(entriesToPush, localEntry)
			}
		}

		updateEntryReadingTime(store, feed, entry, entryIsNew, user)
		filteredEntries = append(filteredEntries, entry)
	}

	intg, err := store.Integration(feed.UserID)
	if err != nil {
		logger.Error("[Processor] Get integrations for user %d failed: %v; the refresh process will go on, but no integrations will run this time.", feed.UserID, err)
	} else if intg != nil && len(entriesToPush) > 0 {
		go func() {
			integration.PushEntries(entriesToPush, intg)
		}()
	}

	feed.Entries = filteredEntries
}
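
// isBlockedEntry returns true when the entry title matches the feed's blocklist regex.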
func isBlockedEntry(feed *model.Feed, entry *model.Entry) bool {
	if feed.BlocklistRules != "" {
		match, _ := regexp.MatchString(feed.BlocklistRules, entry.Title)
		if match {
			logger.Debug("[Processor] Blocking entry %q from feed %q based on rule %q", entry.Title, feed.FeedURL, feed.BlocklistRules)
			return true
		}
	}
	return false
}
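
// isAllowedEntry returns true if the feed has no keeplist rules or if the
// entry title matches them.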
func isAllowedEntry(feed *model.Feed, entry *model.Entry) bool {
	if feed.KeeplistRules != "" {
		match, _ := regexp.MatchString(feed.KeeplistRules, entry.Title)
		if match {
			logger.Debug("[Processor] Allow entry %q from feed %q based on rule %q", entry.Title, feed.FeedURL, feed.KeeplistRules)
			return true
		}
		return false
	}
	return true
}

// ProcessEntryWebPage downloads the entry web page and applies rewrite rules.
func ProcessEntryWebPage(feed *model.Feed, entry *model.Entry, user *model.User) error {
	startTime := time.Now()
	url := getUrlFromEntry(feed, entry)

	content, scraperErr := scraper.Fetch(
		url,
		entry.Feed.ScraperRules,
		entry.Feed.UserAgent,
		entry.Feed.Cookie,
		feed.AllowSelfSignedCertificates,
		feed.FetchViaProxy,
	)

	if config.Opts.HasMetricsCollector() {
		status := "success"
		if scraperErr != nil {
			status = "error"
		}
		metric.ScraperRequestDuration.WithLabelValues(status).Observe(time.Since(startTime).Seconds())
	}

	if scraperErr != nil {
		return scraperErr
	}

	if content != "" {
		entry.Content = content
		entry.ReadingTime = calculateReadingTime(content, user)
	}

	rewrite.Rewriter(url, entry, entry.Feed.RewriteRules)
	entry.Content = sanitizer.Sanitize(url, entry.Content)

	return nil
}
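
// getUrlFromEntry applies the feed's URL rewrite rule, if any, to the entry URL.
// Rules use the form rewrite("search regex"|"replacement"). A hypothetical example:
// rewrite("^https://www\.example\.com/(.*)$"|"https://example.com/$1")
// would strip the "www." prefix from example.com entry URLs.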
func getUrlFromEntry(feed *model.Feed, entry *model.Entry) string {
	url := entry.URL
	if feed.UrlRewriteRules != "" {
		parts := customReplaceRuleRegex.FindStringSubmatch(feed.UrlRewriteRules)
		if len(parts) >= 3 {
			re := regexp.MustCompile(parts[1])
			url = re.ReplaceAllString(entry.URL, parts[2])
			logger.Debug(`[Processor] Rewriting entry URL %s to %s`, entry.URL, url)
		} else {
			logger.Debug("[Processor] Cannot find search and replace terms for replace rule %s", feed.UrlRewriteRules)
		}
	}
	return url
}
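
// updateEntryReadingTime sets entry.ReadingTime: watch time in minutes for
// YouTube and Odysee videos, otherwise an estimate based on the content length.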
func updateEntryReadingTime(store *storage.Storage, feed *model.Feed, entry *model.Entry, entryIsNew bool, user *model.User) {
	if shouldFetchYouTubeWatchTime(entry) {
		if entryIsNew {
			watchTime, err := fetchYouTubeWatchTime(entry.URL)
			if err != nil {
				logger.Error("[Processor] Unable to fetch YouTube watch time: %q => %v", entry.URL, err)
			}
			entry.ReadingTime = watchTime
		} else {
			entry.ReadingTime = store.GetReadTime(entry, feed)
		}
	}

	if shouldFetchOdyseeWatchTime(entry) {
		if entryIsNew {
			watchTime, err := fetchOdyseeWatchTime(entry.URL)
			if err != nil {
				logger.Error("[Processor] Unable to fetch Odysee watch time: %q => %v", entry.URL, err)
			}
			entry.ReadingTime = watchTime
		} else {
			entry.ReadingTime = store.GetReadTime(entry, feed)
		}
	}

	// Fall back to the content-based estimate for non-video entries and
	// whenever the watch time lookup failed.
	if entry.ReadingTime == 0 {
		entry.ReadingTime = calculateReadingTime(entry.Content, user)
	}
}
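
// shouldFetchYouTubeWatchTime reports whether the entry URL matches the
// YouTube watch pattern and the FetchYouTubeWatchTime option is enabled.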
func shouldFetchYouTubeWatchTime(entry *model.Entry) bool {
	if !config.Opts.FetchYouTubeWatchTime() {
		return false
	}
	matches := youtubeRegex.FindStringSubmatch(entry.URL)
	urlMatchesYouTubePattern := len(matches) == 2
	return urlMatchesYouTubePattern
}
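
// shouldFetchOdyseeWatchTime reports whether the entry URL points to Odysee
// and the FetchOdyseeWatchTime option is enabled.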
func shouldFetchOdyseeWatchTime(entry *model.Entry) bool {
	if !config.Opts.FetchOdyseeWatchTime() {
		return false
	}
	matches := odyseeRegex.FindStringSubmatch(entry.URL)
	return matches != nil
}
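
// fetchYouTubeWatchTime downloads the YouTube watch page and reads the video
// duration (an ISO 8601 string) from its duration meta tag, returning minutes.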
func fetchYouTubeWatchTime(url string) (int, error) {
	clt := client.NewClientWithConfig(url, config.Opts)
	response, browserErr := browser.Exec(clt)
	if browserErr != nil {
		return 0, browserErr
	}

	doc, docErr := goquery.NewDocumentFromReader(response.Body)
	if docErr != nil {
		return 0, docErr
	}

	durs, exists := doc.Find(`meta[itemprop="duration"]`).First().Attr("content")
	if !exists {
		return 0, errors.New("duration not found")
	}

	dur, err := parseISO8601(durs)
	if err != nil {
		return 0, fmt.Errorf("unable to parse duration %s: %v", durs, err)
	}

	return int(dur.Minutes()), nil
}
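
// fetchOdyseeWatchTime downloads the Odysee page and reads the video duration
// (in seconds) from its og:video:duration meta tag, returning minutes.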
func fetchOdyseeWatchTime(url string) (int, error) {
	clt := client.NewClientWithConfig(url, config.Opts)
	response, browserErr := browser.Exec(clt)
	if browserErr != nil {
		return 0, browserErr
	}

	doc, docErr := goquery.NewDocumentFromReader(response.Body)
	if docErr != nil {
		return 0, docErr
	}

	// durs contains the video watch time in seconds.
	durs, exists := doc.Find(`meta[property="og:video:duration"]`).First().Attr("content")
	if !exists {
		return 0, errors.New("duration not found")
	}

	dur, err := strconv.ParseInt(durs, 10, 64)
	if err != nil {
		return 0, fmt.Errorf("unable to parse duration %s: %v", durs, err)
	}

	return int(dur / 60), nil
}

// parseISO8601 parses an ISO 8601 duration string.
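// Only the hour, minute, and second components are supported; for example,
// parseISO8601("PT4M13S") returns a duration of 4 minutes and 13 seconds.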
func parseISO8601(from string) (time.Duration, error) {
	var match []string
	var d time.Duration

	if iso8601Regex.MatchString(from) {
		match = iso8601Regex.FindStringSubmatch(from)
	} else {
		return 0, errors.New("could not parse duration string")
	}

	for i, name := range iso8601Regex.SubexpNames() {
		part := match[i]
		if i == 0 || name == "" || part == "" {
			continue
		}

		val, err := strconv.ParseInt(part, 10, 64)
		if err != nil {
			return 0, err
		}

		switch name {
		case "hour":
			d += time.Duration(val) * time.Hour
		case "minute":
			d += time.Duration(val) * time.Minute
		case "second":
			d += time.Duration(val) * time.Second
		default:
			return 0, fmt.Errorf("unknown field %s", name)
		}
	}

	return d, nil
}
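
// calculateReadingTime estimates the reading time of the content in minutes:
// runes per minute for CJK content (user.CJKReadingSpeed), words per minute
// otherwise (user.DefaultReadingSpeed).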
func calculateReadingTime(content string, user *model.User) int {
	sanitizedContent := sanitizer.StripTags(content)
	languageInfo := getlang.FromString(sanitizedContent)

	var timeToReadInt int
	// getlang returns ISO 639-1 codes: "ko" (Korean), "zh" (Chinese), "ja" (Japanese).
	if languageInfo.LanguageCode() == "ko" || languageInfo.LanguageCode() == "zh" || languageInfo.LanguageCode() == "ja" {
		timeToReadInt = int(math.Ceil(float64(utf8.RuneCountInString(sanitizedContent)) / float64(user.CJKReadingSpeed)))
	} else {
		nbOfWords := len(strings.Fields(sanitizedContent))
		timeToReadInt = int(math.Ceil(float64(nbOfWords) / float64(user.DefaultReadingSpeed)))
	}

	return timeToReadInt
}