2023-06-19 17:42:47 -04:00
|
|
|
// SPDX-FileCopyrightText: Copyright The Miniflux Authors. All rights reserved.
|
|
|
|
// SPDX-License-Identifier: Apache-2.0
|
2017-12-12 01:16:32 -05:00
|
|
|
|
2018-08-25 00:51:50 -04:00
|
|
|
package rewrite // import "miniflux.app/reader/rewrite"
|
2017-12-12 01:16:32 -05:00
|
|
|
|
|
|
|
import (
|
2022-05-25 23:44:04 -04:00
|
|
|
"encoding/base64"
|
2017-12-14 00:30:40 -05:00
|
|
|
"fmt"
|
2019-08-14 03:33:54 -04:00
|
|
|
"html"
|
2019-08-13 11:44:23 -04:00
|
|
|
"net/url"
|
2017-12-12 01:16:32 -05:00
|
|
|
"regexp"
|
|
|
|
"strings"
|
|
|
|
|
2022-01-05 23:43:03 -05:00
|
|
|
"miniflux.app/config"
|
|
|
|
|
2017-12-12 01:16:32 -05:00
|
|
|
"github.com/PuerkitoBio/goquery"
|
2022-07-27 10:55:28 -04:00
|
|
|
"github.com/yuin/goldmark"
|
|
|
|
goldmarkhtml "github.com/yuin/goldmark/renderer/html"
|
2017-12-12 01:16:32 -05:00
|
|
|
)
|
|
|
|
|
|
|
|
// Regular expressions shared by the rewrite helpers in this file. They are
// compiled once when the package is initialised.
var (
	// youtubeRegex captures everything that follows "v=" in a
	// "youtube.com/watch?v=..." URL (group 1).
	youtubeRegex = regexp.MustCompile(`youtube\.com/watch\?v=(.*)`)
	// youtubeIdRegex captures an 11-character identifier (group 1) from a
	// `youtube_id: "..."` or `youtube_id="..."` style assignment.
	youtubeIdRegex = regexp.MustCompile(`youtube_id"?\s*[:=]\s*"([a-zA-Z0-9_-]{11})"`)
	// invidioRegex matches any http(s) ".../watch?v=..." URL. Group 1 is the
	// text between the scheme and "/watch", group 2 is everything after "v=".
	invidioRegex = regexp.MustCompile(`https?:\/\/(.*)\/watch\?v=(.*)`)
	// imgRegex matches an opening <img ...> tag carrying at least one
	// character of attributes.
	imgRegex = regexp.MustCompile(`<img [^>]+>`)
	// textLinkRegex matches bare http(s) URLs, case-insensitively (group 1).
	// The final character class excludes trailing punctuation such as "." or
	// "," from the match.
	textLinkRegex = regexp.MustCompile(`(?mi)(\bhttps?:\/\/[-A-Z0-9+&@#\/%?=~_|!:,.;]*[-A-Z0-9+&@#\/%=~_|])`)
)
|
|
|
|
|
|
|
|
func addImageTitle(entryURL, entryContent string) string {
|
|
|
|
doc, err := goquery.NewDocumentFromReader(strings.NewReader(entryContent))
|
|
|
|
if err != nil {
|
|
|
|
return entryContent
|
|
|
|
}
|
|
|
|
|
2018-06-26 17:39:56 -04:00
|
|
|
matches := doc.Find("img[src][title]")
|
|
|
|
|
|
|
|
if matches.Length() > 0 {
|
|
|
|
matches.Each(func(i int, img *goquery.Selection) {
|
|
|
|
altAttr := img.AttrOr("alt", "")
|
|
|
|
srcAttr, _ := img.Attr("src")
|
|
|
|
titleAttr, _ := img.Attr("title")
|
|
|
|
|
2019-08-14 03:33:54 -04:00
|
|
|
img.ReplaceWithHtml(`<figure><img src="` + srcAttr + `" alt="` + altAttr + `"/><figcaption><p>` + html.EscapeString(titleAttr) + `</p></figcaption></figure>`)
|
2018-06-26 17:39:56 -04:00
|
|
|
})
|
|
|
|
|
|
|
|
output, _ := doc.Find("body").First().Html()
|
|
|
|
return output
|
2017-12-12 01:16:32 -05:00
|
|
|
}
|
|
|
|
|
|
|
|
return entryContent
|
|
|
|
}
|
|
|
|
|
2019-08-13 11:44:23 -04:00
|
|
|
func addMailtoSubject(entryURL, entryContent string) string {
|
|
|
|
doc, err := goquery.NewDocumentFromReader(strings.NewReader(entryContent))
|
|
|
|
if err != nil {
|
|
|
|
return entryContent
|
|
|
|
}
|
|
|
|
|
|
|
|
matches := doc.Find(`a[href^="mailto:"]`)
|
|
|
|
|
|
|
|
if matches.Length() > 0 {
|
|
|
|
matches.Each(func(i int, a *goquery.Selection) {
|
|
|
|
hrefAttr, _ := a.Attr("href")
|
|
|
|
|
|
|
|
mailto, err := url.Parse(hrefAttr)
|
|
|
|
if err != nil {
|
|
|
|
return
|
|
|
|
}
|
|
|
|
|
|
|
|
subject := mailto.Query().Get("subject")
|
|
|
|
if subject == "" {
|
|
|
|
return
|
|
|
|
}
|
|
|
|
|
|
|
|
a.AppendHtml(" [" + html.EscapeString(subject) + "]")
|
|
|
|
})
|
|
|
|
|
|
|
|
output, _ := doc.Find("body").First().Html()
|
|
|
|
return output
|
|
|
|
}
|
|
|
|
|
|
|
|
return entryContent
|
|
|
|
}
|
|
|
|
|
2018-07-09 01:22:48 -04:00
|
|
|
func addDynamicImage(entryURL, entryContent string) string {
|
|
|
|
doc, err := goquery.NewDocumentFromReader(strings.NewReader(entryContent))
|
|
|
|
if err != nil {
|
|
|
|
return entryContent
|
|
|
|
}
|
|
|
|
|
|
|
|
// Ordered most preferred to least preferred.
|
|
|
|
candidateAttrs := []string{
|
|
|
|
"data-src",
|
|
|
|
"data-original",
|
|
|
|
"data-orig",
|
|
|
|
"data-url",
|
|
|
|
"data-orig-file",
|
|
|
|
"data-large-file",
|
|
|
|
"data-medium-file",
|
|
|
|
"data-2000src",
|
|
|
|
"data-1000src",
|
|
|
|
"data-800src",
|
|
|
|
"data-655src",
|
|
|
|
"data-500src",
|
|
|
|
"data-380src",
|
|
|
|
}
|
|
|
|
|
2021-10-22 21:12:23 -04:00
|
|
|
candidateSrcsetAttrs := []string{
|
|
|
|
"data-srcset",
|
|
|
|
}
|
|
|
|
|
2018-07-09 01:22:48 -04:00
|
|
|
changed := false
|
|
|
|
|
|
|
|
doc.Find("img,div").Each(func(i int, img *goquery.Selection) {
|
2021-10-22 21:12:23 -04:00
|
|
|
// Src-linked candidates
|
2018-07-09 01:22:48 -04:00
|
|
|
for _, candidateAttr := range candidateAttrs {
|
|
|
|
if srcAttr, found := img.Attr(candidateAttr); found {
|
|
|
|
changed = true
|
|
|
|
|
|
|
|
if img.Is("img") {
|
2018-08-25 00:51:50 -04:00
|
|
|
img.SetAttr("src", srcAttr)
|
2018-07-09 01:22:48 -04:00
|
|
|
} else {
|
|
|
|
altAttr := img.AttrOr("alt", "")
|
|
|
|
img.ReplaceWithHtml(`<img src="` + srcAttr + `" alt="` + altAttr + `"/>`)
|
|
|
|
}
|
|
|
|
|
2018-08-25 00:51:50 -04:00
|
|
|
break
|
2018-07-09 01:22:48 -04:00
|
|
|
}
|
|
|
|
}
|
2021-10-22 21:12:23 -04:00
|
|
|
|
|
|
|
// Srcset-linked candidates
|
|
|
|
for _, candidateAttr := range candidateSrcsetAttrs {
|
|
|
|
if srcAttr, found := img.Attr(candidateAttr); found {
|
|
|
|
changed = true
|
|
|
|
|
|
|
|
if img.Is("img") {
|
|
|
|
img.SetAttr("srcset", srcAttr)
|
|
|
|
} else {
|
|
|
|
altAttr := img.AttrOr("alt", "")
|
|
|
|
img.ReplaceWithHtml(`<img srcset="` + srcAttr + `" alt="` + altAttr + `"/>`)
|
|
|
|
}
|
|
|
|
|
|
|
|
break
|
|
|
|
}
|
|
|
|
}
|
2018-07-09 01:22:48 -04:00
|
|
|
})
|
|
|
|
|
|
|
|
if !changed {
|
|
|
|
doc.Find("noscript").Each(func(i int, noscript *goquery.Selection) {
|
|
|
|
matches := imgRegex.FindAllString(noscript.Text(), 2)
|
|
|
|
|
|
|
|
if len(matches) == 1 {
|
|
|
|
changed = true
|
|
|
|
|
|
|
|
noscript.ReplaceWithHtml(matches[0])
|
|
|
|
}
|
|
|
|
})
|
|
|
|
}
|
|
|
|
|
|
|
|
if changed {
|
|
|
|
output, _ := doc.Find("body").First().Html()
|
|
|
|
return output
|
|
|
|
}
|
|
|
|
|
|
|
|
return entryContent
|
|
|
|
}
|
|
|
|
|
2020-09-30 01:22:25 -04:00
|
|
|
func fixMediumImages(entryURL, entryContent string) string {
|
|
|
|
doc, err := goquery.NewDocumentFromReader(strings.NewReader(entryContent))
|
|
|
|
if err != nil {
|
|
|
|
return entryContent
|
|
|
|
}
|
|
|
|
|
|
|
|
doc.Find("figure.paragraph-image").Each(func(i int, paragraphImage *goquery.Selection) {
|
|
|
|
noscriptElement := paragraphImage.Find("noscript")
|
2020-10-20 00:04:14 -04:00
|
|
|
if noscriptElement.Length() > 0 {
|
|
|
|
paragraphImage.ReplaceWithHtml(noscriptElement.Text())
|
|
|
|
}
|
|
|
|
})
|
|
|
|
|
|
|
|
output, _ := doc.Find("body").First().Html()
|
|
|
|
return output
|
|
|
|
}
|
|
|
|
|
|
|
|
func useNoScriptImages(entryURL, entryContent string) string {
|
|
|
|
doc, err := goquery.NewDocumentFromReader(strings.NewReader(entryContent))
|
|
|
|
if err != nil {
|
|
|
|
return entryContent
|
|
|
|
}
|
|
|
|
|
|
|
|
doc.Find("figure").Each(func(i int, figureElement *goquery.Selection) {
|
|
|
|
imgElement := figureElement.Find("img")
|
|
|
|
if imgElement.Length() > 0 {
|
|
|
|
noscriptElement := figureElement.Find("noscript")
|
|
|
|
if noscriptElement.Length() > 0 {
|
|
|
|
figureElement.PrependHtml(noscriptElement.Text())
|
|
|
|
imgElement.Remove()
|
|
|
|
noscriptElement.Remove()
|
|
|
|
}
|
|
|
|
}
|
2020-09-30 01:22:25 -04:00
|
|
|
})
|
|
|
|
|
|
|
|
output, _ := doc.Find("body").First().Html()
|
|
|
|
return output
|
|
|
|
}
|
|
|
|
|
2017-12-12 01:16:32 -05:00
|
|
|
func addYoutubeVideo(entryURL, entryContent string) string {
|
|
|
|
matches := youtubeRegex.FindStringSubmatch(entryURL)
|
|
|
|
|
|
|
|
if len(matches) == 2 {
|
|
|
|
video := `<iframe width="650" height="350" frameborder="0" src="https://www.youtube-nocookie.com/embed/` + matches[1] + `" allowfullscreen></iframe>`
|
2019-12-01 01:46:12 -05:00
|
|
|
return video + `<br>` + entryContent
|
2017-12-12 01:16:32 -05:00
|
|
|
}
|
|
|
|
return entryContent
|
|
|
|
}
|
2017-12-14 00:30:40 -05:00
|
|
|
|
2020-03-20 23:45:37 -04:00
|
|
|
func addYoutubeVideoUsingInvidiousPlayer(entryURL, entryContent string) string {
|
|
|
|
matches := youtubeRegex.FindStringSubmatch(entryURL)
|
|
|
|
|
|
|
|
if len(matches) == 2 {
|
2022-01-05 23:43:03 -05:00
|
|
|
video := `<iframe width="650" height="350" frameborder="0" src="https://` + config.Opts.InvidiousInstance() + `/embed/` + matches[1] + `" allowfullscreen></iframe>`
|
2020-03-20 23:45:37 -04:00
|
|
|
return video + `<br>` + entryContent
|
|
|
|
}
|
|
|
|
return entryContent
|
|
|
|
}
|
|
|
|
|
2022-01-03 10:47:10 -05:00
|
|
|
func addYoutubeVideoFromId(entryContent string) string {
|
|
|
|
matches := youtubeIdRegex.FindAllStringSubmatch(entryContent, -1)
|
|
|
|
if matches == nil {
|
|
|
|
return entryContent
|
|
|
|
}
|
|
|
|
sb := strings.Builder{}
|
|
|
|
for _, match := range matches {
|
|
|
|
if len(match) == 2 {
|
|
|
|
sb.WriteString(`<iframe width="650" height="350" frameborder="0" src="https://www.youtube-nocookie.com/embed/`)
|
|
|
|
sb.WriteString(match[1])
|
|
|
|
sb.WriteString(`" allowfullscreen></iframe><br>`)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
sb.WriteString(entryContent)
|
|
|
|
return sb.String()
|
|
|
|
}
|
|
|
|
|
2020-03-20 23:45:37 -04:00
|
|
|
func addInvidiousVideo(entryURL, entryContent string) string {
|
|
|
|
matches := invidioRegex.FindStringSubmatch(entryURL)
|
2020-09-06 16:41:42 -04:00
|
|
|
if len(matches) == 3 {
|
|
|
|
video := `<iframe width="650" height="350" frameborder="0" src="https://` + matches[1] + `/embed/` + matches[2] + `" allowfullscreen></iframe>`
|
2020-03-20 23:45:37 -04:00
|
|
|
return video + `<br>` + entryContent
|
|
|
|
}
|
|
|
|
return entryContent
|
|
|
|
}
|
|
|
|
|
2017-12-14 00:30:40 -05:00
|
|
|
// addPDFLink prepends a "PDF" link to the entry content when entryURL ends
// with ".pdf" (case-sensitive). Content for any other URL is returned
// unchanged.
func addPDFLink(entryURL, entryContent string) string {
	if !strings.HasSuffix(entryURL, ".pdf") {
		return entryContent
	}

	// entryURL comes from the feed; escape it so it cannot terminate the href
	// attribute.
	return fmt.Sprintf(`<a href="%s">PDF</a><br>%s`, html.EscapeString(entryURL), entryContent)
}
|
2018-10-08 23:47:10 -04:00
|
|
|
|
|
|
|
func replaceTextLinks(input string) string {
|
|
|
|
return textLinkRegex.ReplaceAllString(input, `<a href="${1}">${1}</a>`)
|
|
|
|
}
|
|
|
|
|
|
|
|
// replaceLineFeeds converts every "\n" in input into a "<br>" tag. Carriage
// returns are left in place.
func replaceLineFeeds(input string) string {
	const lineBreak = "<br>"
	return strings.ReplaceAll(input, "\n", lineBreak)
}
|
2020-11-25 17:51:54 -05:00
|
|
|
|
|
|
|
// replaceCustom replaces every match of the regular expression searchTerm in
// entryContent with replaceTerm, which may reference capture groups ($1,
// ${name}, ...). When searchTerm does not compile the content is returned
// unchanged.
func replaceCustom(entryContent string, searchTerm string, replaceTerm string) string {
	pattern, err := regexp.Compile(searchTerm)
	if err != nil {
		// An invalid user-supplied pattern is not fatal: skip the rule.
		return entryContent
	}
	return pattern.ReplaceAllString(entryContent, replaceTerm)
}
|
2021-09-01 17:42:23 -04:00
|
|
|
|
|
|
|
func removeCustom(entryContent string, selector string) string {
|
|
|
|
doc, err := goquery.NewDocumentFromReader(strings.NewReader(entryContent))
|
|
|
|
if err != nil {
|
|
|
|
return entryContent
|
|
|
|
}
|
|
|
|
|
|
|
|
doc.Find(selector).Remove()
|
|
|
|
|
|
|
|
output, _ := doc.Find("body").First().Html()
|
|
|
|
return output
|
|
|
|
}
|
2022-01-30 03:11:43 -05:00
|
|
|
|
|
|
|
// addCastopodEpisode prepends an iframe that loads "<entryURL>/embed/light",
// followed by a <br>, to the entry content.
func addCastopodEpisode(entryURL, entryContent string) string {
	// entryURL comes from the feed; escape it so it cannot terminate the src
	// attribute.
	player := `<iframe width="650" frameborder="0" src="` + html.EscapeString(entryURL) + `/embed/light"></iframe>`

	return player + `<br>` + entryContent
}
|
2022-05-25 23:44:04 -04:00
|
|
|
|
|
|
|
func applyFuncOnTextContent(entryContent string, selector string, repl func(string) string) string {
|
|
|
|
var treatChildren func(i int, s *goquery.Selection)
|
|
|
|
treatChildren = func(i int, s *goquery.Selection) {
|
|
|
|
if s.Nodes[0].Type == 1 {
|
|
|
|
s.ReplaceWithHtml(repl(s.Nodes[0].Data))
|
|
|
|
} else {
|
|
|
|
s.Contents().Each(treatChildren)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
doc, err := goquery.NewDocumentFromReader(strings.NewReader(entryContent))
|
|
|
|
if err != nil {
|
|
|
|
return entryContent
|
|
|
|
}
|
|
|
|
|
|
|
|
doc.Find(selector).Each(treatChildren)
|
|
|
|
|
|
|
|
output, _ := doc.Find("body").First().Html()
|
|
|
|
return output
|
|
|
|
}
|
|
|
|
|
|
|
|
// decodeBase64Content decodes the whitespace-trimmed entry content as
// standard base64 and returns the HTML-escaped result. Content that is not
// valid base64 is returned unchanged (untrimmed).
func decodeBase64Content(entryContent string) string {
	trimmed := strings.TrimSpace(entryContent)

	decoded, err := base64.StdEncoding.DecodeString(trimmed)
	if err != nil {
		return entryContent
	}

	return html.EscapeString(string(decoded))
}
|
2022-07-27 10:55:28 -04:00
|
|
|
|
|
|
|
func parseMarkdown(entryContent string) string {
|
|
|
|
var sb strings.Builder
|
|
|
|
md := goldmark.New(
|
|
|
|
goldmark.WithRendererOptions(
|
|
|
|
goldmarkhtml.WithUnsafe(),
|
|
|
|
),
|
|
|
|
)
|
|
|
|
|
|
|
|
if err := md.Convert([]byte(entryContent), &sb); err != nil {
|
|
|
|
return entryContent
|
|
|
|
}
|
|
|
|
|
|
|
|
return sb.String()
|
|
|
|
}
|
2023-03-31 14:23:31 -04:00
|
|
|
|
|
|
|
func removeTables(entryContent string) string {
|
|
|
|
doc, err := goquery.NewDocumentFromReader(strings.NewReader(entryContent))
|
|
|
|
if err != nil {
|
|
|
|
return entryContent
|
|
|
|
}
|
|
|
|
|
2023-04-01 05:02:58 -04:00
|
|
|
selectors := []string{"table", "tbody", "thead", "td", "th", "td"}
|
2023-03-31 14:23:31 -04:00
|
|
|
|
2023-04-01 05:02:58 -04:00
|
|
|
var loopElement *goquery.Selection
|
2023-03-31 14:23:31 -04:00
|
|
|
|
2023-04-01 05:02:58 -04:00
|
|
|
for _, selector := range selectors {
|
|
|
|
for {
|
|
|
|
loopElement = doc.Find(selector).First()
|
2023-03-31 14:23:31 -04:00
|
|
|
|
2023-04-01 05:02:58 -04:00
|
|
|
if loopElement.Length() == 0 {
|
|
|
|
break
|
|
|
|
}
|
2023-03-31 14:23:31 -04:00
|
|
|
|
2023-04-01 05:02:58 -04:00
|
|
|
innerHtml, err := loopElement.Html()
|
|
|
|
if err != nil {
|
|
|
|
break
|
|
|
|
}
|
2023-03-31 14:23:31 -04:00
|
|
|
|
2023-04-01 05:02:58 -04:00
|
|
|
loopElement.Parent().AppendHtml(innerHtml)
|
|
|
|
loopElement.Remove()
|
|
|
|
}
|
2023-03-31 14:23:31 -04:00
|
|
|
}
|
|
|
|
|
|
|
|
output, _ := doc.Find("body").First().Html()
|
|
|
|
return output
|
|
|
|
}
|
2023-04-08 05:02:36 -04:00
|
|
|
|
|
|
|
// removeClickbait tones down shouting titles: every whitespace-separated word
// keeps its first rune as written and has the rest lower-cased. Words are
// re-joined with single spaces, so runs of whitespace collapse.
func removeClickbait(entryTitle string) string {
	words := strings.Fields(entryTitle)

	for idx, word := range words {
		letters := []rune(word)
		if len(letters) <= 1 {
			// Single-rune words are kept exactly as they are.
			continue
		}
		// Leave the leading rune alone so an initial capital survives.
		words[idx] = string(letters[:1]) + strings.ToLower(string(letters[1:]))
	}

	return strings.Join(words, " ")
}
|