From e3eaaea15a533030b2193cbfa032bcb036e736d6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fr=C3=A9d=C3=A9ric=20Guillot?= Date: Wed, 1 Nov 2023 19:26:16 +0000 Subject: [PATCH] Update date parser to parse more invalid date formats --- internal/reader/atom/atom_03.go | 2 +- internal/reader/atom/atom_10.go | 2 +- internal/reader/date/parser.go | 6 +++++- internal/reader/date/parser_test.go | 6 +++++- internal/reader/json/json.go | 2 +- internal/reader/rdf/rdf.go | 2 +- internal/reader/rss/rss.go | 2 +- 7 files changed, 15 insertions(+), 7 deletions(-) diff --git a/internal/reader/atom/atom_03.go b/internal/reader/atom/atom_03.go index aff0af47..edcb83dc 100644 --- a/internal/reader/atom/atom_03.go +++ b/internal/reader/atom/atom_03.go @@ -126,7 +126,7 @@ func (a *atom03Entry) entryDate() time.Time { if dateText != "" { result, err := date.Parse(dateText) if err != nil { - slog.Warn("Unable to parse date from Atom 0.3 feed", + slog.Debug("Unable to parse date from Atom 0.3 feed", slog.String("date", dateText), slog.String("id", a.ID), slog.Any("error", err), diff --git a/internal/reader/atom/atom_10.go b/internal/reader/atom/atom_10.go index 5d561d43..afcb127d 100644 --- a/internal/reader/atom/atom_10.go +++ b/internal/reader/atom/atom_10.go @@ -144,7 +144,7 @@ func (a *atom10Entry) entryDate() time.Time { if dateText != "" { result, err := date.Parse(dateText) if err != nil { - slog.Warn("Unable to parse date from Atom 0.3 feed", + slog.Debug("Unable to parse date from Atom 0.3 feed", slog.String("date", dateText), slog.String("id", a.ID), slog.Any("error", err), diff --git a/internal/reader/date/parser.go b/internal/reader/date/parser.go index 850bc20d..c1f0b1e8 100644 --- a/internal/reader/date/parser.go +++ b/internal/reader/date/parser.go @@ -219,6 +219,10 @@ var dateFormats = []string{ "Mon, 2rd Jan 2006 15:04:05 MST", "Mon, 2nd Jan 2006 15:04:05 MST", "Mon, 2st Jan 2006 15:04:05 MST", + "Mon, Jan 02 2006 03:04:05 PM", + "Monday, January 2, 2006 - 15:04", + "01/02/06 15:04:05", + "02.01.06", } var invalidTimezoneReplacer = strings.NewReplacer( @@ -309,6 +313,7 @@ var invalidLocalizedDateReplacer = strings.NewReplacer( // Parse parses a given date string using a large // list of commonly found feed date formats. func Parse(rawInput string) (t time.Time, err error) { + rawInput = strings.TrimSpace(rawInput) timestamp, err := strconv.ParseInt(rawInput, 10, 64) if err == nil { return time.Unix(timestamp, 0), nil @@ -316,7 +321,6 @@ func Parse(rawInput string) (t time.Time, err error) { processedInput := invalidLocalizedDateReplacer.Replace(rawInput) processedInput = invalidTimezoneReplacer.Replace(processedInput) - processedInput = strings.TrimSpace(processedInput) if processedInput == "" { return t, errors.New(`date parser: empty value`) } diff --git a/internal/reader/date/parser_test.go b/internal/reader/date/parser_test.go index 3eaab431..6b7b2a89 100644 --- a/internal/reader/date/parser_test.go +++ b/internal/reader/date/parser_test.go @@ -214,11 +214,15 @@ func TestParseWeirdDateFormat(t *testing.T) { "Jun 23, 2023 19:00 GMT", "09/15/2014 4:20 pm PST", "Fri, 23rd Jun 2023 09:32:20 GMT", + "Sat, Oct 28 2023 08:28:28 PM", + "Monday, October 6, 2023 - 16:29\n", + "10/30/23 21:55:58", + "30.10.23", } for _, date := range dates { if _, err := Parse(date); err != nil { - t.Errorf(`Unable to parse date: %q`, date) + t.Errorf(`Unable to parse date: %q (%v)`, date, err) } } } diff --git a/internal/reader/json/json.go b/internal/reader/json/json.go index 6786af74..c6920947 100644 --- a/internal/reader/json/json.go +++ b/internal/reader/json/json.go @@ -110,7 +110,7 @@ func (j *jsonItem) GetDate() time.Time { if value != "" { d, err := date.Parse(value) if err != nil { - slog.Warn("Unable to parse date from JSON feed", + slog.Debug("Unable to parse date from JSON feed", slog.String("date", value), slog.String("url", j.URL), slog.Any("error", err), diff --git a/internal/reader/rdf/rdf.go b/internal/reader/rdf/rdf.go index fa89f60f..5a123a2f 100644 --- a/internal/reader/rdf/rdf.go +++ b/internal/reader/rdf/rdf.go @@ -100,7 +100,7 @@ func (r *rdfItem) entryDate() time.Time { if r.DublinCoreDate != "" { result, err := date.Parse(r.DublinCoreDate) if err != nil { - slog.Warn("Unable to parse date from RDF feed", + slog.Debug("Unable to parse date from RDF feed", slog.String("date", r.DublinCoreDate), slog.String("link", r.Link), slog.Any("error", err), diff --git a/internal/reader/rss/rss.go b/internal/reader/rss/rss.go index 52488840..963b2d10 100644 --- a/internal/reader/rss/rss.go +++ b/internal/reader/rss/rss.go @@ -235,7 +235,7 @@ func (r *rssItem) entryDate() time.Time { if value != "" { result, err := date.Parse(value) if err != nil { - slog.Warn("Unable to parse date from RSS feed", + slog.Debug("Unable to parse date from RSS feed", slog.String("date", value), slog.String("guid", r.GUID.Data), slog.Any("error", err),