1
0
Fork 0

Handle various invalid date formats

This commit is contained in:
Frédéric Guillot 2020-11-16 21:31:39 -08:00
parent 4f4f573955
commit a108cb7808
4 changed files with 101 additions and 46 deletions

View file

@ -123,7 +123,7 @@ func (a *atom10Entry) entryDate() time.Time {
if dateText != "" { if dateText != "" {
result, err := date.Parse(dateText) result, err := date.Parse(dateText)
if err != nil { if err != nil {
logger.Error("atom: %v", err) logger.Error("atom: %v (entry ID = %s)", err, a.ID)
return time.Now() return time.Now()
} }

View file

@ -23,6 +23,7 @@ var dateFormats = []string{
time.RFC1123Z, time.RFC1123Z,
time.RFC1123, time.RFC1123,
time.ANSIC, time.ANSIC,
"Mon, January 2, 2006, 3:04 PM MST",
"Mon, January 2 2006 15:04:05 -0700", "Mon, January 2 2006 15:04:05 -0700",
"Mon, January 02, 2006, 15:04:05 MST", "Mon, January 02, 2006, 15:04:05 MST",
"Mon, January 02, 2006 15:04:05 MST", "Mon, January 02, 2006 15:04:05 MST",
@ -37,6 +38,8 @@ var dateFormats = []string{
"Mon Jan 02, 2006 3:04 pm", "Mon Jan 02, 2006 3:04 pm",
"Mon, Jan 02,2006 15:04:05 MST", "Mon, Jan 02,2006 15:04:05 MST",
"Mon Jan 02 2006 15:04:05 -0700", "Mon Jan 02 2006 15:04:05 -0700",
"Monday, 2. January 2006 - 15:04",
"Monday 02 January 2006",
"Monday, January 2, 2006 15:04:05 MST", "Monday, January 2, 2006 15:04:05 MST",
"Monday, January 2, 2006 03:04 PM", "Monday, January 2, 2006 03:04 PM",
"Monday, January 2, 2006", "Monday, January 2, 2006",
@ -111,6 +114,11 @@ var dateFormats = []string{
"Mon, 02 Jan 2006", "Mon, 02 Jan 2006",
"Mon, 02 Jan 06 15:04:05 MST", "Mon, 02 Jan 06 15:04:05 MST",
"Mon, 02 Jan 2006 3:04 PM MST", "Mon, 02 Jan 2006 3:04 PM MST",
"Mon Jan 02 2006 15:04:05 MST",
"Mon, 01 02 2006 15:04:05 -0700",
"Mon, 2th Jan 2006 15:05:05 MST",
"Jan. 2, 2006, 3:04 a.m.",
"fri, 02 jan 2006 15:04:05 -0700",
"January 02 2006 03:04:05 PM", "January 02 2006 03:04:05 PM",
"January 2, 2006 3:04 PM", "January 2, 2006 3:04 PM",
"January 2, 2006, 3:04 p.m.", "January 2, 2006, 3:04 p.m.",
@ -145,6 +153,7 @@ var dateFormats = []string{
"2006-1-2T15:04:05Z", "2006-1-2T15:04:05Z",
"2006-1-2 15:04:05", "2006-1-2 15:04:05",
"2006-1-2", "2006-1-2",
"2006-01-02T15:04:05-07:00Z",
"2006-1-02T15:04:05Z", "2006-1-02T15:04:05Z",
"2006-01-02T15:04Z", "2006-01-02T15:04Z",
"2006-01-02T15:04-07:00", "2006-01-02T15:04-07:00",
@ -196,63 +205,15 @@ var dateFormats = []string{
"01/02/2006 - 15:04", "01/02/2006 - 15:04",
"01/02/2006", "01/02/2006",
"01-02-2006", "01-02-2006",
"Jan. 2006",
} }
// Parse parses a given date string using a large var invalidTimezoneReplacer = strings.NewReplacer(
// list of commonly found feed date formats. "Europe/Brussels", "CET",
func Parse(ds string) (t time.Time, err error) { "GMT+0000 (Coordinated Universal Time)", "GMT",
timestamp, err := strconv.ParseInt(ds, 10, 64) )
if err == nil {
return time.Unix(timestamp, 0), nil
}
ds = replaceNonEnglishWords(ds) var invalidLocalizedDateReplacer = strings.NewReplacer(
d := strings.TrimSpace(ds)
if d == "" {
return t, errors.New("date parser: empty value")
}
for _, layout := range dateFormats {
switch layout {
case time.RFC822, time.RFC850, time.RFC1123:
if t, err = parseLocalTimeDates(layout, d); err == nil {
return
}
}
if t, err = time.Parse(layout, d); err == nil {
return
}
}
lastSpace := strings.LastIndex(ds, " ")
if lastSpace > 0 {
return Parse(ds[0:lastSpace])
}
err = fmt.Errorf(`date parser: failed to parse date "%s"`, ds)
return
}
// According to Golang documentation:
//
// RFC822, RFC850, and RFC1123 formats should be applied only to local times.
// Applying them to UTC times will use "UTC" as the time zone abbreviation,
// while strictly speaking those RFCs require the use of "GMT" in that case.
func parseLocalTimeDates(layout, ds string) (t time.Time, err error) {
loc := time.UTC
// Workaround for dates that don't use GMT.
if strings.HasSuffix(ds, "PST") || strings.HasSuffix(ds, "PDT") {
loc, _ = time.LoadLocation("America/Los_Angeles")
}
return time.ParseInLocation(layout, ds, loc)
}
// Replace German and French dates to English.
func replaceNonEnglishWords(ds string) string {
r := strings.NewReplacer(
"Mo,", "Mon,", "Mo,", "Mon,",
"Di,", "Tue,", "Di,", "Tue,",
"Mi,", "Wed,", "Mi,", "Wed,",
@ -271,10 +232,94 @@ func replaceNonEnglishWords(ds string) string {
"ven,", "Fri,", "ven,", "Fri,",
"sam,", "Sat,", "sam,", "Sat,",
"dim,", "Sun,", "dim,", "Sun,",
"lun.", "Mon",
"mar.", "Tue",
"mer.", "Wed",
"jeu.", "Thu",
"ven.", "Fri",
"sam.", "Sat",
"dim.", "Sun",
"Lundi,", "Monday,",
"Mardi,", "Tuesday,",
"Mercredi,", "Wednesday,",
"Jeudi,", "Thursday,",
"Vendredi,", "Friday,",
"Samedi,", "Saturday,",
"Dimanche,", "Sunday,",
"avr ", "Apr ", "avr ", "Apr ",
"mai ", "May ", "mai ", "May ",
"jui ", "Jun ", "jui ", "Jun ",
) "juin ", "June ",
"jan.", "January ",
"feb.", "February ",
"mars.", "March ",
"avril.", "April ",
"mai.", "May ",
"juin.", "June ",
"juil.", "july",
"août.", "august",
"sept.", "september",
"oct.", "october",
"nov.", "november",
"dec.", "december",
"Janvier", "January",
"Février", "February",
"Mars", "March",
"Avril", "April",
"Mai", "May",
"Juin", "June",
"Juillet", "July",
"Août", "August",
"Septembre", "September",
"Octobre", "October",
"Novembre", "November",
"Décembre", "December",
)
return r.Replace(ds) // Parse parses a given date string using a large
// list of commonly found feed date formats.
func Parse(rawInput string) (t time.Time, err error) {
	// Some feeds publish a bare Unix timestamp. strconv.ParseInt rejects any
	// surrounding whitespace, so trim before trying the numeric form.
	if timestamp, perr := strconv.ParseInt(strings.TrimSpace(rawInput), 10, 64); perr == nil {
		return time.Unix(timestamp, 0), nil
	}

	// Rewrite localized day/month names and non-standard time zone spellings
	// so that the layouts in dateFormats have a chance to match. The
	// replacers run on the raw input because some of their patterns end with
	// a space; trimming is done afterwards.
	processedInput := invalidLocalizedDateReplacer.Replace(rawInput)
	processedInput = invalidTimezoneReplacer.Replace(processedInput)
	processedInput = strings.TrimSpace(processedInput)
	if processedInput == "" {
		return time.Time{}, errors.New(`date parser: empty value`)
	}

	for _, layout := range dateFormats {
		switch layout {
		case time.RFC822, time.RFC850, time.RFC1123:
			// These layouts carry a zone abbreviation; try them first with an
			// explicit location, then fall through to the plain parse below.
			if t, err = parseLocalTimeDates(layout, processedInput); err == nil {
				return t, nil
			}
		}

		if t, err = time.Parse(layout, processedInput); err == nil {
			return t, nil
		}
	}

	// %q keeps control characters from feed content out of the message.
	return time.Time{}, fmt.Errorf(`date parser: failed to parse date %q`, rawInput)
}
// parseLocalTimeDates parses ds using layout, interpreting the value in a
// location chosen from the suffix of ds (UTC unless stated otherwise below).
//
// According to the Go documentation:
//
// RFC822, RFC850, and RFC1123 formats should be applied only to local times.
// Applying them to UTC times will use "UTC" as the time zone abbreviation,
// while strictly speaking those RFCs require the use of "GMT" in that case.
//
// It returns the parsed time, or the error reported by time.ParseInLocation.
func parseLocalTimeDates(layout, ds string) (t time.Time, err error) {
	loc := time.UTC

	// Workaround for dates that don't use GMT.
	if strings.HasSuffix(ds, "PST") || strings.HasSuffix(ds, "PDT") {
		// The time zone database may be unavailable on the host. Keep UTC in
		// that case instead of handing a nil location to the parser.
		if losAngeles, loadErr := time.LoadLocation("America/Los_Angeles"); loadErr == nil {
			loc = losAngeles
		}
	}

	return time.ParseInLocation(layout, ds, loc)
}

View file

@ -133,11 +133,21 @@ func TestParseWeirdDateFormat(t *testing.T) {
"Mon, 30 Mar 2020 19:53 +0000", "Mon, 30 Mar 2020 19:53 +0000",
"Mon, 03/30/2020 - 19:19", "Mon, 03/30/2020 - 19:19",
"2018-12-12T12:12", "2018-12-12T12:12",
"2020-11-08T16:20:00-05:00Z",
"Nov. 16, 2020, 10:57 a.m.",
"Friday 06 November 2020",
"Mon, November 16, 2020, 11:12 PM EST",
"Lundi, 16. Novembre 2020 - 15:54",
"Thu Nov 12 2020 17:00:00 GMT+0000 (Coordinated Universal Time)",
"Sat, 11 04 2020 08:51:49 +0100",
"Mon, 16th Nov 2020 13:16:28 GMT",
"Nov. 2020",
"ven., 03 juil. 2020 15:09:58 +0000",
} }
for _, date := range dates { for _, date := range dates {
if _, err := Parse(date); err != nil { if _, err := Parse(date); err != nil {
t.Fatalf(`Unable to parse date: %q`, date) t.Errorf(`Unable to parse date: %q`, date)
} }
} }
} }

View file

@ -179,7 +179,7 @@ func (r *rssItem) entryDate() time.Time {
if value != "" { if value != "" {
result, err := date.Parse(value) result, err := date.Parse(value)
if err != nil { if err != nil {
logger.Error("rss: %v", err) logger.Error("rss: %v (entry GUID = %s)", err, r.GUID)
return time.Now() return time.Now()
} }