1
0
Fork 0

Handle various invalid date

This commit is contained in:
Frédéric Guillot 2020-11-16 21:31:39 -08:00
parent 4f4f573955
commit a108cb7808
4 changed files with 101 additions and 46 deletions

View file

@ -123,7 +123,7 @@ func (a *atom10Entry) entryDate() time.Time {
if dateText != "" {
result, err := date.Parse(dateText)
if err != nil {
logger.Error("atom: %v", err)
logger.Error("atom: %v (entry ID = %s)", err, a.ID)
return time.Now()
}

View file

@ -23,6 +23,7 @@ var dateFormats = []string{
time.RFC1123Z,
time.RFC1123,
time.ANSIC,
"Mon, January 2, 2006, 3:04 PM MST",
"Mon, January 2 2006 15:04:05 -0700",
"Mon, January 02, 2006, 15:04:05 MST",
"Mon, January 02, 2006 15:04:05 MST",
@ -37,6 +38,8 @@ var dateFormats = []string{
"Mon Jan 02, 2006 3:04 pm",
"Mon, Jan 02,2006 15:04:05 MST",
"Mon Jan 02 2006 15:04:05 -0700",
"Monday, 2. January 2006 - 15:04",
"Monday 02 January 2006",
"Monday, January 2, 2006 15:04:05 MST",
"Monday, January 2, 2006 03:04 PM",
"Monday, January 2, 2006",
@ -111,6 +114,11 @@ var dateFormats = []string{
"Mon, 02 Jan 2006",
"Mon, 02 Jan 06 15:04:05 MST",
"Mon, 02 Jan 2006 3:04 PM MST",
"Mon Jan 02 2006 15:04:05 MST",
"Mon, 01 02 2006 15:04:05 -0700",
"Mon, 2th Jan 2006 15:05:05 MST",
"Jan. 2, 2006, 3:04 a.m.",
"fri, 02 jan 2006 15:04:05 -0700",
"January 02 2006 03:04:05 PM",
"January 2, 2006 3:04 PM",
"January 2, 2006, 3:04 p.m.",
@ -145,6 +153,7 @@ var dateFormats = []string{
"2006-1-2T15:04:05Z",
"2006-1-2 15:04:05",
"2006-1-2",
"2006-01-02T15:04:05-07:00Z",
"2006-1-02T15:04:05Z",
"2006-01-02T15:04Z",
"2006-01-02T15:04-07:00",
@ -196,41 +205,106 @@ var dateFormats = []string{
"01/02/2006 - 15:04",
"01/02/2006",
"01-02-2006",
"Jan. 2006",
}
var invalidTimezoneReplacer = strings.NewReplacer(
"Europe/Brussels", "CET",
"GMT+0000 (Coordinated Universal Time)", "GMT",
)
var invalidLocalizedDateReplacer = strings.NewReplacer(
"Mo,", "Mon,",
"Di,", "Tue,",
"Mi,", "Wed,",
"Do,", "Thu,",
"Fr,", "Fri,",
"Sa,", "Sat,",
"So,", "Sun,",
"Mär ", "Mar ",
"Mai ", "May ",
"Okt ", "Oct ",
"Dez ", "Dec ",
"lun,", "Mon,",
"mar,", "Tue,",
"mer,", "Wed,",
"jeu,", "Thu,",
"ven,", "Fri,",
"sam,", "Sat,",
"dim,", "Sun,",
"lun.", "Mon",
"mar.", "Tue",
"mer.", "Wed",
"jeu.", "Thu",
"ven.", "Fri",
"sam.", "Sat",
"dim.", "Sun",
"Lundi,", "Monday,",
"Mardi,", "Tuesday,",
"Mercredi,", "Wednesday,",
"Jeudi,", "Thursday,",
"Vendredi,", "Friday,",
"Samedi,", "Saturday,",
"Dimanche,", "Sunday,",
"avr ", "Apr ",
"mai ", "May ",
"jui ", "Jun ",
"juin ", "June ",
"jan.", "January ",
"feb.", "February ",
"mars.", "March ",
"avril.", "April ",
"mai.", "May ",
"juin.", "June ",
"juil.", "july",
"août.", "august",
"sept.", "september",
"oct.", "october",
"nov.", "november",
"dec.", "december",
"Janvier", "January",
"Février", "February",
"Mars", "March",
"Avril", "April",
"Mai", "May",
"Juin", "June",
"Juillet", "July",
"Août", "August",
"Septembre", "September",
"Octobre", "October",
"Novembre", "November",
"Décembre", "December",
)
// Parse parses a given date string using a large
// list of commonly found feed date formats.
func Parse(ds string) (t time.Time, err error) {
timestamp, err := strconv.ParseInt(ds, 10, 64)
func Parse(rawInput string) (t time.Time, err error) {
timestamp, err := strconv.ParseInt(rawInput, 10, 64)
if err == nil {
return time.Unix(timestamp, 0), nil
}
ds = replaceNonEnglishWords(ds)
d := strings.TrimSpace(ds)
if d == "" {
return t, errors.New("date parser: empty value")
processedInput := invalidLocalizedDateReplacer.Replace(rawInput)
processedInput = invalidTimezoneReplacer.Replace(processedInput)
processedInput = strings.TrimSpace(processedInput)
if processedInput == "" {
return t, errors.New(`date parser: empty value`)
}
for _, layout := range dateFormats {
switch layout {
case time.RFC822, time.RFC850, time.RFC1123:
if t, err = parseLocalTimeDates(layout, d); err == nil {
if t, err = parseLocalTimeDates(layout, processedInput); err == nil {
return
}
}
if t, err = time.Parse(layout, d); err == nil {
if t, err = time.Parse(layout, processedInput); err == nil {
return
}
}
lastSpace := strings.LastIndex(ds, " ")
if lastSpace > 0 {
return Parse(ds[0:lastSpace])
}
err = fmt.Errorf(`date parser: failed to parse date "%s"`, ds)
err = fmt.Errorf(`date parser: failed to parse date "%s"`, rawInput)
return
}
@ -249,32 +323,3 @@ func parseLocalTimeDates(layout, ds string) (t time.Time, err error) {
return time.ParseInLocation(layout, ds, loc)
}
// Replace German and French dates to English.
func replaceNonEnglishWords(ds string) string {
r := strings.NewReplacer(
"Mo,", "Mon,",
"Di,", "Tue,",
"Mi,", "Wed,",
"Do,", "Thu,",
"Fr,", "Fri,",
"Sa,", "Sat,",
"So,", "Sun,",
"Mär ", "Mar ",
"Mai ", "May ",
"Okt ", "Oct ",
"Dez ", "Dec ",
"lun,", "Mon,",
"mar,", "Tue,",
"mer,", "Wed,",
"jeu,", "Thu,",
"ven,", "Fri,",
"sam,", "Sat,",
"dim,", "Sun,",
"avr ", "Apr ",
"mai ", "May ",
"jui ", "Jun ",
)
return r.Replace(ds)
}

View file

@ -133,11 +133,21 @@ func TestParseWeirdDateFormat(t *testing.T) {
"Mon, 30 Mar 2020 19:53 +0000",
"Mon, 03/30/2020 - 19:19",
"2018-12-12T12:12",
"2020-11-08T16:20:00-05:00Z",
"Nov. 16, 2020, 10:57 a.m.",
"Friday 06 November 2020",
"Mon, November 16, 2020, 11:12 PM EST",
"Lundi, 16. Novembre 2020 - 15:54",
"Thu Nov 12 2020 17:00:00 GMT+0000 (Coordinated Universal Time)",
"Sat, 11 04 2020 08:51:49 +0100",
"Mon, 16th Nov 2020 13:16:28 GMT",
"Nov. 2020",
"ven., 03 juil. 2020 15:09:58 +0000",
}
for _, date := range dates {
if _, err := Parse(date); err != nil {
t.Fatalf(`Unable to parse date: %q`, date)
t.Errorf(`Unable to parse date: %q`, date)
}
}
}

View file

@ -179,7 +179,7 @@ func (r *rssItem) entryDate() time.Time {
if value != "" {
result, err := date.Parse(value)
if err != nil {
logger.Error("rss: %v", err)
logger.Error("rss: %v (entry GUID = %s)", err, r.GUID)
return time.Now()
}