Handle various invalid date formats
This commit is contained in:
parent
4f4f573955
commit
a108cb7808
4 changed files with 101 additions and 46 deletions
|
@ -123,7 +123,7 @@ func (a *atom10Entry) entryDate() time.Time {
|
||||||
if dateText != "" {
|
if dateText != "" {
|
||||||
result, err := date.Parse(dateText)
|
result, err := date.Parse(dateText)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
logger.Error("atom: %v", err)
|
logger.Error("atom: %v (entry ID = %s)", err, a.ID)
|
||||||
return time.Now()
|
return time.Now()
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -23,6 +23,7 @@ var dateFormats = []string{
|
||||||
time.RFC1123Z,
|
time.RFC1123Z,
|
||||||
time.RFC1123,
|
time.RFC1123,
|
||||||
time.ANSIC,
|
time.ANSIC,
|
||||||
|
"Mon, January 2, 2006, 3:04 PM MST",
|
||||||
"Mon, January 2 2006 15:04:05 -0700",
|
"Mon, January 2 2006 15:04:05 -0700",
|
||||||
"Mon, January 02, 2006, 15:04:05 MST",
|
"Mon, January 02, 2006, 15:04:05 MST",
|
||||||
"Mon, January 02, 2006 15:04:05 MST",
|
"Mon, January 02, 2006 15:04:05 MST",
|
||||||
|
@ -37,6 +38,8 @@ var dateFormats = []string{
|
||||||
"Mon Jan 02, 2006 3:04 pm",
|
"Mon Jan 02, 2006 3:04 pm",
|
||||||
"Mon, Jan 02,2006 15:04:05 MST",
|
"Mon, Jan 02,2006 15:04:05 MST",
|
||||||
"Mon Jan 02 2006 15:04:05 -0700",
|
"Mon Jan 02 2006 15:04:05 -0700",
|
||||||
|
"Monday, 2. January 2006 - 15:04",
|
||||||
|
"Monday 02 January 2006",
|
||||||
"Monday, January 2, 2006 15:04:05 MST",
|
"Monday, January 2, 2006 15:04:05 MST",
|
||||||
"Monday, January 2, 2006 03:04 PM",
|
"Monday, January 2, 2006 03:04 PM",
|
||||||
"Monday, January 2, 2006",
|
"Monday, January 2, 2006",
|
||||||
|
@ -111,6 +114,11 @@ var dateFormats = []string{
|
||||||
"Mon, 02 Jan 2006",
|
"Mon, 02 Jan 2006",
|
||||||
"Mon, 02 Jan 06 15:04:05 MST",
|
"Mon, 02 Jan 06 15:04:05 MST",
|
||||||
"Mon, 02 Jan 2006 3:04 PM MST",
|
"Mon, 02 Jan 2006 3:04 PM MST",
|
||||||
|
"Mon Jan 02 2006 15:04:05 MST",
|
||||||
|
"Mon, 01 02 2006 15:04:05 -0700",
|
||||||
|
"Mon, 2th Jan 2006 15:05:05 MST",
|
||||||
|
"Jan. 2, 2006, 3:04 a.m.",
|
||||||
|
"fri, 02 jan 2006 15:04:05 -0700",
|
||||||
"January 02 2006 03:04:05 PM",
|
"January 02 2006 03:04:05 PM",
|
||||||
"January 2, 2006 3:04 PM",
|
"January 2, 2006 3:04 PM",
|
||||||
"January 2, 2006, 3:04 p.m.",
|
"January 2, 2006, 3:04 p.m.",
|
||||||
|
@ -145,6 +153,7 @@ var dateFormats = []string{
|
||||||
"2006-1-2T15:04:05Z",
|
"2006-1-2T15:04:05Z",
|
||||||
"2006-1-2 15:04:05",
|
"2006-1-2 15:04:05",
|
||||||
"2006-1-2",
|
"2006-1-2",
|
||||||
|
"2006-01-02T15:04:05-07:00Z",
|
||||||
"2006-1-02T15:04:05Z",
|
"2006-1-02T15:04:05Z",
|
||||||
"2006-01-02T15:04Z",
|
"2006-01-02T15:04Z",
|
||||||
"2006-01-02T15:04-07:00",
|
"2006-01-02T15:04-07:00",
|
||||||
|
@ -196,63 +205,15 @@ var dateFormats = []string{
|
||||||
"01/02/2006 - 15:04",
|
"01/02/2006 - 15:04",
|
||||||
"01/02/2006",
|
"01/02/2006",
|
||||||
"01-02-2006",
|
"01-02-2006",
|
||||||
|
"Jan. 2006",
|
||||||
}
|
}
|
||||||
|
|
||||||
// Parse parses a given date string using a large
|
var invalidTimezoneReplacer = strings.NewReplacer(
|
||||||
// list of commonly found feed date formats.
|
"Europe/Brussels", "CET",
|
||||||
func Parse(ds string) (t time.Time, err error) {
|
"GMT+0000 (Coordinated Universal Time)", "GMT",
|
||||||
timestamp, err := strconv.ParseInt(ds, 10, 64)
|
)
|
||||||
if err == nil {
|
|
||||||
return time.Unix(timestamp, 0), nil
|
|
||||||
}
|
|
||||||
|
|
||||||
ds = replaceNonEnglishWords(ds)
|
var invalidLocalizedDateReplacer = strings.NewReplacer(
|
||||||
d := strings.TrimSpace(ds)
|
|
||||||
if d == "" {
|
|
||||||
return t, errors.New("date parser: empty value")
|
|
||||||
}
|
|
||||||
|
|
||||||
for _, layout := range dateFormats {
|
|
||||||
switch layout {
|
|
||||||
case time.RFC822, time.RFC850, time.RFC1123:
|
|
||||||
if t, err = parseLocalTimeDates(layout, d); err == nil {
|
|
||||||
return
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if t, err = time.Parse(layout, d); err == nil {
|
|
||||||
return
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
lastSpace := strings.LastIndex(ds, " ")
|
|
||||||
if lastSpace > 0 {
|
|
||||||
return Parse(ds[0:lastSpace])
|
|
||||||
}
|
|
||||||
|
|
||||||
err = fmt.Errorf(`date parser: failed to parse date "%s"`, ds)
|
|
||||||
return
|
|
||||||
}
|
|
||||||
|
|
||||||
// According to Golang documentation:
|
|
||||||
//
|
|
||||||
// RFC822, RFC850, and RFC1123 formats should be applied only to local times.
|
|
||||||
// Applying them to UTC times will use "UTC" as the time zone abbreviation,
|
|
||||||
// while strictly speaking those RFCs require the use of "GMT" in that case.
|
|
||||||
func parseLocalTimeDates(layout, ds string) (t time.Time, err error) {
|
|
||||||
loc := time.UTC
|
|
||||||
|
|
||||||
// Workaround for dates that don't use GMT.
|
|
||||||
if strings.HasSuffix(ds, "PST") || strings.HasSuffix(ds, "PDT") {
|
|
||||||
loc, _ = time.LoadLocation("America/Los_Angeles")
|
|
||||||
}
|
|
||||||
|
|
||||||
return time.ParseInLocation(layout, ds, loc)
|
|
||||||
}
|
|
||||||
|
|
||||||
// Replace German and French day and month names with their English equivalents.
|
|
||||||
func replaceNonEnglishWords(ds string) string {
|
|
||||||
r := strings.NewReplacer(
|
|
||||||
"Mo,", "Mon,",
|
"Mo,", "Mon,",
|
||||||
"Di,", "Tue,",
|
"Di,", "Tue,",
|
||||||
"Mi,", "Wed,",
|
"Mi,", "Wed,",
|
||||||
|
@ -271,10 +232,94 @@ func replaceNonEnglishWords(ds string) string {
|
||||||
"ven,", "Fri,",
|
"ven,", "Fri,",
|
||||||
"sam,", "Sat,",
|
"sam,", "Sat,",
|
||||||
"dim,", "Sun,",
|
"dim,", "Sun,",
|
||||||
|
"lun.", "Mon",
|
||||||
|
"mar.", "Tue",
|
||||||
|
"mer.", "Wed",
|
||||||
|
"jeu.", "Thu",
|
||||||
|
"ven.", "Fri",
|
||||||
|
"sam.", "Sat",
|
||||||
|
"dim.", "Sun",
|
||||||
|
"Lundi,", "Monday,",
|
||||||
|
"Mardi,", "Tuesday,",
|
||||||
|
"Mercredi,", "Wednesday,",
|
||||||
|
"Jeudi,", "Thursday,",
|
||||||
|
"Vendredi,", "Friday,",
|
||||||
|
"Samedi,", "Saturday,",
|
||||||
|
"Dimanche,", "Sunday,",
|
||||||
"avr ", "Apr ",
|
"avr ", "Apr ",
|
||||||
"mai ", "May ",
|
"mai ", "May ",
|
||||||
"jui ", "Jun ",
|
"jui ", "Jun ",
|
||||||
)
|
"juin ", "June ",
|
||||||
|
"jan.", "January ",
|
||||||
|
"feb.", "February ",
|
||||||
|
"mars.", "March ",
|
||||||
|
"avril.", "April ",
|
||||||
|
"mai.", "May ",
|
||||||
|
"juin.", "June ",
|
||||||
|
"juil.", "july",
|
||||||
|
"août.", "august",
|
||||||
|
"sept.", "september",
|
||||||
|
"oct.", "october",
|
||||||
|
"nov.", "november",
|
||||||
|
"dec.", "december",
|
||||||
|
"Janvier", "January",
|
||||||
|
"Février", "February",
|
||||||
|
"Mars", "March",
|
||||||
|
"Avril", "April",
|
||||||
|
"Mai", "May",
|
||||||
|
"Juin", "June",
|
||||||
|
"Juillet", "July",
|
||||||
|
"Août", "August",
|
||||||
|
"Septembre", "September",
|
||||||
|
"Octobre", "October",
|
||||||
|
"Novembre", "November",
|
||||||
|
"Décembre", "December",
|
||||||
|
)
|
||||||
|
|
||||||
return r.Replace(ds)
|
// Parse parses a given date string using a large
|
||||||
|
// list of commonly found feed date formats.
|
||||||
|
func Parse(rawInput string) (t time.Time, err error) {
|
||||||
|
timestamp, err := strconv.ParseInt(rawInput, 10, 64)
|
||||||
|
if err == nil {
|
||||||
|
return time.Unix(timestamp, 0), nil
|
||||||
|
}
|
||||||
|
|
||||||
|
processedInput := invalidLocalizedDateReplacer.Replace(rawInput)
|
||||||
|
processedInput = invalidTimezoneReplacer.Replace(processedInput)
|
||||||
|
processedInput = strings.TrimSpace(processedInput)
|
||||||
|
if processedInput == "" {
|
||||||
|
return t, errors.New(`date parser: empty value`)
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, layout := range dateFormats {
|
||||||
|
switch layout {
|
||||||
|
case time.RFC822, time.RFC850, time.RFC1123:
|
||||||
|
if t, err = parseLocalTimeDates(layout, processedInput); err == nil {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if t, err = time.Parse(layout, processedInput); err == nil {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
err = fmt.Errorf(`date parser: failed to parse date "%s"`, rawInput)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
// According to Golang documentation:
|
||||||
|
//
|
||||||
|
// RFC822, RFC850, and RFC1123 formats should be applied only to local times.
|
||||||
|
// Applying them to UTC times will use "UTC" as the time zone abbreviation,
|
||||||
|
// while strictly speaking those RFCs require the use of "GMT" in that case.
|
||||||
|
func parseLocalTimeDates(layout, ds string) (t time.Time, err error) {
|
||||||
|
loc := time.UTC
|
||||||
|
|
||||||
|
// Workaround for dates that don't use GMT.
|
||||||
|
if strings.HasSuffix(ds, "PST") || strings.HasSuffix(ds, "PDT") {
|
||||||
|
loc, _ = time.LoadLocation("America/Los_Angeles")
|
||||||
|
}
|
||||||
|
|
||||||
|
return time.ParseInLocation(layout, ds, loc)
|
||||||
}
|
}
|
||||||
|
|
|
@ -133,11 +133,21 @@ func TestParseWeirdDateFormat(t *testing.T) {
|
||||||
"Mon, 30 Mar 2020 19:53 +0000",
|
"Mon, 30 Mar 2020 19:53 +0000",
|
||||||
"Mon, 03/30/2020 - 19:19",
|
"Mon, 03/30/2020 - 19:19",
|
||||||
"2018-12-12T12:12",
|
"2018-12-12T12:12",
|
||||||
|
"2020-11-08T16:20:00-05:00Z",
|
||||||
|
"Nov. 16, 2020, 10:57 a.m.",
|
||||||
|
"Friday 06 November 2020",
|
||||||
|
"Mon, November 16, 2020, 11:12 PM EST",
|
||||||
|
"Lundi, 16. Novembre 2020 - 15:54",
|
||||||
|
"Thu Nov 12 2020 17:00:00 GMT+0000 (Coordinated Universal Time)",
|
||||||
|
"Sat, 11 04 2020 08:51:49 +0100",
|
||||||
|
"Mon, 16th Nov 2020 13:16:28 GMT",
|
||||||
|
"Nov. 2020",
|
||||||
|
"ven., 03 juil. 2020 15:09:58 +0000",
|
||||||
}
|
}
|
||||||
|
|
||||||
for _, date := range dates {
|
for _, date := range dates {
|
||||||
if _, err := Parse(date); err != nil {
|
if _, err := Parse(date); err != nil {
|
||||||
t.Fatalf(`Unable to parse date: %q`, date)
|
t.Errorf(`Unable to parse date: %q`, date)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -179,7 +179,7 @@ func (r *rssItem) entryDate() time.Time {
|
||||||
if value != "" {
|
if value != "" {
|
||||||
result, err := date.Parse(value)
|
result, err := date.Parse(value)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
logger.Error("rss: %v", err)
|
logger.Error("rss: %v (entry GUID = %s)", err, r.GUID)
|
||||||
return time.Now()
|
return time.Now()
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
Loading…
Reference in a new issue