miniflux/internal/reader/date/parser.go
jvoisin 040938ff6d Small refactoring of internal/reader/date/parser.go
- Split date formats into those that require local times
  and those that don't, so that there is no need for a switch-case in the
  for loop, which runs at most around 250 iterations.
- Be more strict when it comes to timezones: previously, invalid ones like -13
  were accepted. Also add a test for this.
- Bail out early if the date is an empty string.
2024-02-26 18:08:04 -08:00

375 lines
9.9 KiB
Go
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

// SPDX-FileCopyrightText: Copyright The Miniflux Authors. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
package date // import "miniflux.app/v2/internal/reader/date"
import (
"errors"
"fmt"
"strconv"
"strings"
"time"
)
// dateFormatsLocalTimesOnly lists the layouts that, per the Go time package
// documentation, should be applied only to local times: RFC822 (used by RSS),
// RFC850, and RFC1123. They are kept apart from dateFormats so they can be
// parsed with an explicit reference location.
var dateFormatsLocalTimesOnly = []string{time.RFC822, time.RFC850, time.RFC1123}
// dateFormats taken from github.com/mjibson/goread
var dateFormats = []string{
time.RFC822Z, // RSS
time.RFC3339, // Atom
time.UnixDate,
time.RubyDate,
time.RFC1123Z,
time.ANSIC,
"Mon, 02 Jan 2006 15:04:05 MST -07:00",
"Mon, January 2, 2006, 3:04 PM MST",
"Mon, January 2 2006 15:04:05 -0700",
"Mon, January 02, 2006, 15:04:05 MST",
"Mon, January 02, 2006 15:04:05 MST",
"Mon, Jan 2, 2006 15:04 MST",
"Mon, Jan 2 2006 15:04 MST",
"Mon, Jan 2 2006 15:04:05 MST",
"Mon, Jan 2, 2006 15:04:05 MST",
"Mon, Jan 2 2006 15:04:05 -700",
"Mon, Jan 2 2006 15:04:05 -0700",
"Mon Jan 2 15:04 2006",
"Mon Jan 2 15:04:05 2006 MST",
"Mon Jan 02, 2006 3:04 pm",
"Mon, Jan 02,2006 15:04:05 MST",
"Mon Jan 02 2006 15:04:05 -0700",
"Mon, 02/01/2006",
"Monday, 2. January 2006 - 15:04",
"Monday 02 January 2006",
"Monday, January 2, 2006 15:04:05 MST",
"Monday, January 2, 2006 03:04 PM",
"Monday, January 2, 2006",
"Monday, January 02, 2006",
"Monday, 2 January 2006 15:04:05 MST",
"Monday, 2 January 2006 15:04:05 -0700",
"Monday, 2 Jan 2006 15:04:05 MST",
"Monday, 2 Jan 2006 15:04:05 -0700",
"Monday, 02 January 2006 15:04:05 MST",
"Monday, 02 January 2006 15:04:05 -0700",
"Monday, 02 January 2006 15:04:05",
"Monday, January 02, 2006 - 3:04pm",
"Monday, January 2, 2006 - 3:04pm",
"Mon, 01/02/2006 - 15:04",
"Mon, 2 January 2006 15:04 MST",
"Mon, 2 January 2006, 15:04 -0700",
"Mon, 2 January 2006, 15:04:05 MST",
"Mon, 2 January 2006 15:04:05 MST",
"Mon, 2 January 2006 15:04:05 -0700",
"Mon, 2 January 2006",
"Mon, 2 Jan 2006 3:04:05 PM -0700",
"Mon, 2 Jan 2006 15:4:5 MST",
"Mon, 2 Jan 2006 15:4:5 -0700 GMT",
"Mon, 2, Jan 2006 15:4",
"Mon, 2 Jan 2006 15:04 MST",
"Mon, 2 Jan 2006, 15:04 -0700",
"Mon, 2 Jan 2006 15:04 -0700",
"Mon, 2 Jan 2006 15:04:05 UT",
"Mon, 2 Jan 2006 15:04:05MST",
"Mon, 2 Jan 2006 15:04:05 MST",
"Mon 2 Jan 2006 15:04:05 MST",
"mon,2 Jan 2006 15:04:05 MST",
"Mon, 2 Jan 2006 15:04:05 -0700 MST",
"Mon, 2 Jan 2006 15:04:05-0700",
"Mon, 2 Jan 2006 15:04:05 -0700",
"Mon, 2 Jan 2006 15:04:05",
"Mon, 2 Jan 2006 15:04",
"Mon, 02 Jan 2006, 15:04",
"Mon, 2 Jan 2006, 15:04",
"Mon,2 Jan 2006",
"Mon, 2 Jan 2006",
"Mon, 2 Jan 15:04:05 MST",
"Mon, 2 Jan 06 15:04:05 MST",
"Mon, 2 Jan 06 15:04:05 -0700",
"Mon, 2006-01-02 15:04",
"Mon,02 January 2006 14:04:05 MST",
"Mon, 02 January 2006",
"Mon, 02 Jan 2006 3:04:05 PM MST",
"Mon, 02 Jan 2006 15 -0700",
"Mon,02 Jan 2006 15:04 MST",
"Mon, 02 Jan 2006 15:04 MST",
"Mon, 02 Jan 2006 15:04 -0700",
"Mon, 02 Jan 2006 15:04:05 Z",
"Mon, 02 Jan 2006 15:04:05 UT",
"Mon, 02 Jan 2006 15:04:05 MST-07:00",
"Mon, 02 Jan 2006 15:04:05 MST -0700",
"Mon, 02 Jan 2006, 15:04:05 MST",
"Mon, 02 Jan 2006 15:04:05MST",
"Mon, 02 Jan 2006 15:04:05 MST",
"Mon , 02 Jan 2006 15:04:05 MST",
"Mon, 02 Jan 2006 15:04:05 GMT-0700",
"Mon,02 Jan 2006 15:04:05 -0700",
"Mon, 02 Jan 2006 15:04:05 -0700",
"Mon, 02 Jan 2006 15:04:05 -07:00",
"Mon, 02 Jan 2006 15:04:05 --0700",
"Mon 02 Jan 2006 15:04:05 -0700",
"Mon 02 Jan 2006, 15:04:05 MST",
"Mon, 02 Jan 2006 15:04:05 MST",
"Mon, 02 Jan 2006 15:04:05 -07",
"Mon, 02 Jan 2006 15:04:05 00",
"Mon, 02 Jan 2006 15:04:05",
"Mon, 02 Jan 2006",
"Mon, 02 Jan 06 15:04:05 MST",
"Mon, 02 Jan 2006 3:04 PM MST",
"Mon Jan 02 2006 15:04:05 MST",
"Mon, 01 02 2006 15:04:05 -0700",
"Mon, 2th Jan 2006 15:05:05 MST",
"Jan. 2, 2006, 3:04 a.m.",
"fri, 02 jan 2006 15:04:05 -0700",
"January 02 2006 03:04:05 PM",
"January 2, 2006 3:04 PM",
"January 2, 2006, 3:04 p.m.",
"January 2, 2006 15:04:05 MST",
"January 2, 2006 15:04:05",
"January 2, 2006 03:04 PM",
"January 2, 2006",
"January 02, 2006 15:04:05 MST",
"January 02, 2006 15:04",
"January 02, 2006 03:04 PM",
"January 02, 2006",
"Jan 2, 2006 3:04:05 PM MST",
"Jan 2, 2006 3:04:05 PM",
"Jan 2, 2006 15:04:05 MST",
"Jan 2, 2006",
"Jan 02 2006 03:04:05PM",
"Jan 02, 2006",
"6/1/2 15:04",
"6-1-2 15:04",
"2 January 2006 15:04:05 MST",
"2 January 2006 15:04:05 -0700",
"2 January 2006",
"2 Jan 2006 15:04:05 Z",
"2 Jan 2006 15:04:05 MST",
"2 Jan 2006 15:04:05 -0700",
"2 Jan 2006",
"2 Jan 2006 15:04 MST",
"2.1.2006 15:04:05",
"2/1/2006",
"2-1-2006",
"2006 January 02",
"2006-1-2T15:04:05Z",
"2006-1-2 15:04:05",
"2006-1-2",
"2006-01-02T15:04:05-07:00Z",
"2006-1-02T15:04:05Z",
"2006-01-02T15:04Z",
"2006-01-02T15:04-07:00",
"2006-01-02T15:04:05Z",
"2006-01-02T15:04:05-07:00:00",
"2006-01-02T15:04:05:-0700",
"2006-01-02T15:04:05-0700",
"2006-01-02T15:04:05-07:00",
"2006-01-02T15:04:05 -0700",
"2006-01-02T15:04:05:00",
"2006-01-02T15:04:05",
"2006-01-02T15:04",
"2006-01-02 at 15:04:05",
"2006-01-02 15:04:05Z",
"2006-01-02 15:04:05 MST",
"2006-01-02 15:04:05-0700",
"2006-01-02 15:04:05-07:00",
"2006-01-02 15:04:05 -0700",
"2006-01-02 15:04",
"2006-01-02 00:00:00.0 15:04:05.0 -0700",
"2006/01/02",
"2006-01-02",
"15:04 02.01.2006 -0700",
"1/2/2006 3:04 PM MST",
"1/2/2006 3:04:05 PM MST",
"1/2/2006 3:04:05 PM",
"1/2/2006 15:04:05 MST",
"1/2/2006",
"06/1/2 15:04",
"06-1-2 15:04",
"02 Monday, Jan 2006 15:04",
"02 Jan 2006 15:04 MST",
"02 Jan 2006 15:04:05 UT",
"02 Jan 2006 15:04:05 MST",
"02 Jan 2006 15:04:05 -0700",
"02 Jan 2006 15:04:05",
"02 Jan 2006",
"02/01/2006 15:04 MST",
"02-01-2006 15:04:05 MST",
"02.01.2006 15:04:05",
"02/01/2006 15:04:05",
"02.01.2006 15:04",
"02/01/2006 - 15:04",
"02.01.2006 -0700",
"02/01/2006",
"02-01-2006",
"01/02/2006 3:04 PM",
"01/02/2006 15:04:05 MST",
"01/02/2006 - 15:04",
"01/02/2006",
"01-02-2006",
"Jan. 2006",
"Jan. 2, 2006, 03:04 p.m.",
"2006-01-02 15:04:05 -07:00",
"2 January, 2006",
"2 Jan 2006 MST",
"Mon, January 2, 2006 at 03:04 PM MST",
"Jan 2, 2006 15:04 MST",
"01/02/2006 3:04 pm MST",
"Mon, 2th Jan 2006 15:04:05 MST",
"Mon, 2rd Jan 2006 15:04:05 MST",
"Mon, 2nd Jan 2006 15:04:05 MST",
"Mon, 2st Jan 2006 15:04:05 MST",
"Mon, Jan 02 2006 03:04:05 PM",
"Monday, January 2, 2006 - 15:04",
"01/02/06 15:04:05",
"02.01.06",
}
// invalidTimezoneReplacer rewrites timezone spellings that the date layouts
// cannot match into forms they can: two IANA zone names become abbreviations,
// the verbose "GMT+0000 (Coordinated Universal Time)" suffix collapses to
// "GMT", and a numeric offset glued to "GMT" is separated by a space.
var invalidTimezoneReplacer = func() *strings.Replacer {
	// Each entry is {text to find, replacement}; order is preserved.
	rewrites := [][2]string{
		{"Europe/Brussels", "CET"},
		{"America/Los_Angeles", "PDT"},
		{"GMT+0000 (Coordinated Universal Time)", "GMT"},
		{"GMT-", "GMT -"},
	}

	oldnew := make([]string, 0, 2*len(rewrites))
	for _, pair := range rewrites {
		oldnew = append(oldnew, pair[0], pair[1])
	}
	return strings.NewReplacer(oldnew...)
}()
// invalidLocalizedDateReplacer rewrites German and French day and month names
// (plus a few non-standard English abbreviations) into the English names that
// Go's time layouts understand.
//
// Keys are case-sensitive. A key that ends with a space consumes that space,
// so its replacement must end with a space too; otherwise the month name is
// fused with the token that follows ("2 juillet 2006" -> "2 July2006") and no
// layout can match. Keys that end with "." or "," leave the following
// separator in the input untouched.
var invalidLocalizedDateReplacer = strings.NewReplacer(
	// German weekday abbreviations.
	"Mo,", "Mon,",
	"Di,", "Tue,",
	"Mi,", "Wed,",
	"Do,", "Thu,",
	"Fr,", "Fri,",
	"Sa,", "Sat,",
	"So,", "Sun,",

	// German month abbreviations that differ from English.
	"Mär ", "Mar ",
	"Mai ", "May ",
	"Okt ", "Oct ",
	"Dez ", "Dec ",

	// French weekday abbreviations followed by a comma.
	"lun,", "Mon,",
	"mar,", "Tue,",
	"mer,", "Wed,",
	"jeu,", "Thu,",
	"ven,", "Fri,",
	"sam,", "Sat,",
	"dim,", "Sun,",

	// French weekday abbreviations followed by a dot.
	"lun.", "Mon",
	"mar.", "Tue",
	"mer.", "Wed",
	"jeu.", "Thu",
	"ven.", "Fri",
	"sam.", "Sat",
	"dim.", "Sun",

	// French full weekday names.
	"Lundi,", "Monday,",
	"Mardi,", "Tuesday,",
	"Mercredi,", "Wednesday,",
	"Jeudi,", "Thursday,",
	"Vendredi,", "Friday,",
	"Samedi,", "Saturday,",
	"Dimanche,", "Sunday,",

	// French month abbreviations followed by a dot.
	"jan.", "January ",
	"feb.", "February ",
	"mars.", "March ",
	"avril.", "April ",
	"mai.", "May ",
	"juin.", "June ",
	"juil.", "July",
	"août.", "August",
	"sept.", "September",
	"oct.", "October",
	"nov.", "November",
	"dec.", "December",
	"déc.", "December",

	// Lowercase French month names followed by a space. The trailing space
	// is part of the key, so it is restored in every replacement.
	"janvier ", "January ",
	"février ", "February ",
	"mars ", "March ",
	"avril ", "April ",
	"mai ", "May ",
	"juin ", "June ",
	"juillet ", "July ",
	"août ", "August ",
	"septembre ", "September ",
	"octobre ", "October ",
	"novembre ", "November ",
	"november ", "November ", // kept for backward compatibility
	"décembre ", "December ",

	// Capitalized French month names.
	"Janvier", "January",
	"Février", "February",
	"Mars", "March",
	"Avril", "April",
	"Mai", "May",
	"Juin", "June",
	"Juillet", "July",
	"Août", "August",
	"Septembre", "September",
	"Octobre", "October",
	"Novembre", "November",
	"Décembre", "December",

	// Short French month abbreviations. "mai " and "juin " are already
	// covered by the lowercase month names above.
	"avr ", "Apr ",
	"jui ", "Jun ",

	// Non-standard English abbreviations for Thursday.
	"Thurs,", "Thu,",
	"Thur,", "Thu,",
)
// Parse parses a given date string using a large
// list of commonly found feed date formats.
func Parse(rawInput string) (t time.Time, err error) {
rawInput = strings.TrimSpace(rawInput)
if rawInput == "" {
return t, errors.New(`date parser: empty value`)
}
if timestamp, err := strconv.ParseInt(rawInput, 10, 64); err == nil {
return time.Unix(timestamp, 0), nil
}
processedInput := invalidLocalizedDateReplacer.Replace(rawInput)
processedInput = invalidTimezoneReplacer.Replace(processedInput)
for _, layout := range dateFormatsLocalTimesOnly {
if t, err = parseLocalTimeDates(layout, processedInput); err == nil {
return checkTimezoneRange(t), nil
}
}
for _, layout := range dateFormats {
if t, err = time.Parse(layout, processedInput); err == nil {
return checkTimezoneRange(t), nil
}
}
return t, fmt.Errorf(`date parser: failed to parse date "%s"`, rawInput)
}
// parseLocalTimeDates parses ds with layout, using a reference location
// chosen from the timezone abbreviation at the end of ds.
//
// According to the Go documentation:
//
//	RFC822, RFC850, and RFC1123 formats should be applied only to local times.
//	Applying them to UTC times will use "UTC" as the time zone abbreviation,
//	while strictly speaking those RFCs require the use of "GMT" in that case.
//
// The reference location is UTC unless ds ends with PST/PDT or EST/EDT, in
// which case America/Los_Angeles or America/New_York is used so that the
// abbreviation resolves to its real offset. If that location cannot be loaded
// (for example when no timezone database is available), UTC is kept instead
// of handing a nil location to the time package.
//
// It returns the parsed time, or the error from time.ParseInLocation.
func parseLocalTimeDates(layout, ds string) (t time.Time, err error) {
	loc := time.UTC

	// Workaround for dates that don't use GMT.
	var zoneName string
	switch {
	case strings.HasSuffix(ds, "PST"), strings.HasSuffix(ds, "PDT"):
		zoneName = "America/Los_Angeles"
	case strings.HasSuffix(ds, "EST"), strings.HasSuffix(ds, "EDT"):
		zoneName = "America/New_York"
	}

	if zoneName != "" {
		// Best effort: a load failure must not leave loc nil, because
		// time.Date panics on a nil location.
		if loaded, loadErr := time.LoadLocation(zoneName); loadErr == nil {
			loc = loaded
		}
	}

	return time.ParseInLocation(layout, ds, loc)
}
// checkTimezoneRange returns t converted to UTC when its zone offset lies
// outside the range of real-world UTC offsets, and t unchanged otherwise.
//
// Valid offsets run from -12:00 (westernmost) to +14:00 (easternmost), see
// https://en.wikipedia.org/wiki/List_of_UTC_offsets. Normalizing out-of-range
// values avoids "pq: time zone displacement out of range" errors.
func checkTimezoneRange(t time.Time) time.Time {
	const (
		westernmost = -12 * 60 * 60 // seconds east of UTC
		easternmost = 14 * 60 * 60  // seconds east of UTC
	)

	if _, seconds := t.Zone(); seconds < westernmost || seconds > easternmost {
		return t.UTC()
	}
	return t
}