parser.py 33 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937
"""Provides the :class:`DateTimeParser <arrow.parser.DateTimeParser>` class, a better way to parse datetime strings."""
  2. import re
  3. from datetime import datetime, timedelta, timezone
  4. from datetime import tzinfo as dt_tzinfo
  5. from functools import lru_cache
  6. from typing import (
  7. Any,
  8. ClassVar,
  9. Dict,
  10. Iterable,
  11. List,
  12. Literal,
  13. Match,
  14. Optional,
  15. Pattern,
  16. SupportsFloat,
  17. SupportsInt,
  18. Tuple,
  19. TypedDict,
  20. Union,
  21. cast,
  22. overload,
  23. )
  24. try:
  25. from zoneinfo import ZoneInfo, ZoneInfoNotFoundError
  26. except ImportError:
  27. from backports.zoneinfo import ZoneInfo, ZoneInfoNotFoundError # type: ignore[no-redef]
  28. from arrow import locales
  29. from arrow.constants import DEFAULT_LOCALE
  30. from arrow.util import next_weekday, normalize_timestamp
  31. class ParserError(ValueError):
  32. """
  33. A custom exception class for handling parsing errors in the parser.
  34. Notes:
  35. This class inherits from the built-in `ValueError` class and is used to raise exceptions
  36. when an error occurs during the parsing process.
  37. """
  38. pass
  39. # Allows for ParserErrors to be propagated from _build_datetime()
  40. # when day_of_year errors occur.
  41. # Before this, the ParserErrors were caught by the try/except in
  42. # _parse_multiformat() and the appropriate error message was not
  43. # transmitted to the user.
  44. class ParserMatchError(ParserError):
  45. """
  46. This class is a subclass of the ParserError class and is used to raise errors that occur during the matching process.
  47. Notes:
  48. This class is part of the Arrow parser and is used to provide error handling when a parsing match fails.
  49. """
  50. pass
  51. _WEEKDATE_ELEMENT = Union[str, bytes, SupportsInt, bytearray]
  52. _FORMAT_TYPE = Literal[
  53. "YYYY",
  54. "YY",
  55. "MM",
  56. "M",
  57. "DDDD",
  58. "DDD",
  59. "DD",
  60. "D",
  61. "HH",
  62. "H",
  63. "hh",
  64. "h",
  65. "mm",
  66. "m",
  67. "ss",
  68. "s",
  69. "X",
  70. "x",
  71. "ZZZ",
  72. "ZZ",
  73. "Z",
  74. "S",
  75. "W",
  76. "MMMM",
  77. "MMM",
  78. "Do",
  79. "dddd",
  80. "ddd",
  81. "d",
  82. "a",
  83. "A",
  84. ]
  85. class _Parts(TypedDict, total=False):
  86. """
  87. A dictionary that represents different parts of a datetime.
  88. :class:`_Parts` is a TypedDict that represents various components of a date or time,
  89. such as year, month, day, hour, minute, second, microsecond, timestamp, expanded_timestamp, tzinfo,
  90. am_pm, day_of_week, and weekdate.
  91. :ivar year: The year, if present, as an integer.
  92. :ivar month: The month, if present, as an integer.
  93. :ivar day_of_year: The day of the year, if present, as an integer.
  94. :ivar day: The day, if present, as an integer.
  95. :ivar hour: The hour, if present, as an integer.
  96. :ivar minute: The minute, if present, as an integer.
  97. :ivar second: The second, if present, as an integer.
  98. :ivar microsecond: The microsecond, if present, as an integer.
  99. :ivar timestamp: The timestamp, if present, as a float.
  100. :ivar expanded_timestamp: The expanded timestamp, if present, as an integer.
  101. :ivar tzinfo: The timezone info, if present, as a :class:`dt_tzinfo` object.
  102. :ivar am_pm: The AM/PM indicator, if present, as a string literal "am" or "pm".
  103. :ivar day_of_week: The day of the week, if present, as an integer.
  104. :ivar weekdate: The week date, if present, as a tuple of three integers or None.
  105. """
  106. year: int
  107. month: int
  108. day_of_year: int
  109. day: int
  110. hour: int
  111. minute: int
  112. second: int
  113. microsecond: int
  114. timestamp: float
  115. expanded_timestamp: int
  116. tzinfo: dt_tzinfo
  117. am_pm: Literal["am", "pm"]
  118. day_of_week: int
  119. weekdate: Tuple[_WEEKDATE_ELEMENT, _WEEKDATE_ELEMENT, Optional[_WEEKDATE_ELEMENT]]
  120. class DateTimeParser:
  121. """A :class:`DateTimeParser <arrow.arrow.parser>` object
  122. Contains the regular expressions and functions to parse and split the input strings into tokens and eventually
  123. produce a datetime that is used by :class:`Arrow <arrow.arrow.Arrow>` internally.
  124. :param locale: the locale string
  125. :param cache_size: the size of the LRU cache used for regular expressions. Defaults to 0.
  126. """
  127. _FORMAT_RE: ClassVar[Pattern[str]] = re.compile(
  128. r"(YYY?Y?|MM?M?M?|Do|DD?D?D?|d?d?d?d|HH?|hh?|mm?|ss?|S+|ZZ?Z?|a|A|x|X|W)"
  129. )
  130. _ESCAPE_RE: ClassVar[Pattern[str]] = re.compile(r"\[[^\[\]]*\]")
  131. _ONE_OR_TWO_DIGIT_RE: ClassVar[Pattern[str]] = re.compile(r"\d{1,2}")
  132. _ONE_OR_TWO_OR_THREE_DIGIT_RE: ClassVar[Pattern[str]] = re.compile(r"\d{1,3}")
  133. _ONE_OR_MORE_DIGIT_RE: ClassVar[Pattern[str]] = re.compile(r"\d+")
  134. _TWO_DIGIT_RE: ClassVar[Pattern[str]] = re.compile(r"\d{2}")
  135. _THREE_DIGIT_RE: ClassVar[Pattern[str]] = re.compile(r"\d{3}")
  136. _FOUR_DIGIT_RE: ClassVar[Pattern[str]] = re.compile(r"\d{4}")
  137. _TZ_Z_RE: ClassVar[Pattern[str]] = re.compile(r"([\+\-])(\d{2})(?:(\d{2}))?|Z")
  138. _TZ_ZZ_RE: ClassVar[Pattern[str]] = re.compile(r"([\+\-])(\d{2})(?:\:(\d{2}))?|Z")
  139. _TZ_NAME_RE: ClassVar[Pattern[str]] = re.compile(r"\w[\w+\-/]+")
  140. # NOTE: timestamps cannot be parsed from natural language strings (by removing the ^...$) because it will
  141. # break cases like "15 Jul 2000" and a format list (see issue #447)
  142. _TIMESTAMP_RE: ClassVar[Pattern[str]] = re.compile(r"^\-?\d+\.?\d+$")
  143. _TIMESTAMP_EXPANDED_RE: ClassVar[Pattern[str]] = re.compile(r"^\-?\d+$")
  144. _TIME_RE: ClassVar[Pattern[str]] = re.compile(
  145. r"^(\d{2})(?:\:?(\d{2}))?(?:\:?(\d{2}))?(?:([\.\,])(\d+))?$"
  146. )
  147. _WEEK_DATE_RE: ClassVar[Pattern[str]] = re.compile(
  148. r"(?P<year>\d{4})[\-]?W(?P<week>\d{2})[\-]?(?P<day>\d)?"
  149. )
  150. _BASE_INPUT_RE_MAP: ClassVar[Dict[_FORMAT_TYPE, Pattern[str]]] = {
  151. "YYYY": _FOUR_DIGIT_RE,
  152. "YY": _TWO_DIGIT_RE,
  153. "MM": _TWO_DIGIT_RE,
  154. "M": _ONE_OR_TWO_DIGIT_RE,
  155. "DDDD": _THREE_DIGIT_RE,
  156. "DDD": _ONE_OR_TWO_OR_THREE_DIGIT_RE,
  157. "DD": _TWO_DIGIT_RE,
  158. "D": _ONE_OR_TWO_DIGIT_RE,
  159. "HH": _TWO_DIGIT_RE,
  160. "H": _ONE_OR_TWO_DIGIT_RE,
  161. "hh": _TWO_DIGIT_RE,
  162. "h": _ONE_OR_TWO_DIGIT_RE,
  163. "mm": _TWO_DIGIT_RE,
  164. "m": _ONE_OR_TWO_DIGIT_RE,
  165. "ss": _TWO_DIGIT_RE,
  166. "s": _ONE_OR_TWO_DIGIT_RE,
  167. "X": _TIMESTAMP_RE,
  168. "x": _TIMESTAMP_EXPANDED_RE,
  169. "ZZZ": _TZ_NAME_RE,
  170. "ZZ": _TZ_ZZ_RE,
  171. "Z": _TZ_Z_RE,
  172. "S": _ONE_OR_MORE_DIGIT_RE,
  173. "W": _WEEK_DATE_RE,
  174. }
  175. SEPARATORS: ClassVar[List[str]] = ["-", "/", "."]
  176. locale: locales.Locale
  177. _input_re_map: Dict[_FORMAT_TYPE, Pattern[str]]
  178. def __init__(self, locale: str = DEFAULT_LOCALE, cache_size: int = 0) -> None:
  179. """
  180. Contains the regular expressions and functions to parse and split the input strings into tokens and eventually
  181. produce a datetime that is used by :class:`Arrow <arrow.arrow.Arrow>` internally.
  182. :param locale: the locale string
  183. :type locale: str
  184. :param cache_size: the size of the LRU cache used for regular expressions. Defaults to 0.
  185. :type cache_size: int
  186. """
  187. self.locale = locales.get_locale(locale)
  188. self._input_re_map = self._BASE_INPUT_RE_MAP.copy()
  189. self._input_re_map.update(
  190. {
  191. "MMMM": self._generate_choice_re(
  192. self.locale.month_names[1:], re.IGNORECASE
  193. ),
  194. "MMM": self._generate_choice_re(
  195. self.locale.month_abbreviations[1:], re.IGNORECASE
  196. ),
  197. "Do": re.compile(self.locale.ordinal_day_re),
  198. "dddd": self._generate_choice_re(
  199. self.locale.day_names[1:], re.IGNORECASE
  200. ),
  201. "ddd": self._generate_choice_re(
  202. self.locale.day_abbreviations[1:], re.IGNORECASE
  203. ),
  204. "d": re.compile(r"[1-7]"),
  205. "a": self._generate_choice_re(
  206. (self.locale.meridians["am"], self.locale.meridians["pm"])
  207. ),
  208. # note: 'A' token accepts both 'am/pm' and 'AM/PM' formats to
  209. # ensure backwards compatibility of this token
  210. "A": self._generate_choice_re(self.locale.meridians.values()),
  211. }
  212. )
  213. if cache_size > 0:
  214. self._generate_pattern_re = lru_cache(maxsize=cache_size)( # type: ignore
  215. self._generate_pattern_re
  216. )
  217. # TODO: since we support more than ISO 8601, we should rename this function
  218. # IDEA: break into multiple functions
  219. def parse_iso(
  220. self, datetime_string: str, normalize_whitespace: bool = False
  221. ) -> datetime:
  222. """
  223. Parses a datetime string using a ISO 8601-like format.
  224. :param datetime_string: The datetime string to parse.
  225. :param normalize_whitespace: Whether to normalize whitespace in the datetime string (default is False).
  226. :type datetime_string: str
  227. :type normalize_whitespace: bool
  228. :returns: The parsed datetime object.
  229. :rtype: datetime
  230. :raises ParserError: If the datetime string is not in a valid ISO 8601-like format.
  231. Usage::
  232. >>> import arrow.parser
  233. >>> arrow.parser.DateTimeParser().parse_iso('2021-10-12T14:30:00')
  234. datetime.datetime(2021, 10, 12, 14, 30)
  235. """
  236. if normalize_whitespace:
  237. datetime_string = re.sub(r"\s+", " ", datetime_string.strip())
  238. has_space_divider = " " in datetime_string
  239. has_t_divider = "T" in datetime_string
  240. num_spaces = datetime_string.count(" ")
  241. if has_space_divider and num_spaces != 1 or has_t_divider and num_spaces > 0:
  242. raise ParserError(
  243. f"Expected an ISO 8601-like string, but was given {datetime_string!r}. "
  244. "Try passing in a format string to resolve this."
  245. )
  246. has_time = has_space_divider or has_t_divider
  247. has_tz = False
  248. # date formats (ISO 8601 and others) to test against
  249. # NOTE: YYYYMM is omitted to avoid confusion with YYMMDD (no longer part of ISO 8601, but is still often used)
  250. formats = [
  251. "YYYY-MM-DD",
  252. "YYYY-M-DD",
  253. "YYYY-M-D",
  254. "YYYY/MM/DD",
  255. "YYYY/M/DD",
  256. "YYYY/M/D",
  257. "YYYY.MM.DD",
  258. "YYYY.M.DD",
  259. "YYYY.M.D",
  260. "YYYYMMDD",
  261. "YYYY-DDDD",
  262. "YYYYDDDD",
  263. "YYYY-MM",
  264. "YYYY/MM",
  265. "YYYY.MM",
  266. "YYYY",
  267. "W",
  268. ]
  269. if has_time:
  270. if has_space_divider:
  271. date_string, time_string = datetime_string.split(" ", 1)
  272. else:
  273. date_string, time_string = datetime_string.split("T", 1)
  274. time_parts = re.split(
  275. r"[\+\-Z]", time_string, maxsplit=1, flags=re.IGNORECASE
  276. )
  277. time_components: Optional[Match[str]] = self._TIME_RE.match(time_parts[0])
  278. if time_components is None:
  279. raise ParserError(
  280. "Invalid time component provided. "
  281. "Please specify a format or provide a valid time component in the basic or extended ISO 8601 time format."
  282. )
  283. (
  284. hours,
  285. minutes,
  286. seconds,
  287. subseconds_sep,
  288. subseconds,
  289. ) = time_components.groups()
  290. has_tz = len(time_parts) == 2
  291. has_minutes = minutes is not None
  292. has_seconds = seconds is not None
  293. has_subseconds = subseconds is not None
  294. is_basic_time_format = ":" not in time_parts[0]
  295. tz_format = "Z"
  296. # use 'ZZ' token instead since tz offset is present in non-basic format
  297. if has_tz and ":" in time_parts[1]:
  298. tz_format = "ZZ"
  299. time_sep = "" if is_basic_time_format else ":"
  300. if has_subseconds:
  301. time_string = "HH{time_sep}mm{time_sep}ss{subseconds_sep}S".format(
  302. time_sep=time_sep, subseconds_sep=subseconds_sep
  303. )
  304. elif has_seconds:
  305. time_string = "HH{time_sep}mm{time_sep}ss".format(time_sep=time_sep)
  306. elif has_minutes:
  307. time_string = f"HH{time_sep}mm"
  308. else:
  309. time_string = "HH"
  310. if has_space_divider:
  311. formats = [f"{f} {time_string}" for f in formats]
  312. else:
  313. formats = [f"{f}T{time_string}" for f in formats]
  314. if has_time and has_tz:
  315. # Add "Z" or "ZZ" to the format strings to indicate to
  316. # _parse_token() that a timezone needs to be parsed
  317. formats = [f"{f}{tz_format}" for f in formats]
  318. return self._parse_multiformat(datetime_string, formats)
  319. def parse(
  320. self,
  321. datetime_string: str,
  322. fmt: Union[List[str], str],
  323. normalize_whitespace: bool = False,
  324. ) -> datetime:
  325. """
  326. Parses a datetime string using a specified format.
  327. :param datetime_string: The datetime string to parse.
  328. :param fmt: The format string or list of format strings to use for parsing.
  329. :param normalize_whitespace: Whether to normalize whitespace in the datetime string (default is False).
  330. :type datetime_string: str
  331. :type fmt: Union[List[str], str]
  332. :type normalize_whitespace: bool
  333. :returns: The parsed datetime object.
  334. :rtype: datetime
  335. :raises ParserMatchError: If the datetime string does not match the specified format.
  336. Usage::
  337. >>> import arrow.parser
  338. >>> arrow.parser.DateTimeParser().parse('2021-10-12 14:30:00', 'YYYY-MM-DD HH:mm:ss')
  339. datetime.datetime(2021, 10, 12, 14, 30)
  340. """
  341. if normalize_whitespace:
  342. datetime_string = re.sub(r"\s+", " ", datetime_string)
  343. if isinstance(fmt, list):
  344. return self._parse_multiformat(datetime_string, fmt)
  345. try:
  346. fmt_tokens: List[_FORMAT_TYPE]
  347. fmt_pattern_re: Pattern[str]
  348. fmt_tokens, fmt_pattern_re = self._generate_pattern_re(fmt)
  349. except re.error as e:
  350. raise ParserMatchError(
  351. f"Failed to generate regular expression pattern: {e}."
  352. )
  353. match = fmt_pattern_re.search(datetime_string)
  354. if match is None:
  355. raise ParserMatchError(
  356. f"Failed to match {fmt!r} when parsing {datetime_string!r}."
  357. )
  358. parts: _Parts = {}
  359. for token in fmt_tokens:
  360. value: Union[Tuple[str, str, str], str]
  361. if token == "Do":
  362. value = match.group("value")
  363. elif token == "W":
  364. value = (match.group("year"), match.group("week"), match.group("day"))
  365. else:
  366. value = match.group(token)
  367. if value is None:
  368. raise ParserMatchError(
  369. f"Unable to find a match group for the specified token {token!r}."
  370. )
  371. self._parse_token(token, value, parts) # type: ignore[arg-type]
  372. return self._build_datetime(parts)
  373. def _generate_pattern_re(self, fmt: str) -> Tuple[List[_FORMAT_TYPE], Pattern[str]]:
  374. """
  375. Generates a regular expression pattern from a format string.
  376. :param fmt: The format string to convert into a regular expression pattern.
  377. :type fmt: str
  378. :returns: A tuple containing a list of format tokens and the corresponding regular expression pattern.
  379. :rtype: Tuple[List[_FORMAT_TYPE], Pattern[str]]
  380. :raises ParserError: If an unrecognized token is encountered in the format string.
  381. """
  382. # fmt is a string of tokens like 'YYYY-MM-DD'
  383. # we construct a new string by replacing each
  384. # token by its pattern:
  385. # 'YYYY-MM-DD' -> '(?P<YYYY>\d{4})-(?P<MM>\d{2})-(?P<DD>\d{2})'
  386. tokens: List[_FORMAT_TYPE] = []
  387. offset = 0
  388. # Escape all special RegEx chars
  389. escaped_fmt = re.escape(fmt)
  390. # Extract the bracketed expressions to be reinserted later.
  391. escaped_fmt = re.sub(self._ESCAPE_RE, "#", escaped_fmt)
  392. # Any number of S is the same as one.
  393. # TODO: allow users to specify the number of digits to parse
  394. escaped_fmt = re.sub(r"S+", "S", escaped_fmt)
  395. escaped_data = re.findall(self._ESCAPE_RE, fmt)
  396. fmt_pattern = escaped_fmt
  397. for m in self._FORMAT_RE.finditer(escaped_fmt):
  398. token: _FORMAT_TYPE = cast(_FORMAT_TYPE, m.group(0))
  399. try:
  400. input_re = self._input_re_map[token]
  401. except KeyError:
  402. raise ParserError(f"Unrecognized token {token!r}.")
  403. input_pattern = f"(?P<{token}>{input_re.pattern})"
  404. tokens.append(token)
  405. # a pattern doesn't have the same length as the token
  406. # it replaces! We keep the difference in the offset variable.
  407. # This works because the string is scanned left-to-right and matches
  408. # are returned in the order found by finditer.
  409. fmt_pattern = (
  410. fmt_pattern[: m.start() + offset]
  411. + input_pattern
  412. + fmt_pattern[m.end() + offset :]
  413. )
  414. offset += len(input_pattern) - (m.end() - m.start())
  415. final_fmt_pattern = ""
  416. split_fmt = fmt_pattern.split(r"\#")
  417. # Due to the way Python splits, 'split_fmt' will always be longer
  418. for i in range(len(split_fmt)):
  419. final_fmt_pattern += split_fmt[i]
  420. if i < len(escaped_data):
  421. final_fmt_pattern += escaped_data[i][1:-1]
  422. # Wrap final_fmt_pattern in a custom word boundary to strictly
  423. # match the formatting pattern and filter out date and time formats
  424. # that include junk such as: blah1998-09-12 blah, blah 1998-09-12blah,
  425. # blah1998-09-12blah. The custom word boundary matches every character
  426. # that is not a whitespace character to allow for searching for a date
  427. # and time string in a natural language sentence. Therefore, searching
  428. # for a string of the form YYYY-MM-DD in "blah 1998-09-12 blah" will
  429. # work properly.
  430. # Certain punctuation before or after the target pattern such as
  431. # "1998-09-12," is permitted. For the full list of valid punctuation,
  432. # see the documentation.
  433. starting_word_boundary = (
  434. r"(?<!\S\S)" # Don't have two consecutive non-whitespace characters. This ensures that we allow cases
  435. # like .11.25.2019 but not 1.11.25.2019 (for pattern MM.DD.YYYY)
  436. r"(?<![^\,\.\;\:\?\!\"\'\`\[\]\{\}\(\)<>\s])" # This is the list of punctuation that is ok before the
  437. # pattern (i.e. "It can't not be these characters before the pattern")
  438. r"(\b|^)"
  439. # The \b is to block cases like 1201912 but allow 201912 for pattern YYYYMM. The ^ was necessary to allow a
  440. # negative number through i.e. before epoch numbers
  441. )
  442. ending_word_boundary = (
  443. r"(?=[\,\.\;\:\?\!\"\'\`\[\]\{\}\(\)\<\>]?" # Positive lookahead stating that these punctuation marks
  444. # can appear after the pattern at most 1 time
  445. r"(?!\S))" # Don't allow any non-whitespace character after the punctuation
  446. )
  447. bounded_fmt_pattern = r"{}{}{}".format(
  448. starting_word_boundary, final_fmt_pattern, ending_word_boundary
  449. )
  450. return tokens, re.compile(bounded_fmt_pattern, flags=re.IGNORECASE)
  451. @overload
  452. def _parse_token(
  453. self,
  454. token: Literal[
  455. "YYYY",
  456. "YY",
  457. "MM",
  458. "M",
  459. "DDDD",
  460. "DDD",
  461. "DD",
  462. "D",
  463. "Do",
  464. "HH",
  465. "hh",
  466. "h",
  467. "H",
  468. "mm",
  469. "m",
  470. "ss",
  471. "s",
  472. "x",
  473. ],
  474. value: Union[str, bytes, SupportsInt, bytearray],
  475. parts: _Parts,
  476. ) -> None: ... # pragma: no cover
  477. @overload
  478. def _parse_token(
  479. self,
  480. token: Literal["X"],
  481. value: Union[str, bytes, SupportsFloat, bytearray],
  482. parts: _Parts,
  483. ) -> None: ... # pragma: no cover
  484. @overload
  485. def _parse_token(
  486. self,
  487. token: Literal["MMMM", "MMM", "dddd", "ddd", "S"],
  488. value: Union[str, bytes, bytearray],
  489. parts: _Parts,
  490. ) -> None: ... # pragma: no cover
  491. @overload
  492. def _parse_token(
  493. self,
  494. token: Literal["a", "A", "ZZZ", "ZZ", "Z"],
  495. value: Union[str, bytes],
  496. parts: _Parts,
  497. ) -> None: ... # pragma: no cover
  498. @overload
  499. def _parse_token(
  500. self,
  501. token: Literal["W"],
  502. value: Tuple[_WEEKDATE_ELEMENT, _WEEKDATE_ELEMENT, Optional[_WEEKDATE_ELEMENT]],
  503. parts: _Parts,
  504. ) -> None: ... # pragma: no cover
  505. def _parse_token(
  506. self,
  507. token: Any,
  508. value: Any,
  509. parts: _Parts,
  510. ) -> None:
  511. """
  512. Parse a token and its value, and update the `_Parts` dictionary with the parsed values.
  513. The function supports several tokens, including "YYYY", "YY", "MMMM", "MMM", "MM", "M", "DDDD", "DDD", "DD", "D", "Do", "dddd", "ddd", "HH", "H", "mm", "m", "ss", "s", "S", "X", "x", "ZZZ", "ZZ", "Z", "a", "A", and "W". Each token is matched and the corresponding value is parsed and added to the `_Parts` dictionary.
  514. :param token: The token to parse.
  515. :type token: Any
  516. :param value: The value of the token.
  517. :type value: Any
  518. :param parts: A dictionary to update with the parsed values.
  519. :type parts: _Parts
  520. :raises ParserMatchError: If the hour token value is not between 0 and 12 inclusive for tokens "a" or "A".
  521. """
  522. if token == "YYYY":
  523. parts["year"] = int(value)
  524. elif token == "YY":
  525. value = int(value)
  526. parts["year"] = 1900 + value if value > 68 else 2000 + value
  527. elif token in ["MMMM", "MMM"]:
  528. # FIXME: month_number() is nullable
  529. parts["month"] = self.locale.month_number(value.lower()) # type: ignore[typeddict-item]
  530. elif token in ["MM", "M"]:
  531. parts["month"] = int(value)
  532. elif token in ["DDDD", "DDD"]:
  533. parts["day_of_year"] = int(value)
  534. elif token in ["DD", "D"]:
  535. parts["day"] = int(value)
  536. elif token == "Do":
  537. parts["day"] = int(value)
  538. elif token == "dddd":
  539. # locale day names are 1-indexed
  540. day_of_week = [x.lower() for x in self.locale.day_names].index(
  541. value.lower()
  542. )
  543. parts["day_of_week"] = day_of_week - 1
  544. elif token == "ddd":
  545. # locale day abbreviations are 1-indexed
  546. day_of_week = [x.lower() for x in self.locale.day_abbreviations].index(
  547. value.lower()
  548. )
  549. parts["day_of_week"] = day_of_week - 1
  550. elif token.upper() in ["HH", "H"]:
  551. parts["hour"] = int(value)
  552. elif token in ["mm", "m"]:
  553. parts["minute"] = int(value)
  554. elif token in ["ss", "s"]:
  555. parts["second"] = int(value)
  556. elif token == "S":
  557. # We have the *most significant* digits of an arbitrary-precision integer.
  558. # We want the six most significant digits as an integer, rounded.
  559. # IDEA: add nanosecond support somehow? Need datetime support for it first.
  560. value = value.ljust(7, "0")
  561. # floating-point (IEEE-754) defaults to half-to-even rounding
  562. seventh_digit = int(value[6])
  563. if seventh_digit == 5:
  564. rounding = int(value[5]) % 2
  565. elif seventh_digit > 5:
  566. rounding = 1
  567. else:
  568. rounding = 0
  569. parts["microsecond"] = int(value[:6]) + rounding
  570. elif token == "X":
  571. parts["timestamp"] = float(value)
  572. elif token == "x":
  573. parts["expanded_timestamp"] = int(value)
  574. elif token in ["ZZZ", "ZZ", "Z"]:
  575. parts["tzinfo"] = TzinfoParser.parse(value)
  576. elif token in ["a", "A"]:
  577. if value in (self.locale.meridians["am"], self.locale.meridians["AM"]):
  578. parts["am_pm"] = "am"
  579. if "hour" in parts and not 0 <= parts["hour"] <= 12:
  580. raise ParserMatchError(
  581. f"Hour token value must be between 0 and 12 inclusive for token {token!r}."
  582. )
  583. elif value in (self.locale.meridians["pm"], self.locale.meridians["PM"]):
  584. parts["am_pm"] = "pm"
  585. elif token == "W":
  586. parts["weekdate"] = value
  587. @staticmethod
  588. def _build_datetime(parts: _Parts) -> datetime:
  589. """
  590. Build a datetime object from a dictionary of date parts.
  591. :param parts: A dictionary containing the date parts extracted from a date string.
  592. :type parts: dict
  593. :return: A datetime object representing the date and time.
  594. :rtype: datetime.datetime
  595. """
  596. weekdate = parts.get("weekdate")
  597. if weekdate is not None:
  598. year, week = int(weekdate[0]), int(weekdate[1])
  599. if weekdate[2] is not None:
  600. _day = int(weekdate[2])
  601. else:
  602. # day not given, default to 1
  603. _day = 1
  604. date_string = f"{year}-{week}-{_day}"
  605. # tokens for ISO 8601 weekdates
  606. dt = datetime.strptime(date_string, "%G-%V-%u")
  607. parts["year"] = dt.year
  608. parts["month"] = dt.month
  609. parts["day"] = dt.day
  610. timestamp = parts.get("timestamp")
  611. if timestamp is not None:
  612. return datetime.fromtimestamp(timestamp, tz=timezone.utc)
  613. expanded_timestamp = parts.get("expanded_timestamp")
  614. if expanded_timestamp is not None:
  615. return datetime.fromtimestamp(
  616. normalize_timestamp(expanded_timestamp),
  617. tz=timezone.utc,
  618. )
  619. day_of_year = parts.get("day_of_year")
  620. if day_of_year is not None:
  621. _year = parts.get("year")
  622. month = parts.get("month")
  623. if _year is None:
  624. raise ParserError(
  625. "Year component is required with the DDD and DDDD tokens."
  626. )
  627. if month is not None:
  628. raise ParserError(
  629. "Month component is not allowed with the DDD and DDDD tokens."
  630. )
  631. date_string = f"{_year}-{day_of_year}"
  632. try:
  633. dt = datetime.strptime(date_string, "%Y-%j")
  634. except ValueError:
  635. raise ParserError(
  636. f"The provided day of year {day_of_year!r} is invalid."
  637. )
  638. parts["year"] = dt.year
  639. parts["month"] = dt.month
  640. parts["day"] = dt.day
  641. day_of_week: Optional[int] = parts.get("day_of_week")
  642. day = parts.get("day")
  643. # If day is passed, ignore day of week
  644. if day_of_week is not None and day is None:
  645. year = parts.get("year", 1970)
  646. month = parts.get("month", 1)
  647. day = 1
  648. # dddd => first day of week after epoch
  649. # dddd YYYY => first day of week in specified year
  650. # dddd MM YYYY => first day of week in specified year and month
  651. # dddd MM => first day after epoch in specified month
  652. next_weekday_dt = next_weekday(datetime(year, month, day), day_of_week)
  653. parts["year"] = next_weekday_dt.year
  654. parts["month"] = next_weekday_dt.month
  655. parts["day"] = next_weekday_dt.day
  656. am_pm = parts.get("am_pm")
  657. hour = parts.get("hour", 0)
  658. if am_pm == "pm" and hour < 12:
  659. hour += 12
  660. elif am_pm == "am" and hour == 12:
  661. hour = 0
  662. # Support for midnight at the end of day
  663. if hour == 24:
  664. if parts.get("minute", 0) != 0:
  665. raise ParserError("Midnight at the end of day must not contain minutes")
  666. if parts.get("second", 0) != 0:
  667. raise ParserError("Midnight at the end of day must not contain seconds")
  668. if parts.get("microsecond", 0) != 0:
  669. raise ParserError(
  670. "Midnight at the end of day must not contain microseconds"
  671. )
  672. hour = 0
  673. day_increment = 1
  674. else:
  675. day_increment = 0
  676. # account for rounding up to 1000000
  677. microsecond = parts.get("microsecond", 0)
  678. if microsecond == 1000000:
  679. microsecond = 0
  680. second_increment = 1
  681. else:
  682. second_increment = 0
  683. increment = timedelta(days=day_increment, seconds=second_increment)
  684. return (
  685. datetime(
  686. year=parts.get("year", 1),
  687. month=parts.get("month", 1),
  688. day=parts.get("day", 1),
  689. hour=hour,
  690. minute=parts.get("minute", 0),
  691. second=parts.get("second", 0),
  692. microsecond=microsecond,
  693. tzinfo=parts.get("tzinfo"),
  694. )
  695. + increment
  696. )
  697. def _parse_multiformat(self, string: str, formats: Iterable[str]) -> datetime:
  698. """
  699. Parse a date and time string using multiple formats.
  700. Tries to parse the provided string with each format in the given `formats`
  701. iterable, returning the resulting `datetime` object if a match is found. If no
  702. format matches the string, a `ParserError` is raised.
  703. :param string: The date and time string to parse.
  704. :type string: str
  705. :param formats: An iterable of date and time format strings to try, in order.
  706. :type formats: Iterable[str]
  707. :returns: The parsed date and time.
  708. :rtype: datetime.datetime
  709. :raises ParserError: If no format matches the input string.
  710. """
  711. _datetime: Optional[datetime] = None
  712. for fmt in formats:
  713. try:
  714. _datetime = self.parse(string, fmt)
  715. break
  716. except ParserMatchError:
  717. pass
  718. if _datetime is None:
  719. supported_formats = ", ".join(formats)
  720. raise ParserError(
  721. f"Could not match input {string!r} to any of the following formats: {supported_formats}."
  722. )
  723. return _datetime
  724. # generates a capture group of choices separated by an OR operator
  725. @staticmethod
  726. def _generate_choice_re(
  727. choices: Iterable[str], flags: Union[int, re.RegexFlag] = 0
  728. ) -> Pattern[str]:
  729. """
  730. Generate a regular expression pattern that matches a choice from an iterable.
  731. Takes an iterable of strings (`choices`) and returns a compiled regular expression
  732. pattern that matches any of the choices. The pattern is created by joining the
  733. choices with the '|' (OR) operator, which matches any of the enclosed patterns.
  734. :param choices: An iterable of strings to match.
  735. :type choices: Iterable[str]
  736. :param flags: Optional regular expression flags. Default is 0.
  737. :type flags: Union[int, re.RegexFlag], optional
  738. :returns: A compiled regular expression pattern that matches any of the choices.
  739. :rtype: re.Pattern[str]
  740. """
  741. return re.compile(r"({})".format("|".join(choices)), flags=flags)
  742. class TzinfoParser:
  743. """
  744. Parser for timezone information.
  745. """
  746. _TZINFO_RE: ClassVar[Pattern[str]] = re.compile(
  747. r"^(?:\(UTC)*([\+\-])?(\d{2})(?:\:?(\d{2}))?"
  748. )
  749. @classmethod
  750. def parse(cls, tzinfo_string: str) -> dt_tzinfo:
  751. """
  752. Parse a timezone string and return a datetime timezone object.
  753. :param tzinfo_string: The timezone string to parse.
  754. :type tzinfo_string: str
  755. :returns: The parsed datetime timezone object.
  756. :rtype: datetime.timezone
  757. :raises ParserError: If the timezone string cannot be parsed.
  758. """
  759. tzinfo: Optional[dt_tzinfo] = None
  760. if tzinfo_string == "local":
  761. tzinfo = datetime.now().astimezone().tzinfo
  762. elif tzinfo_string in ["utc", "UTC", "Z"]:
  763. tzinfo = timezone.utc
  764. else:
  765. iso_match = cls._TZINFO_RE.match(tzinfo_string)
  766. if iso_match:
  767. sign: Optional[str]
  768. hours: str
  769. minutes: Union[str, int, None]
  770. sign, hours, minutes = iso_match.groups()
  771. seconds = int(hours) * 3600 + int(minutes or 0) * 60
  772. if sign == "-":
  773. seconds *= -1
  774. tzinfo = timezone(timedelta(seconds=seconds))
  775. else:
  776. try:
  777. tzinfo = ZoneInfo(tzinfo_string)
  778. except ZoneInfoNotFoundError:
  779. tzinfo = None
  780. if tzinfo is None:
  781. raise ParserError(f"Could not parse timezone expression {tzinfo_string!r}.")
  782. return tzinfo