_parser.py 25 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782
  1. # SPDX-License-Identifier: MIT
  2. # SPDX-FileCopyrightText: 2021 Taneli Hukkinen
  3. # Licensed to PSF under a Contributor Agreement.
  4. from __future__ import annotations
  5. import sys
  6. from types import MappingProxyType
  7. from ._re import (
  8. RE_DATETIME,
  9. RE_LOCALTIME,
  10. RE_NUMBER,
  11. match_to_datetime,
  12. match_to_localtime,
  13. match_to_number,
  14. )
  15. TYPE_CHECKING = False
  16. if TYPE_CHECKING:
  17. from collections.abc import Iterable
  18. from typing import IO, Any, Final
  19. from ._types import Key, ParseFloat, Pos
  20. # Inline tables/arrays are implemented using recursion. Pathologically
  21. # nested documents cause pure Python to raise RecursionError (which is OK),
  22. # but mypyc binary wheels will crash unrecoverably (not OK). According to
  23. # mypyc docs this will be fixed in the future:
  24. # https://mypyc.readthedocs.io/en/latest/differences_from_python.html#stack-overflows
  25. # Before mypyc's fix is in, recursion needs to be limited by this library.
  26. # Choosing `sys.getrecursionlimit()` as maximum inline table/array nesting
  27. # level, as it allows more nesting than pure Python, but still seems a far
  28. # lower number than where mypyc binaries crash.
  29. MAX_INLINE_NESTING: Final = sys.getrecursionlimit()
  30. ASCII_CTRL: Final = frozenset(chr(i) for i in range(32)) | frozenset(chr(127))
  31. # Neither of these sets include quotation mark or backslash. They are
  32. # currently handled as separate cases in the parser functions.
  33. ILLEGAL_BASIC_STR_CHARS: Final = ASCII_CTRL - frozenset("\t")
  34. ILLEGAL_MULTILINE_BASIC_STR_CHARS: Final = ASCII_CTRL - frozenset("\t\n")
  35. ILLEGAL_LITERAL_STR_CHARS: Final = ILLEGAL_BASIC_STR_CHARS
  36. ILLEGAL_MULTILINE_LITERAL_STR_CHARS: Final = ILLEGAL_MULTILINE_BASIC_STR_CHARS
  37. ILLEGAL_COMMENT_CHARS: Final = ILLEGAL_BASIC_STR_CHARS
  38. TOML_WS: Final = frozenset(" \t")
  39. TOML_WS_AND_NEWLINE: Final = TOML_WS | frozenset("\n")
  40. BARE_KEY_CHARS: Final = frozenset(
  41. "abcdefghijklmnopqrstuvwxyz" "ABCDEFGHIJKLMNOPQRSTUVWXYZ" "0123456789" "-_"
  42. )
  43. KEY_INITIAL_CHARS: Final = BARE_KEY_CHARS | frozenset("\"'")
  44. HEXDIGIT_CHARS: Final = frozenset("abcdef" "ABCDEF" "0123456789")
  45. BASIC_STR_ESCAPE_REPLACEMENTS: Final = MappingProxyType(
  46. {
  47. "\\b": "\u0008", # backspace
  48. "\\t": "\u0009", # tab
  49. "\\n": "\u000a", # linefeed
  50. "\\f": "\u000c", # form feed
  51. "\\r": "\u000d", # carriage return
  52. "\\e": "\u001b", # escape
  53. '\\"': "\u0022", # quote
  54. "\\\\": "\u005c", # backslash
  55. }
  56. )
  57. class DEPRECATED_DEFAULT:
  58. """Sentinel to be used as default arg during deprecation
  59. period of TOMLDecodeError's free-form arguments."""
  60. class TOMLDecodeError(ValueError):
  61. """An error raised if a document is not valid TOML.
  62. Adds the following attributes to ValueError:
  63. msg: The unformatted error message
  64. doc: The TOML document being parsed
  65. pos: The index of doc where parsing failed
  66. lineno: The line corresponding to pos
  67. colno: The column corresponding to pos
  68. """
  69. def __init__(
  70. self,
  71. msg: str | type[DEPRECATED_DEFAULT] = DEPRECATED_DEFAULT,
  72. doc: str | type[DEPRECATED_DEFAULT] = DEPRECATED_DEFAULT,
  73. pos: Pos | type[DEPRECATED_DEFAULT] = DEPRECATED_DEFAULT,
  74. *args: Any,
  75. ):
  76. if (
  77. args
  78. or not isinstance(msg, str)
  79. or not isinstance(doc, str)
  80. or not isinstance(pos, int)
  81. ):
  82. import warnings
  83. warnings.warn(
  84. "Free-form arguments for TOMLDecodeError are deprecated. "
  85. "Please set 'msg' (str), 'doc' (str) and 'pos' (int) arguments only.",
  86. DeprecationWarning,
  87. stacklevel=2,
  88. )
  89. if pos is not DEPRECATED_DEFAULT:
  90. args = pos, *args
  91. if doc is not DEPRECATED_DEFAULT:
  92. args = doc, *args
  93. if msg is not DEPRECATED_DEFAULT:
  94. args = msg, *args
  95. ValueError.__init__(self, *args)
  96. return
  97. lineno = doc.count("\n", 0, pos) + 1
  98. if lineno == 1:
  99. colno = pos + 1
  100. else:
  101. colno = pos - doc.rindex("\n", 0, pos)
  102. if pos >= len(doc):
  103. coord_repr = "end of document"
  104. else:
  105. coord_repr = f"line {lineno}, column {colno}"
  106. errmsg = f"{msg} (at {coord_repr})"
  107. ValueError.__init__(self, errmsg)
  108. self.msg = msg
  109. self.doc = doc
  110. self.pos = pos
  111. self.lineno = lineno
  112. self.colno = colno
  113. def load(__fp: IO[bytes], *, parse_float: ParseFloat = float) -> dict[str, Any]:
  114. """Parse TOML from a binary file object."""
  115. b = __fp.read()
  116. try:
  117. s = b.decode()
  118. except AttributeError:
  119. raise TypeError(
  120. "File must be opened in binary mode, e.g. use `open('foo.toml', 'rb')`"
  121. ) from None
  122. return loads(s, parse_float=parse_float)
  123. def loads(__s: str, *, parse_float: ParseFloat = float) -> dict[str, Any]:
  124. """Parse TOML from a string."""
  125. # The spec allows converting "\r\n" to "\n", even in string
  126. # literals. Let's do so to simplify parsing.
  127. try:
  128. src = __s.replace("\r\n", "\n")
  129. except (AttributeError, TypeError):
  130. raise TypeError(
  131. f"Expected str object, not '{type(__s).__qualname__}'"
  132. ) from None
  133. pos = 0
  134. out = Output()
  135. header: Key = ()
  136. parse_float = make_safe_parse_float(parse_float)
  137. # Parse one statement at a time
  138. # (typically means one line in TOML source)
  139. while True:
  140. # 1. Skip line leading whitespace
  141. pos = skip_chars(src, pos, TOML_WS)
  142. # 2. Parse rules. Expect one of the following:
  143. # - end of file
  144. # - end of line
  145. # - comment
  146. # - key/value pair
  147. # - append dict to list (and move to its namespace)
  148. # - create dict (and move to its namespace)
  149. # Skip trailing whitespace when applicable.
  150. try:
  151. char = src[pos]
  152. except IndexError:
  153. break
  154. if char == "\n":
  155. pos += 1
  156. continue
  157. if char in KEY_INITIAL_CHARS:
  158. pos = key_value_rule(src, pos, out, header, parse_float)
  159. pos = skip_chars(src, pos, TOML_WS)
  160. elif char == "[":
  161. try:
  162. second_char: str | None = src[pos + 1]
  163. except IndexError:
  164. second_char = None
  165. out.flags.finalize_pending()
  166. if second_char == "[":
  167. pos, header = create_list_rule(src, pos, out)
  168. else:
  169. pos, header = create_dict_rule(src, pos, out)
  170. pos = skip_chars(src, pos, TOML_WS)
  171. elif char != "#":
  172. raise TOMLDecodeError("Invalid statement", src, pos)
  173. # 3. Skip comment
  174. pos = skip_comment(src, pos)
  175. # 4. Expect end of line or end of file
  176. try:
  177. char = src[pos]
  178. except IndexError:
  179. break
  180. if char != "\n":
  181. raise TOMLDecodeError(
  182. "Expected newline or end of document after a statement", src, pos
  183. )
  184. pos += 1
  185. return out.data.dict
  186. class Flags:
  187. """Flags that map to parsed keys/namespaces."""
  188. # Marks an immutable namespace (inline array or inline table).
  189. FROZEN: Final = 0
  190. # Marks a nest that has been explicitly created and can no longer
  191. # be opened using the "[table]" syntax.
  192. EXPLICIT_NEST: Final = 1
  193. def __init__(self) -> None:
  194. self._flags: dict[str, dict[Any, Any]] = {}
  195. self._pending_flags: set[tuple[Key, int]] = set()
  196. def add_pending(self, key: Key, flag: int) -> None:
  197. self._pending_flags.add((key, flag))
  198. def finalize_pending(self) -> None:
  199. for key, flag in self._pending_flags:
  200. self.set(key, flag, recursive=False)
  201. self._pending_flags.clear()
  202. def unset_all(self, key: Key) -> None:
  203. cont = self._flags
  204. for k in key[:-1]:
  205. if k not in cont:
  206. return
  207. cont = cont[k]["nested"]
  208. cont.pop(key[-1], None)
  209. def set(self, key: Key, flag: int, *, recursive: bool) -> None: # noqa: A003
  210. cont = self._flags
  211. key_parent, key_stem = key[:-1], key[-1]
  212. for k in key_parent:
  213. if k not in cont:
  214. cont[k] = {"flags": set(), "recursive_flags": set(), "nested": {}}
  215. cont = cont[k]["nested"]
  216. if key_stem not in cont:
  217. cont[key_stem] = {"flags": set(), "recursive_flags": set(), "nested": {}}
  218. cont[key_stem]["recursive_flags" if recursive else "flags"].add(flag)
  219. def is_(self, key: Key, flag: int) -> bool:
  220. if not key:
  221. return False # document root has no flags
  222. cont = self._flags
  223. for k in key[:-1]:
  224. if k not in cont:
  225. return False
  226. inner_cont = cont[k]
  227. if flag in inner_cont["recursive_flags"]:
  228. return True
  229. cont = inner_cont["nested"]
  230. key_stem = key[-1]
  231. if key_stem in cont:
  232. inner_cont = cont[key_stem]
  233. return flag in inner_cont["flags"] or flag in inner_cont["recursive_flags"]
  234. return False
  235. class NestedDict:
  236. def __init__(self) -> None:
  237. # The parsed content of the TOML document
  238. self.dict: dict[str, Any] = {}
  239. def get_or_create_nest(
  240. self,
  241. key: Key,
  242. *,
  243. access_lists: bool = True,
  244. ) -> dict[str, Any]:
  245. cont: Any = self.dict
  246. for k in key:
  247. if k not in cont:
  248. cont[k] = {}
  249. cont = cont[k]
  250. if access_lists and isinstance(cont, list):
  251. cont = cont[-1]
  252. if not isinstance(cont, dict):
  253. raise KeyError("There is no nest behind this key")
  254. return cont # type: ignore[no-any-return]
  255. def append_nest_to_list(self, key: Key) -> None:
  256. cont = self.get_or_create_nest(key[:-1])
  257. last_key = key[-1]
  258. if last_key in cont:
  259. list_ = cont[last_key]
  260. if not isinstance(list_, list):
  261. raise KeyError("An object other than list found behind this key")
  262. list_.append({})
  263. else:
  264. cont[last_key] = [{}]
  265. class Output:
  266. def __init__(self) -> None:
  267. self.data = NestedDict()
  268. self.flags = Flags()
  269. def skip_chars(src: str, pos: Pos, chars: Iterable[str]) -> Pos:
  270. try:
  271. while src[pos] in chars:
  272. pos += 1
  273. except IndexError:
  274. pass
  275. return pos
  276. def skip_until(
  277. src: str,
  278. pos: Pos,
  279. expect: str,
  280. *,
  281. error_on: frozenset[str],
  282. error_on_eof: bool,
  283. ) -> Pos:
  284. try:
  285. new_pos = src.index(expect, pos)
  286. except ValueError:
  287. new_pos = len(src)
  288. if error_on_eof:
  289. raise TOMLDecodeError(f"Expected {expect!r}", src, new_pos) from None
  290. if not error_on.isdisjoint(src[pos:new_pos]):
  291. while src[pos] not in error_on:
  292. pos += 1
  293. raise TOMLDecodeError(f"Found invalid character {src[pos]!r}", src, pos)
  294. return new_pos
  295. def skip_comment(src: str, pos: Pos) -> Pos:
  296. try:
  297. char: str | None = src[pos]
  298. except IndexError:
  299. char = None
  300. if char == "#":
  301. return skip_until(
  302. src, pos + 1, "\n", error_on=ILLEGAL_COMMENT_CHARS, error_on_eof=False
  303. )
  304. return pos
  305. def skip_comments_and_array_ws(src: str, pos: Pos) -> Pos:
  306. while True:
  307. pos_before_skip = pos
  308. pos = skip_chars(src, pos, TOML_WS_AND_NEWLINE)
  309. pos = skip_comment(src, pos)
  310. if pos == pos_before_skip:
  311. return pos
  312. def create_dict_rule(src: str, pos: Pos, out: Output) -> tuple[Pos, Key]:
  313. pos += 1 # Skip "["
  314. pos = skip_chars(src, pos, TOML_WS)
  315. pos, key = parse_key(src, pos)
  316. if out.flags.is_(key, Flags.EXPLICIT_NEST) or out.flags.is_(key, Flags.FROZEN):
  317. raise TOMLDecodeError(f"Cannot declare {key} twice", src, pos)
  318. out.flags.set(key, Flags.EXPLICIT_NEST, recursive=False)
  319. try:
  320. out.data.get_or_create_nest(key)
  321. except KeyError:
  322. raise TOMLDecodeError("Cannot overwrite a value", src, pos) from None
  323. if not src.startswith("]", pos):
  324. raise TOMLDecodeError(
  325. "Expected ']' at the end of a table declaration", src, pos
  326. )
  327. return pos + 1, key
  328. def create_list_rule(src: str, pos: Pos, out: Output) -> tuple[Pos, Key]:
  329. pos += 2 # Skip "[["
  330. pos = skip_chars(src, pos, TOML_WS)
  331. pos, key = parse_key(src, pos)
  332. if out.flags.is_(key, Flags.FROZEN):
  333. raise TOMLDecodeError(f"Cannot mutate immutable namespace {key}", src, pos)
  334. # Free the namespace now that it points to another empty list item...
  335. out.flags.unset_all(key)
  336. # ...but this key precisely is still prohibited from table declaration
  337. out.flags.set(key, Flags.EXPLICIT_NEST, recursive=False)
  338. try:
  339. out.data.append_nest_to_list(key)
  340. except KeyError:
  341. raise TOMLDecodeError("Cannot overwrite a value", src, pos) from None
  342. if not src.startswith("]]", pos):
  343. raise TOMLDecodeError(
  344. "Expected ']]' at the end of an array declaration", src, pos
  345. )
  346. return pos + 2, key
  347. def key_value_rule(
  348. src: str, pos: Pos, out: Output, header: Key, parse_float: ParseFloat
  349. ) -> Pos:
  350. pos, key, value = parse_key_value_pair(src, pos, parse_float, nest_lvl=0)
  351. key_parent, key_stem = key[:-1], key[-1]
  352. abs_key_parent = header + key_parent
  353. relative_path_cont_keys = (header + key[:i] for i in range(1, len(key)))
  354. for cont_key in relative_path_cont_keys:
  355. # Check that dotted key syntax does not redefine an existing table
  356. if out.flags.is_(cont_key, Flags.EXPLICIT_NEST):
  357. raise TOMLDecodeError(f"Cannot redefine namespace {cont_key}", src, pos)
  358. # Containers in the relative path can't be opened with the table syntax or
  359. # dotted key/value syntax in following table sections.
  360. out.flags.add_pending(cont_key, Flags.EXPLICIT_NEST)
  361. if out.flags.is_(abs_key_parent, Flags.FROZEN):
  362. raise TOMLDecodeError(
  363. f"Cannot mutate immutable namespace {abs_key_parent}", src, pos
  364. )
  365. try:
  366. nest = out.data.get_or_create_nest(abs_key_parent)
  367. except KeyError:
  368. raise TOMLDecodeError("Cannot overwrite a value", src, pos) from None
  369. if key_stem in nest:
  370. raise TOMLDecodeError("Cannot overwrite a value", src, pos)
  371. # Mark inline table and array namespaces recursively immutable
  372. if isinstance(value, (dict, list)):
  373. out.flags.set(header + key, Flags.FROZEN, recursive=True)
  374. nest[key_stem] = value
  375. return pos
  376. def parse_key_value_pair(
  377. src: str, pos: Pos, parse_float: ParseFloat, nest_lvl: int
  378. ) -> tuple[Pos, Key, Any]:
  379. pos, key = parse_key(src, pos)
  380. try:
  381. char: str | None = src[pos]
  382. except IndexError:
  383. char = None
  384. if char != "=":
  385. raise TOMLDecodeError("Expected '=' after a key in a key/value pair", src, pos)
  386. pos += 1
  387. pos = skip_chars(src, pos, TOML_WS)
  388. pos, value = parse_value(src, pos, parse_float, nest_lvl)
  389. return pos, key, value
  390. def parse_key(src: str, pos: Pos) -> tuple[Pos, Key]:
  391. pos, key_part = parse_key_part(src, pos)
  392. key: Key = (key_part,)
  393. pos = skip_chars(src, pos, TOML_WS)
  394. while True:
  395. try:
  396. char: str | None = src[pos]
  397. except IndexError:
  398. char = None
  399. if char != ".":
  400. return pos, key
  401. pos += 1
  402. pos = skip_chars(src, pos, TOML_WS)
  403. pos, key_part = parse_key_part(src, pos)
  404. key += (key_part,)
  405. pos = skip_chars(src, pos, TOML_WS)
  406. def parse_key_part(src: str, pos: Pos) -> tuple[Pos, str]:
  407. try:
  408. char: str | None = src[pos]
  409. except IndexError:
  410. char = None
  411. if char in BARE_KEY_CHARS:
  412. start_pos = pos
  413. pos = skip_chars(src, pos, BARE_KEY_CHARS)
  414. return pos, src[start_pos:pos]
  415. if char == "'":
  416. return parse_literal_str(src, pos)
  417. if char == '"':
  418. return parse_one_line_basic_str(src, pos)
  419. raise TOMLDecodeError("Invalid initial character for a key part", src, pos)
  420. def parse_one_line_basic_str(src: str, pos: Pos) -> tuple[Pos, str]:
  421. pos += 1
  422. return parse_basic_str(src, pos, multiline=False)
  423. def parse_array(
  424. src: str, pos: Pos, parse_float: ParseFloat, nest_lvl: int
  425. ) -> tuple[Pos, list[Any]]:
  426. pos += 1
  427. array: list[Any] = []
  428. pos = skip_comments_and_array_ws(src, pos)
  429. if src.startswith("]", pos):
  430. return pos + 1, array
  431. while True:
  432. pos, val = parse_value(src, pos, parse_float, nest_lvl)
  433. array.append(val)
  434. pos = skip_comments_and_array_ws(src, pos)
  435. c = src[pos : pos + 1]
  436. if c == "]":
  437. return pos + 1, array
  438. if c != ",":
  439. raise TOMLDecodeError("Unclosed array", src, pos)
  440. pos += 1
  441. pos = skip_comments_and_array_ws(src, pos)
  442. if src.startswith("]", pos):
  443. return pos + 1, array
  444. def parse_inline_table(
  445. src: str, pos: Pos, parse_float: ParseFloat, nest_lvl: int
  446. ) -> tuple[Pos, dict[str, Any]]:
  447. pos += 1
  448. nested_dict = NestedDict()
  449. flags = Flags()
  450. pos = skip_comments_and_array_ws(src, pos)
  451. if src.startswith("}", pos):
  452. return pos + 1, nested_dict.dict
  453. while True:
  454. pos, key, value = parse_key_value_pair(src, pos, parse_float, nest_lvl)
  455. key_parent, key_stem = key[:-1], key[-1]
  456. if flags.is_(key, Flags.FROZEN):
  457. raise TOMLDecodeError(f"Cannot mutate immutable namespace {key}", src, pos)
  458. try:
  459. nest = nested_dict.get_or_create_nest(key_parent, access_lists=False)
  460. except KeyError:
  461. raise TOMLDecodeError("Cannot overwrite a value", src, pos) from None
  462. if key_stem in nest:
  463. raise TOMLDecodeError(f"Duplicate inline table key {key_stem!r}", src, pos)
  464. nest[key_stem] = value
  465. pos = skip_comments_and_array_ws(src, pos)
  466. c = src[pos : pos + 1]
  467. if c == "}":
  468. return pos + 1, nested_dict.dict
  469. if c != ",":
  470. raise TOMLDecodeError("Unclosed inline table", src, pos)
  471. pos += 1
  472. pos = skip_comments_and_array_ws(src, pos)
  473. if src.startswith("}", pos):
  474. return pos + 1, nested_dict.dict
  475. if isinstance(value, (dict, list)):
  476. flags.set(key, Flags.FROZEN, recursive=True)
  477. def parse_basic_str_escape(
  478. src: str, pos: Pos, *, multiline: bool = False
  479. ) -> tuple[Pos, str]:
  480. escape_id = src[pos : pos + 2]
  481. pos += 2
  482. if multiline and escape_id in {"\\ ", "\\\t", "\\\n"}:
  483. # Skip whitespace until next non-whitespace character or end of
  484. # the doc. Error if non-whitespace is found before newline.
  485. if escape_id != "\\\n":
  486. pos = skip_chars(src, pos, TOML_WS)
  487. try:
  488. char = src[pos]
  489. except IndexError:
  490. return pos, ""
  491. if char != "\n":
  492. raise TOMLDecodeError("Unescaped '\\' in a string", src, pos)
  493. pos += 1
  494. pos = skip_chars(src, pos, TOML_WS_AND_NEWLINE)
  495. return pos, ""
  496. if escape_id == "\\x":
  497. return parse_hex_char(src, pos, 2)
  498. if escape_id == "\\u":
  499. return parse_hex_char(src, pos, 4)
  500. if escape_id == "\\U":
  501. return parse_hex_char(src, pos, 8)
  502. try:
  503. return pos, BASIC_STR_ESCAPE_REPLACEMENTS[escape_id]
  504. except KeyError:
  505. raise TOMLDecodeError("Unescaped '\\' in a string", src, pos) from None
  506. def parse_basic_str_escape_multiline(src: str, pos: Pos) -> tuple[Pos, str]:
  507. return parse_basic_str_escape(src, pos, multiline=True)
  508. def parse_hex_char(src: str, pos: Pos, hex_len: int) -> tuple[Pos, str]:
  509. hex_str = src[pos : pos + hex_len]
  510. if len(hex_str) != hex_len or not HEXDIGIT_CHARS.issuperset(hex_str):
  511. raise TOMLDecodeError("Invalid hex value", src, pos)
  512. pos += hex_len
  513. hex_int = int(hex_str, 16)
  514. if not is_unicode_scalar_value(hex_int):
  515. raise TOMLDecodeError(
  516. "Escaped character is not a Unicode scalar value", src, pos
  517. )
  518. return pos, chr(hex_int)
  519. def parse_literal_str(src: str, pos: Pos) -> tuple[Pos, str]:
  520. pos += 1 # Skip starting apostrophe
  521. start_pos = pos
  522. pos = skip_until(
  523. src, pos, "'", error_on=ILLEGAL_LITERAL_STR_CHARS, error_on_eof=True
  524. )
  525. return pos + 1, src[start_pos:pos] # Skip ending apostrophe
  526. def parse_multiline_str(src: str, pos: Pos, *, literal: bool) -> tuple[Pos, str]:
  527. pos += 3
  528. if src.startswith("\n", pos):
  529. pos += 1
  530. if literal:
  531. delim = "'"
  532. end_pos = skip_until(
  533. src,
  534. pos,
  535. "'''",
  536. error_on=ILLEGAL_MULTILINE_LITERAL_STR_CHARS,
  537. error_on_eof=True,
  538. )
  539. result = src[pos:end_pos]
  540. pos = end_pos + 3
  541. else:
  542. delim = '"'
  543. pos, result = parse_basic_str(src, pos, multiline=True)
  544. # Add at maximum two extra apostrophes/quotes if the end sequence
  545. # is 4 or 5 chars long instead of just 3.
  546. if not src.startswith(delim, pos):
  547. return pos, result
  548. pos += 1
  549. if not src.startswith(delim, pos):
  550. return pos, result + delim
  551. pos += 1
  552. return pos, result + (delim * 2)
  553. def parse_basic_str(src: str, pos: Pos, *, multiline: bool) -> tuple[Pos, str]:
  554. if multiline:
  555. error_on = ILLEGAL_MULTILINE_BASIC_STR_CHARS
  556. parse_escapes = parse_basic_str_escape_multiline
  557. else:
  558. error_on = ILLEGAL_BASIC_STR_CHARS
  559. parse_escapes = parse_basic_str_escape
  560. result = ""
  561. start_pos = pos
  562. while True:
  563. try:
  564. char = src[pos]
  565. except IndexError:
  566. raise TOMLDecodeError("Unterminated string", src, pos) from None
  567. if char == '"':
  568. if not multiline:
  569. return pos + 1, result + src[start_pos:pos]
  570. if src.startswith('"""', pos):
  571. return pos + 3, result + src[start_pos:pos]
  572. pos += 1
  573. continue
  574. if char == "\\":
  575. result += src[start_pos:pos]
  576. pos, parsed_escape = parse_escapes(src, pos)
  577. result += parsed_escape
  578. start_pos = pos
  579. continue
  580. if char in error_on:
  581. raise TOMLDecodeError(f"Illegal character {char!r}", src, pos)
  582. pos += 1
  583. def parse_value(
  584. src: str, pos: Pos, parse_float: ParseFloat, nest_lvl: int
  585. ) -> tuple[Pos, Any]:
  586. if nest_lvl > MAX_INLINE_NESTING:
  587. # Pure Python should have raised RecursionError already.
  588. # This ensures mypyc binaries eventually do the same.
  589. raise RecursionError( # pragma: no cover
  590. "TOML inline arrays/tables are nested more than the allowed"
  591. f" {MAX_INLINE_NESTING} levels"
  592. )
  593. try:
  594. char: str | None = src[pos]
  595. except IndexError:
  596. char = None
  597. # IMPORTANT: order conditions based on speed of checking and likelihood
  598. # Basic strings
  599. if char == '"':
  600. if src.startswith('"""', pos):
  601. return parse_multiline_str(src, pos, literal=False)
  602. return parse_one_line_basic_str(src, pos)
  603. # Literal strings
  604. if char == "'":
  605. if src.startswith("'''", pos):
  606. return parse_multiline_str(src, pos, literal=True)
  607. return parse_literal_str(src, pos)
  608. # Booleans
  609. if char == "t":
  610. if src.startswith("true", pos):
  611. return pos + 4, True
  612. if char == "f":
  613. if src.startswith("false", pos):
  614. return pos + 5, False
  615. # Arrays
  616. if char == "[":
  617. return parse_array(src, pos, parse_float, nest_lvl + 1)
  618. # Inline tables
  619. if char == "{":
  620. return parse_inline_table(src, pos, parse_float, nest_lvl + 1)
  621. # Dates and times
  622. datetime_match = RE_DATETIME.match(src, pos)
  623. if datetime_match:
  624. try:
  625. datetime_obj = match_to_datetime(datetime_match)
  626. except ValueError as e:
  627. raise TOMLDecodeError("Invalid date or datetime", src, pos) from e
  628. return datetime_match.end(), datetime_obj
  629. localtime_match = RE_LOCALTIME.match(src, pos)
  630. if localtime_match:
  631. return localtime_match.end(), match_to_localtime(localtime_match)
  632. # Integers and "normal" floats.
  633. # The regex will greedily match any type starting with a decimal
  634. # char, so needs to be located after handling of dates and times.
  635. number_match = RE_NUMBER.match(src, pos)
  636. if number_match:
  637. return number_match.end(), match_to_number(number_match, parse_float)
  638. # Special floats
  639. first_three = src[pos : pos + 3]
  640. if first_three in {"inf", "nan"}:
  641. return pos + 3, parse_float(first_three)
  642. first_four = src[pos : pos + 4]
  643. if first_four in {"-inf", "+inf", "-nan", "+nan"}:
  644. return pos + 4, parse_float(first_four)
  645. raise TOMLDecodeError("Invalid value", src, pos)
  646. def is_unicode_scalar_value(codepoint: int) -> bool:
  647. return (0 <= codepoint <= 55295) or (57344 <= codepoint <= 1114111)
  648. def make_safe_parse_float(parse_float: ParseFloat) -> ParseFloat:
  649. """A decorator to make `parse_float` safe.
  650. `parse_float` must not return dicts or lists, because these types
  651. would be mixed with parsed TOML tables and arrays, thus confusing
  652. the parser. The returned decorated callable raises `ValueError`
  653. instead of returning illegal types.
  654. """
  655. # The default `float` callable never returns illegal types. Optimize it.
  656. if parse_float is float:
  657. return float
  658. def safe_parse_float(float_str: str) -> Any:
  659. float_value = parse_float(float_str)
  660. if isinstance(float_value, (dict, list)):
  661. raise ValueError("parse_float must not return dicts or lists")
  662. return float_value
  663. return safe_parse_float