_parser.py 25 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777
  1. # SPDX-License-Identifier: MIT
  2. # SPDX-FileCopyrightText: 2021 Taneli Hukkinen
  3. # Licensed to PSF under a Contributor Agreement.
  4. from __future__ import annotations
  5. import sys
  6. from types import MappingProxyType
  7. from ._re import (
  8. RE_DATETIME,
  9. RE_LOCALTIME,
  10. RE_NUMBER,
  11. match_to_datetime,
  12. match_to_localtime,
  13. match_to_number,
  14. )
  15. TYPE_CHECKING = False
  16. if TYPE_CHECKING:
  17. from collections.abc import Iterable
  18. from typing import IO, Any, Final
  19. from ._types import Key, ParseFloat, Pos
  20. # Inline tables/arrays are implemented using recursion. Pathologically
  21. # nested documents cause pure Python to raise RecursionError (which is OK),
  22. # but mypyc binary wheels will crash unrecoverably (not OK). According to
  23. # mypyc docs this will be fixed in the future:
  24. # https://mypyc.readthedocs.io/en/latest/differences_from_python.html#stack-overflows
  25. # Before mypyc's fix is in, recursion needs to be limited by this library.
  26. # Choosing `sys.getrecursionlimit()` as maximum inline table/array nesting
  27. # level, as it allows more nesting than pure Python, but still seems a far
  28. # lower number than where mypyc binaries crash.
  29. MAX_INLINE_NESTING: Final = sys.getrecursionlimit()
  30. ASCII_CTRL: Final = frozenset(chr(i) for i in range(32)) | frozenset(chr(127))
  31. # Neither of these sets include quotation mark or backslash. They are
  32. # currently handled as separate cases in the parser functions.
  33. ILLEGAL_BASIC_STR_CHARS: Final = ASCII_CTRL - frozenset("\t")
  34. ILLEGAL_MULTILINE_BASIC_STR_CHARS: Final = ASCII_CTRL - frozenset("\t\n")
  35. ILLEGAL_LITERAL_STR_CHARS: Final = ILLEGAL_BASIC_STR_CHARS
  36. ILLEGAL_MULTILINE_LITERAL_STR_CHARS: Final = ILLEGAL_MULTILINE_BASIC_STR_CHARS
  37. ILLEGAL_COMMENT_CHARS: Final = ILLEGAL_BASIC_STR_CHARS
  38. TOML_WS: Final = frozenset(" \t")
  39. TOML_WS_AND_NEWLINE: Final = TOML_WS | frozenset("\n")
  40. BARE_KEY_CHARS: Final = frozenset(
  41. "abcdefghijklmnopqrstuvwxyz" "ABCDEFGHIJKLMNOPQRSTUVWXYZ" "0123456789" "-_"
  42. )
  43. KEY_INITIAL_CHARS: Final = BARE_KEY_CHARS | frozenset("\"'")
  44. HEXDIGIT_CHARS: Final = frozenset("abcdef" "ABCDEF" "0123456789")
  45. BASIC_STR_ESCAPE_REPLACEMENTS: Final = MappingProxyType(
  46. {
  47. "\\b": "\u0008", # backspace
  48. "\\t": "\u0009", # tab
  49. "\\n": "\u000a", # linefeed
  50. "\\f": "\u000c", # form feed
  51. "\\r": "\u000d", # carriage return
  52. '\\"': "\u0022", # quote
  53. "\\\\": "\u005c", # backslash
  54. }
  55. )
  56. class DEPRECATED_DEFAULT:
  57. """Sentinel to be used as default arg during deprecation
  58. period of TOMLDecodeError's free-form arguments."""
  59. class TOMLDecodeError(ValueError):
  60. """An error raised if a document is not valid TOML.
  61. Adds the following attributes to ValueError:
  62. msg: The unformatted error message
  63. doc: The TOML document being parsed
  64. pos: The index of doc where parsing failed
  65. lineno: The line corresponding to pos
  66. colno: The column corresponding to pos
  67. """
  68. def __init__(
  69. self,
  70. msg: str | type[DEPRECATED_DEFAULT] = DEPRECATED_DEFAULT,
  71. doc: str | type[DEPRECATED_DEFAULT] = DEPRECATED_DEFAULT,
  72. pos: Pos | type[DEPRECATED_DEFAULT] = DEPRECATED_DEFAULT,
  73. *args: Any,
  74. ):
  75. if (
  76. args
  77. or not isinstance(msg, str)
  78. or not isinstance(doc, str)
  79. or not isinstance(pos, int)
  80. ):
  81. import warnings
  82. warnings.warn(
  83. "Free-form arguments for TOMLDecodeError are deprecated. "
  84. "Please set 'msg' (str), 'doc' (str) and 'pos' (int) arguments only.",
  85. DeprecationWarning,
  86. stacklevel=2,
  87. )
  88. if pos is not DEPRECATED_DEFAULT:
  89. args = pos, *args
  90. if doc is not DEPRECATED_DEFAULT:
  91. args = doc, *args
  92. if msg is not DEPRECATED_DEFAULT:
  93. args = msg, *args
  94. ValueError.__init__(self, *args)
  95. return
  96. lineno = doc.count("\n", 0, pos) + 1
  97. if lineno == 1:
  98. colno = pos + 1
  99. else:
  100. colno = pos - doc.rindex("\n", 0, pos)
  101. if pos >= len(doc):
  102. coord_repr = "end of document"
  103. else:
  104. coord_repr = f"line {lineno}, column {colno}"
  105. errmsg = f"{msg} (at {coord_repr})"
  106. ValueError.__init__(self, errmsg)
  107. self.msg = msg
  108. self.doc = doc
  109. self.pos = pos
  110. self.lineno = lineno
  111. self.colno = colno
  112. def load(__fp: IO[bytes], *, parse_float: ParseFloat = float) -> dict[str, Any]:
  113. """Parse TOML from a binary file object."""
  114. b = __fp.read()
  115. try:
  116. s = b.decode()
  117. except AttributeError:
  118. raise TypeError(
  119. "File must be opened in binary mode, e.g. use `open('foo.toml', 'rb')`"
  120. ) from None
  121. return loads(s, parse_float=parse_float)
  122. def loads(__s: str, *, parse_float: ParseFloat = float) -> dict[str, Any]: # noqa: C901
  123. """Parse TOML from a string."""
  124. # The spec allows converting "\r\n" to "\n", even in string
  125. # literals. Let's do so to simplify parsing.
  126. try:
  127. src = __s.replace("\r\n", "\n")
  128. except (AttributeError, TypeError):
  129. raise TypeError(
  130. f"Expected str object, not '{type(__s).__qualname__}'"
  131. ) from None
  132. pos = 0
  133. out = Output()
  134. header: Key = ()
  135. parse_float = make_safe_parse_float(parse_float)
  136. # Parse one statement at a time
  137. # (typically means one line in TOML source)
  138. while True:
  139. # 1. Skip line leading whitespace
  140. pos = skip_chars(src, pos, TOML_WS)
  141. # 2. Parse rules. Expect one of the following:
  142. # - end of file
  143. # - end of line
  144. # - comment
  145. # - key/value pair
  146. # - append dict to list (and move to its namespace)
  147. # - create dict (and move to its namespace)
  148. # Skip trailing whitespace when applicable.
  149. try:
  150. char = src[pos]
  151. except IndexError:
  152. break
  153. if char == "\n":
  154. pos += 1
  155. continue
  156. if char in KEY_INITIAL_CHARS:
  157. pos = key_value_rule(src, pos, out, header, parse_float)
  158. pos = skip_chars(src, pos, TOML_WS)
  159. elif char == "[":
  160. try:
  161. second_char: str | None = src[pos + 1]
  162. except IndexError:
  163. second_char = None
  164. out.flags.finalize_pending()
  165. if second_char == "[":
  166. pos, header = create_list_rule(src, pos, out)
  167. else:
  168. pos, header = create_dict_rule(src, pos, out)
  169. pos = skip_chars(src, pos, TOML_WS)
  170. elif char != "#":
  171. raise TOMLDecodeError("Invalid statement", src, pos)
  172. # 3. Skip comment
  173. pos = skip_comment(src, pos)
  174. # 4. Expect end of line or end of file
  175. try:
  176. char = src[pos]
  177. except IndexError:
  178. break
  179. if char != "\n":
  180. raise TOMLDecodeError(
  181. "Expected newline or end of document after a statement", src, pos
  182. )
  183. pos += 1
  184. return out.data.dict
  185. class Flags:
  186. """Flags that map to parsed keys/namespaces."""
  187. # Marks an immutable namespace (inline array or inline table).
  188. FROZEN: Final = 0
  189. # Marks a nest that has been explicitly created and can no longer
  190. # be opened using the "[table]" syntax.
  191. EXPLICIT_NEST: Final = 1
  192. def __init__(self) -> None:
  193. self._flags: dict[str, dict[Any, Any]] = {}
  194. self._pending_flags: set[tuple[Key, int]] = set()
  195. def add_pending(self, key: Key, flag: int) -> None:
  196. self._pending_flags.add((key, flag))
  197. def finalize_pending(self) -> None:
  198. for key, flag in self._pending_flags:
  199. self.set(key, flag, recursive=False)
  200. self._pending_flags.clear()
  201. def unset_all(self, key: Key) -> None:
  202. cont = self._flags
  203. for k in key[:-1]:
  204. if k not in cont:
  205. return
  206. cont = cont[k]["nested"]
  207. cont.pop(key[-1], None)
  208. def set(self, key: Key, flag: int, *, recursive: bool) -> None: # noqa: A003
  209. cont = self._flags
  210. key_parent, key_stem = key[:-1], key[-1]
  211. for k in key_parent:
  212. if k not in cont:
  213. cont[k] = {"flags": set(), "recursive_flags": set(), "nested": {}}
  214. cont = cont[k]["nested"]
  215. if key_stem not in cont:
  216. cont[key_stem] = {"flags": set(), "recursive_flags": set(), "nested": {}}
  217. cont[key_stem]["recursive_flags" if recursive else "flags"].add(flag)
  218. def is_(self, key: Key, flag: int) -> bool:
  219. if not key:
  220. return False # document root has no flags
  221. cont = self._flags
  222. for k in key[:-1]:
  223. if k not in cont:
  224. return False
  225. inner_cont = cont[k]
  226. if flag in inner_cont["recursive_flags"]:
  227. return True
  228. cont = inner_cont["nested"]
  229. key_stem = key[-1]
  230. if key_stem in cont:
  231. inner_cont = cont[key_stem]
  232. return flag in inner_cont["flags"] or flag in inner_cont["recursive_flags"]
  233. return False
  234. class NestedDict:
  235. def __init__(self) -> None:
  236. # The parsed content of the TOML document
  237. self.dict: dict[str, Any] = {}
  238. def get_or_create_nest(
  239. self,
  240. key: Key,
  241. *,
  242. access_lists: bool = True,
  243. ) -> dict[str, Any]:
  244. cont: Any = self.dict
  245. for k in key:
  246. if k not in cont:
  247. cont[k] = {}
  248. cont = cont[k]
  249. if access_lists and isinstance(cont, list):
  250. cont = cont[-1]
  251. if not isinstance(cont, dict):
  252. raise KeyError("There is no nest behind this key")
  253. return cont # type: ignore[no-any-return]
  254. def append_nest_to_list(self, key: Key) -> None:
  255. cont = self.get_or_create_nest(key[:-1])
  256. last_key = key[-1]
  257. if last_key in cont:
  258. list_ = cont[last_key]
  259. if not isinstance(list_, list):
  260. raise KeyError("An object other than list found behind this key")
  261. list_.append({})
  262. else:
  263. cont[last_key] = [{}]
  264. class Output:
  265. def __init__(self) -> None:
  266. self.data = NestedDict()
  267. self.flags = Flags()
  268. def skip_chars(src: str, pos: Pos, chars: Iterable[str]) -> Pos:
  269. try:
  270. while src[pos] in chars:
  271. pos += 1
  272. except IndexError:
  273. pass
  274. return pos
  275. def skip_until(
  276. src: str,
  277. pos: Pos,
  278. expect: str,
  279. *,
  280. error_on: frozenset[str],
  281. error_on_eof: bool,
  282. ) -> Pos:
  283. try:
  284. new_pos = src.index(expect, pos)
  285. except ValueError:
  286. new_pos = len(src)
  287. if error_on_eof:
  288. raise TOMLDecodeError(f"Expected {expect!r}", src, new_pos) from None
  289. if not error_on.isdisjoint(src[pos:new_pos]):
  290. while src[pos] not in error_on:
  291. pos += 1
  292. raise TOMLDecodeError(f"Found invalid character {src[pos]!r}", src, pos)
  293. return new_pos
  294. def skip_comment(src: str, pos: Pos) -> Pos:
  295. try:
  296. char: str | None = src[pos]
  297. except IndexError:
  298. char = None
  299. if char == "#":
  300. return skip_until(
  301. src, pos + 1, "\n", error_on=ILLEGAL_COMMENT_CHARS, error_on_eof=False
  302. )
  303. return pos
  304. def skip_comments_and_array_ws(src: str, pos: Pos) -> Pos:
  305. while True:
  306. pos_before_skip = pos
  307. pos = skip_chars(src, pos, TOML_WS_AND_NEWLINE)
  308. pos = skip_comment(src, pos)
  309. if pos == pos_before_skip:
  310. return pos
  311. def create_dict_rule(src: str, pos: Pos, out: Output) -> tuple[Pos, Key]:
  312. pos += 1 # Skip "["
  313. pos = skip_chars(src, pos, TOML_WS)
  314. pos, key = parse_key(src, pos)
  315. if out.flags.is_(key, Flags.EXPLICIT_NEST) or out.flags.is_(key, Flags.FROZEN):
  316. raise TOMLDecodeError(f"Cannot declare {key} twice", src, pos)
  317. out.flags.set(key, Flags.EXPLICIT_NEST, recursive=False)
  318. try:
  319. out.data.get_or_create_nest(key)
  320. except KeyError:
  321. raise TOMLDecodeError("Cannot overwrite a value", src, pos) from None
  322. if not src.startswith("]", pos):
  323. raise TOMLDecodeError(
  324. "Expected ']' at the end of a table declaration", src, pos
  325. )
  326. return pos + 1, key
  327. def create_list_rule(src: str, pos: Pos, out: Output) -> tuple[Pos, Key]:
  328. pos += 2 # Skip "[["
  329. pos = skip_chars(src, pos, TOML_WS)
  330. pos, key = parse_key(src, pos)
  331. if out.flags.is_(key, Flags.FROZEN):
  332. raise TOMLDecodeError(f"Cannot mutate immutable namespace {key}", src, pos)
  333. # Free the namespace now that it points to another empty list item...
  334. out.flags.unset_all(key)
  335. # ...but this key precisely is still prohibited from table declaration
  336. out.flags.set(key, Flags.EXPLICIT_NEST, recursive=False)
  337. try:
  338. out.data.append_nest_to_list(key)
  339. except KeyError:
  340. raise TOMLDecodeError("Cannot overwrite a value", src, pos) from None
  341. if not src.startswith("]]", pos):
  342. raise TOMLDecodeError(
  343. "Expected ']]' at the end of an array declaration", src, pos
  344. )
  345. return pos + 2, key
  346. def key_value_rule(
  347. src: str, pos: Pos, out: Output, header: Key, parse_float: ParseFloat
  348. ) -> Pos:
  349. pos, key, value = parse_key_value_pair(src, pos, parse_float, nest_lvl=0)
  350. key_parent, key_stem = key[:-1], key[-1]
  351. abs_key_parent = header + key_parent
  352. relative_path_cont_keys = (header + key[:i] for i in range(1, len(key)))
  353. for cont_key in relative_path_cont_keys:
  354. # Check that dotted key syntax does not redefine an existing table
  355. if out.flags.is_(cont_key, Flags.EXPLICIT_NEST):
  356. raise TOMLDecodeError(f"Cannot redefine namespace {cont_key}", src, pos)
  357. # Containers in the relative path can't be opened with the table syntax or
  358. # dotted key/value syntax in following table sections.
  359. out.flags.add_pending(cont_key, Flags.EXPLICIT_NEST)
  360. if out.flags.is_(abs_key_parent, Flags.FROZEN):
  361. raise TOMLDecodeError(
  362. f"Cannot mutate immutable namespace {abs_key_parent}", src, pos
  363. )
  364. try:
  365. nest = out.data.get_or_create_nest(abs_key_parent)
  366. except KeyError:
  367. raise TOMLDecodeError("Cannot overwrite a value", src, pos) from None
  368. if key_stem in nest:
  369. raise TOMLDecodeError("Cannot overwrite a value", src, pos)
  370. # Mark inline table and array namespaces recursively immutable
  371. if isinstance(value, (dict, list)):
  372. out.flags.set(header + key, Flags.FROZEN, recursive=True)
  373. nest[key_stem] = value
  374. return pos
  375. def parse_key_value_pair(
  376. src: str, pos: Pos, parse_float: ParseFloat, nest_lvl: int
  377. ) -> tuple[Pos, Key, Any]:
  378. pos, key = parse_key(src, pos)
  379. try:
  380. char: str | None = src[pos]
  381. except IndexError:
  382. char = None
  383. if char != "=":
  384. raise TOMLDecodeError("Expected '=' after a key in a key/value pair", src, pos)
  385. pos += 1
  386. pos = skip_chars(src, pos, TOML_WS)
  387. pos, value = parse_value(src, pos, parse_float, nest_lvl)
  388. return pos, key, value
  389. def parse_key(src: str, pos: Pos) -> tuple[Pos, Key]:
  390. pos, key_part = parse_key_part(src, pos)
  391. key: Key = (key_part,)
  392. pos = skip_chars(src, pos, TOML_WS)
  393. while True:
  394. try:
  395. char: str | None = src[pos]
  396. except IndexError:
  397. char = None
  398. if char != ".":
  399. return pos, key
  400. pos += 1
  401. pos = skip_chars(src, pos, TOML_WS)
  402. pos, key_part = parse_key_part(src, pos)
  403. key += (key_part,)
  404. pos = skip_chars(src, pos, TOML_WS)
  405. def parse_key_part(src: str, pos: Pos) -> tuple[Pos, str]:
  406. try:
  407. char: str | None = src[pos]
  408. except IndexError:
  409. char = None
  410. if char in BARE_KEY_CHARS:
  411. start_pos = pos
  412. pos = skip_chars(src, pos, BARE_KEY_CHARS)
  413. return pos, src[start_pos:pos]
  414. if char == "'":
  415. return parse_literal_str(src, pos)
  416. if char == '"':
  417. return parse_one_line_basic_str(src, pos)
  418. raise TOMLDecodeError("Invalid initial character for a key part", src, pos)
  419. def parse_one_line_basic_str(src: str, pos: Pos) -> tuple[Pos, str]:
  420. pos += 1
  421. return parse_basic_str(src, pos, multiline=False)
  422. def parse_array(
  423. src: str, pos: Pos, parse_float: ParseFloat, nest_lvl: int
  424. ) -> tuple[Pos, list[Any]]:
  425. pos += 1
  426. array: list[Any] = []
  427. pos = skip_comments_and_array_ws(src, pos)
  428. if src.startswith("]", pos):
  429. return pos + 1, array
  430. while True:
  431. pos, val = parse_value(src, pos, parse_float, nest_lvl)
  432. array.append(val)
  433. pos = skip_comments_and_array_ws(src, pos)
  434. c = src[pos : pos + 1]
  435. if c == "]":
  436. return pos + 1, array
  437. if c != ",":
  438. raise TOMLDecodeError("Unclosed array", src, pos)
  439. pos += 1
  440. pos = skip_comments_and_array_ws(src, pos)
  441. if src.startswith("]", pos):
  442. return pos + 1, array
  443. def parse_inline_table(
  444. src: str, pos: Pos, parse_float: ParseFloat, nest_lvl: int
  445. ) -> tuple[Pos, dict[str, Any]]:
  446. pos += 1
  447. nested_dict = NestedDict()
  448. flags = Flags()
  449. pos = skip_chars(src, pos, TOML_WS)
  450. if src.startswith("}", pos):
  451. return pos + 1, nested_dict.dict
  452. while True:
  453. pos, key, value = parse_key_value_pair(src, pos, parse_float, nest_lvl)
  454. key_parent, key_stem = key[:-1], key[-1]
  455. if flags.is_(key, Flags.FROZEN):
  456. raise TOMLDecodeError(f"Cannot mutate immutable namespace {key}", src, pos)
  457. try:
  458. nest = nested_dict.get_or_create_nest(key_parent, access_lists=False)
  459. except KeyError:
  460. raise TOMLDecodeError("Cannot overwrite a value", src, pos) from None
  461. if key_stem in nest:
  462. raise TOMLDecodeError(f"Duplicate inline table key {key_stem!r}", src, pos)
  463. nest[key_stem] = value
  464. pos = skip_chars(src, pos, TOML_WS)
  465. c = src[pos : pos + 1]
  466. if c == "}":
  467. return pos + 1, nested_dict.dict
  468. if c != ",":
  469. raise TOMLDecodeError("Unclosed inline table", src, pos)
  470. if isinstance(value, (dict, list)):
  471. flags.set(key, Flags.FROZEN, recursive=True)
  472. pos += 1
  473. pos = skip_chars(src, pos, TOML_WS)
  474. def parse_basic_str_escape(
  475. src: str, pos: Pos, *, multiline: bool = False
  476. ) -> tuple[Pos, str]:
  477. escape_id = src[pos : pos + 2]
  478. pos += 2
  479. if multiline and escape_id in {"\\ ", "\\\t", "\\\n"}:
  480. # Skip whitespace until next non-whitespace character or end of
  481. # the doc. Error if non-whitespace is found before newline.
  482. if escape_id != "\\\n":
  483. pos = skip_chars(src, pos, TOML_WS)
  484. try:
  485. char = src[pos]
  486. except IndexError:
  487. return pos, ""
  488. if char != "\n":
  489. raise TOMLDecodeError("Unescaped '\\' in a string", src, pos)
  490. pos += 1
  491. pos = skip_chars(src, pos, TOML_WS_AND_NEWLINE)
  492. return pos, ""
  493. if escape_id == "\\u":
  494. return parse_hex_char(src, pos, 4)
  495. if escape_id == "\\U":
  496. return parse_hex_char(src, pos, 8)
  497. try:
  498. return pos, BASIC_STR_ESCAPE_REPLACEMENTS[escape_id]
  499. except KeyError:
  500. raise TOMLDecodeError("Unescaped '\\' in a string", src, pos) from None
  501. def parse_basic_str_escape_multiline(src: str, pos: Pos) -> tuple[Pos, str]:
  502. return parse_basic_str_escape(src, pos, multiline=True)
  503. def parse_hex_char(src: str, pos: Pos, hex_len: int) -> tuple[Pos, str]:
  504. hex_str = src[pos : pos + hex_len]
  505. if len(hex_str) != hex_len or not HEXDIGIT_CHARS.issuperset(hex_str):
  506. raise TOMLDecodeError("Invalid hex value", src, pos)
  507. pos += hex_len
  508. hex_int = int(hex_str, 16)
  509. if not is_unicode_scalar_value(hex_int):
  510. raise TOMLDecodeError(
  511. "Escaped character is not a Unicode scalar value", src, pos
  512. )
  513. return pos, chr(hex_int)
  514. def parse_literal_str(src: str, pos: Pos) -> tuple[Pos, str]:
  515. pos += 1 # Skip starting apostrophe
  516. start_pos = pos
  517. pos = skip_until(
  518. src, pos, "'", error_on=ILLEGAL_LITERAL_STR_CHARS, error_on_eof=True
  519. )
  520. return pos + 1, src[start_pos:pos] # Skip ending apostrophe
  521. def parse_multiline_str(src: str, pos: Pos, *, literal: bool) -> tuple[Pos, str]:
  522. pos += 3
  523. if src.startswith("\n", pos):
  524. pos += 1
  525. if literal:
  526. delim = "'"
  527. end_pos = skip_until(
  528. src,
  529. pos,
  530. "'''",
  531. error_on=ILLEGAL_MULTILINE_LITERAL_STR_CHARS,
  532. error_on_eof=True,
  533. )
  534. result = src[pos:end_pos]
  535. pos = end_pos + 3
  536. else:
  537. delim = '"'
  538. pos, result = parse_basic_str(src, pos, multiline=True)
  539. # Add at maximum two extra apostrophes/quotes if the end sequence
  540. # is 4 or 5 chars long instead of just 3.
  541. if not src.startswith(delim, pos):
  542. return pos, result
  543. pos += 1
  544. if not src.startswith(delim, pos):
  545. return pos, result + delim
  546. pos += 1
  547. return pos, result + (delim * 2)
  548. def parse_basic_str(src: str, pos: Pos, *, multiline: bool) -> tuple[Pos, str]:
  549. if multiline:
  550. error_on = ILLEGAL_MULTILINE_BASIC_STR_CHARS
  551. parse_escapes = parse_basic_str_escape_multiline
  552. else:
  553. error_on = ILLEGAL_BASIC_STR_CHARS
  554. parse_escapes = parse_basic_str_escape
  555. result = ""
  556. start_pos = pos
  557. while True:
  558. try:
  559. char = src[pos]
  560. except IndexError:
  561. raise TOMLDecodeError("Unterminated string", src, pos) from None
  562. if char == '"':
  563. if not multiline:
  564. return pos + 1, result + src[start_pos:pos]
  565. if src.startswith('"""', pos):
  566. return pos + 3, result + src[start_pos:pos]
  567. pos += 1
  568. continue
  569. if char == "\\":
  570. result += src[start_pos:pos]
  571. pos, parsed_escape = parse_escapes(src, pos)
  572. result += parsed_escape
  573. start_pos = pos
  574. continue
  575. if char in error_on:
  576. raise TOMLDecodeError(f"Illegal character {char!r}", src, pos)
  577. pos += 1
  578. def parse_value( # noqa: C901
  579. src: str, pos: Pos, parse_float: ParseFloat, nest_lvl: int
  580. ) -> tuple[Pos, Any]:
  581. if nest_lvl > MAX_INLINE_NESTING:
  582. # Pure Python should have raised RecursionError already.
  583. # This ensures mypyc binaries eventually do the same.
  584. raise RecursionError( # pragma: no cover
  585. "TOML inline arrays/tables are nested more than the allowed"
  586. f" {MAX_INLINE_NESTING} levels"
  587. )
  588. try:
  589. char: str | None = src[pos]
  590. except IndexError:
  591. char = None
  592. # IMPORTANT: order conditions based on speed of checking and likelihood
  593. # Basic strings
  594. if char == '"':
  595. if src.startswith('"""', pos):
  596. return parse_multiline_str(src, pos, literal=False)
  597. return parse_one_line_basic_str(src, pos)
  598. # Literal strings
  599. if char == "'":
  600. if src.startswith("'''", pos):
  601. return parse_multiline_str(src, pos, literal=True)
  602. return parse_literal_str(src, pos)
  603. # Booleans
  604. if char == "t":
  605. if src.startswith("true", pos):
  606. return pos + 4, True
  607. if char == "f":
  608. if src.startswith("false", pos):
  609. return pos + 5, False
  610. # Arrays
  611. if char == "[":
  612. return parse_array(src, pos, parse_float, nest_lvl + 1)
  613. # Inline tables
  614. if char == "{":
  615. return parse_inline_table(src, pos, parse_float, nest_lvl + 1)
  616. # Dates and times
  617. datetime_match = RE_DATETIME.match(src, pos)
  618. if datetime_match:
  619. try:
  620. datetime_obj = match_to_datetime(datetime_match)
  621. except ValueError as e:
  622. raise TOMLDecodeError("Invalid date or datetime", src, pos) from e
  623. return datetime_match.end(), datetime_obj
  624. localtime_match = RE_LOCALTIME.match(src, pos)
  625. if localtime_match:
  626. return localtime_match.end(), match_to_localtime(localtime_match)
  627. # Integers and "normal" floats.
  628. # The regex will greedily match any type starting with a decimal
  629. # char, so needs to be located after handling of dates and times.
  630. number_match = RE_NUMBER.match(src, pos)
  631. if number_match:
  632. return number_match.end(), match_to_number(number_match, parse_float)
  633. # Special floats
  634. first_three = src[pos : pos + 3]
  635. if first_three in {"inf", "nan"}:
  636. return pos + 3, parse_float(first_three)
  637. first_four = src[pos : pos + 4]
  638. if first_four in {"-inf", "+inf", "-nan", "+nan"}:
  639. return pos + 4, parse_float(first_four)
  640. raise TOMLDecodeError("Invalid value", src, pos)
  641. def is_unicode_scalar_value(codepoint: int) -> bool:
  642. return (0 <= codepoint <= 55295) or (57344 <= codepoint <= 1114111)
  643. def make_safe_parse_float(parse_float: ParseFloat) -> ParseFloat:
  644. """A decorator to make `parse_float` safe.
  645. `parse_float` must not return dicts or lists, because these types
  646. would be mixed with parsed TOML tables and arrays, thus confusing
  647. the parser. The returned decorated callable raises `ValueError`
  648. instead of returning illegal types.
  649. """
  650. # The default `float` callable never returns illegal types. Optimize it.
  651. if parse_float is float:
  652. return float
  653. def safe_parse_float(float_str: str) -> Any:
  654. float_value = parse_float(float_str)
  655. if isinstance(float_value, (dict, list)):
  656. raise ValueError("parse_float must not return dicts or lists")
  657. return float_value
  658. return safe_parse_float