# parse_string.py (27 KB)
  1. from typing import TYPE_CHECKING
  2. from .parse_string_helpers.parse_boolean_or_null import parse_boolean_or_null
  3. from .parse_string_helpers.parse_json_llm_block import parse_json_llm_block
  4. from .utils.constants import STRING_DELIMITERS, JSONReturnType
  5. from .utils.json_context import ContextValues
  6. if TYPE_CHECKING:
  7. from .json_parser import JSONParser
  8. def _try_parse_simple_quoted_string(self: "JSONParser") -> str | None:
  9. if self.get_char_at() != '"':
  10. return None
  11. start = self.index + 1
  12. json_str = self.json_str
  13. if isinstance(json_str, str):
  14. end = json_str.find('"', start)
  15. if end == -1:
  16. return None
  17. value = json_str[start:end]
  18. if "\\" in value or "\n" in value or "\r" in value:
  19. return None
  20. else:
  21. end = start
  22. limit = len(json_str)
  23. while end < limit:
  24. char = json_str[end]
  25. if char == '"':
  26. break
  27. if char in {"\\", "\n", "\r"}:
  28. return None
  29. end += 1
  30. if end >= limit:
  31. return None
  32. value = json_str[start:end]
  33. next_index = end + 1
  34. limit = len(json_str)
  35. while next_index < limit and self.json_str[next_index].isspace():
  36. next_index += 1
  37. next_char = self.json_str[next_index] if next_index < limit else None
  38. current_context = self.context.current
  39. if current_context == ContextValues.OBJECT_KEY:
  40. if next_char != ":":
  41. return None
  42. elif current_context == ContextValues.OBJECT_VALUE:
  43. if next_char not in {",", "}", None}:
  44. return None
  45. elif current_context == ContextValues.ARRAY:
  46. if next_char not in {",", "]", None}:
  47. return None
  48. elif next_char is not None:
  49. return None
  50. self.index = end + 1
  51. return value
def parse_string(self: "JSONParser") -> JSONReturnType:
    """Parse a string token at the parser cursor, repairing common quoting mistakes.

    Besides ordinary quoted strings this routine handles, in order:

    * a leading ``#`` or ``/``, delegated to ``self.parse_comment()``;
    * plain double-quoted strings, via ``_try_parse_simple_quoted_string``;
    * unquoted literals; those starting with ``t``/``f``/``n`` outside
      object-key context are first offered to ``parse_boolean_or_null``;
    * a backtick right after the opening delimiter, offered to
      ``parse_json_llm_block``;
    * doubled quotes, stray or unnecessary escapes, misplaced quotes inside the
      string, and missing opening or closing quotes.

    The cursor (``self.index``) is advanced as characters are consumed and
    repairs are reported through ``self.log``.

    Returns:
        The value produced by whichever delegate applied; ``""`` when the input
        ends before a string starts or an empty / doubled-quote string is
        detected; otherwise the accumulated string.

    Raises:
        ValueError: if ``self.strict`` is set and doubled quotes are followed
            by another quote.
    """
    # Utility function to append a character to the accumulator and update the index
    def _append_literal_char(acc: str, current_char: str) -> tuple[str, str | None]:
        acc += current_char
        self.index += 1
        char = self.get_char_at()
        return acc, char

    # <string> is a string of valid characters enclosed in quotes
    # i.e. { name: "John" }
    # Somehow all weird cases in an invalid JSON happen to be resolved in this function, so be careful here

    # Flag to manage corner cases related to missing starting quote
    missing_quotes = False
    doubled_quotes = False
    lstring_delimiter = rstring_delimiter = '"'

    char = self.get_char_at()
    if char in ["#", "/"]:
        return self.parse_comment()
    # A valid string can only start with a valid quote or, in our case, with a literal
    while char and char not in STRING_DELIMITERS and not char.isalnum():
        self.index += 1
        char = self.get_char_at()

    if not char:
        # This is an empty string
        return ""

    # Most benchmark strings are ordinary quoted values; keep a narrow fast path for them
    # and let the slower repair logic handle anything ambiguous or escaped.
    fast_path_value = _try_parse_simple_quoted_string(self)
    if fast_path_value is not None:
        return fast_path_value

    # Ensuring we use the right delimiter
    if char == "'":
        lstring_delimiter = rstring_delimiter = "'"
    elif char == "“":
        lstring_delimiter = "“"
        rstring_delimiter = "”"
    elif char.isalnum():
        # This could be a <boolean> and not a string. Because (T)rue or (F)alse or (N)ull are valid
        # But remember, object keys are only of type string
        if char.lower() in ["t", "f", "n"] and self.context.current != ContextValues.OBJECT_KEY:
            value = parse_boolean_or_null(self)
            if value != "":
                return value
        self.log(
            "While parsing a string, we found a literal instead of a quote",
        )
        missing_quotes = True

    # Step over the opening delimiter; an unquoted literal has none to skip
    if not missing_quotes:
        self.index += 1
    if self.get_char_at() == "`":
        ret_val = parse_json_llm_block(self)
        # If we found a valid JSON block, return it, otherwise continue parsing the string
        if ret_val is not False:
            return ret_val
        self.log(
            "While parsing a string, we found code fences but they did not enclose valid JSON, continuing parsing the string",
        )
    # There is sometimes a weird case of doubled quotes, we manage this also later in the while loop
    if self.get_char_at() == lstring_delimiter:
        # If it's an empty key, this was easy
        if (
            (self.context.current == ContextValues.OBJECT_KEY and self.get_char_at(1) == ":")
            or (self.context.current == ContextValues.OBJECT_VALUE and self.get_char_at(1) in [",", "}"])
            or (self.context.current == ContextValues.ARRAY and self.get_char_at(1) in [",", "]"])
        ):
            self.index += 1
            return ""
        if self.get_char_at(1) == lstring_delimiter:
            # There's something fishy about this, we found doubled quotes and then again quotes
            self.log(
                "While parsing a string, we found a doubled quote and then a quote again, ignoring it",
            )
            if self.strict:
                raise ValueError("Found doubled quotes followed by another quote.")
            return ""
        # Find the next delimiter
        i = self.skip_to_character(character=rstring_delimiter, idx=1)
        next_c = self.get_char_at(i)
        # Now check that the next character is also a delimiter to ensure that we have "".....""
        # In that case we ignore this rstring delimiter
        if self.get_char_at(i + 1) == rstring_delimiter:
            self.log(
                "While parsing a string, we found a valid starting doubled quote",
            )
            doubled_quotes = True
            self.index += 1
        else:
            # Ok this is not a doubled quote, check if this is an empty string or not
            i = self.scroll_whitespaces(idx=1)
            next_c = self.get_char_at(i)
            if next_c in [*STRING_DELIMITERS, "{", "["]:
                # something fishy is going on here
                self.log(
                    "While parsing a string, we found a doubled quote but also another quote afterwards, ignoring it",
                )
                if self.strict:
                    raise ValueError(
                        "Found doubled quotes followed by another quote while parsing a string.",
                    )
                self.index += 1
                return ""
            if next_c not in [",", "]", "}"]:
                self.log(
                    "While parsing a string, we found a doubled quote but it was a mistake, removing one quote",
                )
                self.index += 1

    # Initialize our return value
    string_acc = ""

    # Here things get a bit hairy because a string missing the final quote can also be a key or a value in an object
    # In that case we need to use the ":|,|}" characters as terminators of the string
    # So this will stop if:
    # * It finds a closing quote
    # * It iterated over the entire sequence
    # * If we are fixing missing quotes in an object, when it finds the special terminators
    char = self.get_char_at()
    # Set when a misplaced quote was kept as literal text; the next quote is then kept as its partner
    unmatched_delimiter = False
    while char and char != rstring_delimiter:
        if missing_quotes:
            if self.context.current == ContextValues.OBJECT_KEY and (char == ":" or char.isspace()):
                self.log(
                    "While parsing a string missing the left delimiter in object key context, we found a :, stopping here",
                )
                break
            if self.context.current == ContextValues.ARRAY and char in ["]", ","]:
                self.log(
                    "While parsing a string missing the left delimiter in array context, we found a ] or ,, stopping here",
                )
                break
        if (
            not self.stream_stable
            and self.context.current == ContextValues.OBJECT_VALUE
            and char
            in [
                ",",
                "}",
            ]
            and (not string_acc or string_acc[-1] != rstring_delimiter)
        ):
            rstring_delimiter_missing = True
            # check if this is a case in which the closing comma is NOT missing instead
            self.skip_whitespaces()
            if self.get_char_at(1) == "\\":
                # Ok this is a quoted string, skip
                rstring_delimiter_missing = False
            i = self.skip_to_character(character=rstring_delimiter, idx=1)
            next_c = self.get_char_at(i)
            if next_c:
                i += 1
                # found a delimiter, now we need to check that is followed strictly by a comma or brace
                # or the string ended
                i = self.scroll_whitespaces(idx=i)
                next_c = self.get_char_at(i)
                if not next_c or next_c in [",", "}"]:
                    rstring_delimiter_missing = False
                else:
                    # OK but this could still be some garbage at the end of the string
                    # So we need to check if we find a new lstring_delimiter afterwards
                    # If we do, maybe this is a missing delimiter
                    i = self.skip_to_character(character=lstring_delimiter, idx=i)
                    next_c = self.get_char_at(i)
                    if not next_c:
                        rstring_delimiter_missing = False
                    else:
                        # But again, this could just be something a bit stupid like "lorem, "ipsum" sic"
                        # Check if we find a : afterwards (skipping space)
                        i = self.scroll_whitespaces(idx=i + 1)
                        next_c = self.get_char_at(i)
                        if next_c and next_c != ":":
                            rstring_delimiter_missing = False
            else:
                # There could be a case in which even the next key:value is missing delimeters
                # because it might be a systemic issue with the output
                # So let's check if we can find a : in the string instead
                i = self.skip_to_character(character=":", idx=1)
                next_c = self.get_char_at(i)
                if next_c:
                    # OK then this is a systemic issue with the output
                    break
                # skip any whitespace first
                i = self.scroll_whitespaces(idx=1)
                # We couldn't find any rstring_delimeter before the end of the string
                # check if this is the last string of an object and therefore we can keep going
                # make an exception if this is the last char before the closing brace
                j = self.skip_to_character(character="}", idx=i)
                if j - i > 1:
                    # Ok it's not right after the comma
                    # Let's ignore
                    rstring_delimiter_missing = False
                # Check that j was not out of bound
                elif self.get_char_at(j):
                    # Check for an unmatched opening brace in string_acc
                    for c in reversed(string_acc):
                        if c == "{":
                            # Ok then this is part of the string
                            rstring_delimiter_missing = False
                            break
            if rstring_delimiter_missing:
                self.log(
                    "While parsing a string missing the left delimiter in object value context, we found a , or } and we couldn't determine that a right delimiter was present. Stopping here",
                )
                break
        if (
            not self.stream_stable
            and char == "]"
            and ContextValues.ARRAY in self.context.context
            and (not string_acc or string_acc[-1] != rstring_delimiter)
        ):
            # We found the end of an array and we are in array context
            # So let's check if we find a rstring_delimiter forward otherwise end early
            i = self.skip_to_character(rstring_delimiter)
            if not self.get_char_at(i):
                # No delimiter found
                break
        if self.context.current == ContextValues.OBJECT_VALUE and char == "}":
            # We found the end of an object while parsing a value
            # Check if the object is really over, to avoid doubling the closing brace
            i = self.scroll_whitespaces(idx=1)
            next_c = self.get_char_at(i)
            if next_c == "`" and self.get_char_at(i + 1) == "`" and self.get_char_at(i + 2) == "`":
                # This could be a special case in which the LLM added code fences after the object
                # So we need to check if there are another two ` after this one`
                self.log(
                    "While parsing a string in object value context, we found a } that closes the object before code fences, stopping here",
                )
                break
            if not next_c:
                self.log(
                    "While parsing a string in object value context, we found a } that closes the object, stopping here",
                )
                break
        # No terminator matched: the character belongs to the string
        string_acc += char
        self.index += 1
        char = self.get_char_at()
        if char is None:
            # Unclosed string ends with a \ character. This character is ignored if stream_stable = True.
            if self.stream_stable and string_acc and string_acc[-1] == "\\":
                string_acc = string_acc[:-1]
            break
        if string_acc and string_acc[-1] == "\\":
            # This is a special case, if people use real strings this might happen
            self.log("Found a stray escape sequence, normalizing it")
            if char in [rstring_delimiter, "t", "n", "r", "b", "\\"]:
                string_acc = string_acc[:-1]
                escape_seqs = {"t": "\t", "n": "\n", "r": "\r", "b": "\b"}
                string_acc += escape_seqs.get(char, char)
                self.index += 1
                char = self.get_char_at()
                while char and string_acc and string_acc[-1] == "\\" and char in [rstring_delimiter, "\\"]:
                    # this is a bit of a special case, if I don't do this it will close the loop or create a train of \\
                    # I don't love it though
                    string_acc = string_acc[:-1] + char
                    self.index += 1
                    char = self.get_char_at()
                continue
            if char in ["u", "x"]:
                # If we find a unicode escape sequence, normalize it
                num_chars = 4 if char == "u" else 2
                next_chars = self.json_str[self.index + 1 : self.index + 1 + num_chars]
                if len(next_chars) == num_chars and all(c in "0123456789abcdefABCDEF" for c in next_chars):
                    self.log("Found a unicode escape sequence, normalizing it")
                    string_acc = string_acc[:-1] + chr(int(next_chars, 16))
                    self.index += 1 + num_chars
                    char = self.get_char_at()
                    continue
            elif char in STRING_DELIMITERS and char != rstring_delimiter:
                self.log("Found a delimiter that was escaped but shouldn't be escaped, removing the escape")
                string_acc = string_acc[:-1] + char
                self.index += 1
                char = self.get_char_at()
                continue
        # If we are in object key context and we find a colon, it could be a missing right quote
        if char == ":" and not missing_quotes and self.context.current == ContextValues.OBJECT_KEY:
            # Ok now we need to check if this is followed by a value like "..."
            i = self.skip_to_character(character=lstring_delimiter, idx=1)
            next_c = self.get_char_at(i)
            if next_c:
                i += 1
                # found the first delimiter
                i = self.skip_to_character(character=rstring_delimiter, idx=i)
                next_c = self.get_char_at(i)
                if next_c:
                    # found a second delimiter
                    i += 1
                    # Skip spaces
                    i = self.scroll_whitespaces(idx=i)
                    ch = self.get_char_at(i)
                    if ch in [",", "}"]:
                        # Ok then this is a missing right quote
                        self.log(
                            f"While parsing a string missing the right delimiter in object key context, we found a {ch} stopping here",
                        )
                        break
            else:
                # The string ended without finding a lstring_delimiter, I will assume this is a missing right quote
                self.log(
                    "While parsing a string missing the right delimiter in object key context, we found a :, stopping here",
                )
                break
        # ChatGPT sometimes forget to quote stuff in html tags or markdown, so we do this whole thing here
        if char == rstring_delimiter and string_acc and string_acc[-1] != "\\":
            # Special case here, in case of double quotes one after another
            if doubled_quotes and self.get_char_at(1) == rstring_delimiter:
                self.log("While parsing a string, we found a doubled quote, ignoring it")
                self.index += 1
            elif missing_quotes and self.context.current == ContextValues.OBJECT_VALUE:
                # In case of missing starting quote I need to check if the delimeter is the end or the beginning of a key
                i = 1
                next_c = self.get_char_at(i)
                while next_c and next_c not in [
                    rstring_delimiter,
                    lstring_delimiter,
                ]:
                    i += 1
                    next_c = self.get_char_at(i)
                if next_c:
                    # We found a quote, now let's make sure there's a ":" following
                    i += 1
                    # found a delimiter, now we need to check that is followed strictly by a comma or brace
                    i = self.scroll_whitespaces(idx=i)
                    if self.get_char_at(i) == ":":
                        # Reset the cursor
                        self.index -= 1
                        char = self.get_char_at()
                        self.log(
                            "In a string with missing quotes and object value context, I found a delimeter but it turns out it was the beginning on the next key. Stopping here.",
                        )
                        break
            elif unmatched_delimiter:
                unmatched_delimiter = False
                string_acc, char = _append_literal_char(string_acc, char)
            else:
                # Check if eventually there is a rstring delimiter, otherwise we bail
                i = 1
                next_c = self.get_char_at(i)
                check_comma_in_object_value = True
                while next_c and next_c not in [
                    rstring_delimiter,
                    lstring_delimiter,
                ]:
                    # This is a bit of a weird workaround, essentially in object_value context we don't always break on commas
                    # This is because the routine after will make sure to correct any bad guess and this solves a corner case
                    if check_comma_in_object_value and next_c.isalpha():
                        check_comma_in_object_value = False
                    # If we are in an object context, let's check for the right delimiters
                    if (
                        (ContextValues.OBJECT_KEY in self.context.context and next_c in [":", "}"])
                        or (ContextValues.OBJECT_VALUE in self.context.context and next_c == "}")
                        or (ContextValues.ARRAY in self.context.context and next_c in ["]", ","])
                        or (
                            check_comma_in_object_value
                            and self.context.current == ContextValues.OBJECT_VALUE
                            and next_c == ","
                        )
                    ):
                        break
                    i += 1
                    next_c = self.get_char_at(i)
                # If we stopped for a comma in object_value context, let's check if find a "} at the end of the string
                if next_c == "," and self.context.current == ContextValues.OBJECT_VALUE:
                    i += 1
                    i = self.skip_to_character(character=rstring_delimiter, idx=i)
                    next_c = self.get_char_at(i)
                    # Ok now I found a delimiter, let's skip whitespaces and see if next we find a } or a ,
                    i += 1
                    i = self.scroll_whitespaces(idx=i)
                    next_c = self.get_char_at(i)
                    if next_c in ["}", ","]:
                        self.log(
                            "While parsing a string, we found a misplaced quote that would have closed the string but has a different meaning here, ignoring it",
                        )
                        string_acc, char = _append_literal_char(string_acc, char)
                        continue
                elif next_c == rstring_delimiter and self.get_char_at(i - 1) != "\\":
                    # Check if self.index:self.index+i is only whitespaces, break if that's the case
                    if _only_whitespace_until(self, i):
                        break
                    if self.context.current == ContextValues.OBJECT_VALUE:
                        i = self.scroll_whitespaces(idx=i + 1)
                        if self.get_char_at(i) == ",":
                            # So we found a comma, this could be a case of a single quote like "va"lue",
                            # Search if it's followed by another key, starting with the first delimeter
                            i = self.skip_to_character(character=lstring_delimiter, idx=i + 1)
                            i += 1
                            i = self.skip_to_character(character=rstring_delimiter, idx=i + 1)
                            i += 1
                            i = self.scroll_whitespaces(idx=i)
                            next_c = self.get_char_at(i)
                            if next_c == ":":
                                self.log(
                                    "While parsing a string, we found a misplaced quote that would have closed the string but has a different meaning here, ignoring it",
                                )
                                string_acc, char = _append_literal_char(string_acc, char)
                                continue
                        # We found a delimiter and we need to check if this is a key
                        # so find a rstring_delimiter and a colon after
                        i = self.skip_to_character(character=rstring_delimiter, idx=i + 1)
                        i += 1
                        next_c = self.get_char_at(i)
                        while next_c and next_c != ":":
                            if next_c in [",", "]", "}"] or (
                                next_c == rstring_delimiter and self.get_char_at(i - 1) != "\\"
                            ):
                                break
                            i += 1
                            next_c = self.get_char_at(i)
                        # Only if we fail to find a ':' then we know this is misplaced quote
                        if next_c != ":":
                            self.log(
                                "While parsing a string, we found a misplaced quote that would have closed the string but has a different meaning here, ignoring it",
                            )
                            unmatched_delimiter = not unmatched_delimiter
                            string_acc, char = _append_literal_char(string_acc, char)
                    elif self.context.current == ContextValues.ARRAY:
                        # So here we can have a few valid cases:
                        # ["bla bla bla "puppy" bla bla bla "kitty" bla bla"]
                        # ["value1" value2", "value3"]
                        # The basic idea is that if we find an even number of delimiters after this delimiter
                        # we ignore this delimiter as it should be fine
                        even_delimiters = next_c == rstring_delimiter
                        while next_c == rstring_delimiter:
                            i = self.skip_to_character(character=[rstring_delimiter, "]"], idx=i + 1)
                            next_c = self.get_char_at(i)
                            if next_c != rstring_delimiter:
                                even_delimiters = False
                                break
                            i = self.skip_to_character(character=[rstring_delimiter, "]"], idx=i + 1)
                            next_c = self.get_char_at(i)
                        if even_delimiters:
                            # If we got up to here it means that this is a situation like this:
                            # ["bla bla bla "puppy" bla bla bla "kitty" bla bla"]
                            # So we need to ignore this quote
                            self.log(
                                "While parsing a string in Array context, we detected a quoted section that would have closed the string but has a different meaning here, ignoring it",
                            )
                            unmatched_delimiter = not unmatched_delimiter
                            string_acc, char = _append_literal_char(string_acc, char)
                        else:
                            break
                    elif self.context.current == ContextValues.OBJECT_KEY:
                        # In this case we just ignore this and move on
                        self.log(
                            "While parsing a string in Object Key context, we detected a quoted section that would have closed the string but has a different meaning here, ignoring it",
                        )
                        string_acc, char = _append_literal_char(string_acc, char)
    if char and missing_quotes and self.context.current == ContextValues.OBJECT_KEY and char.isspace():
        self.log(
            "While parsing a string, handling an extreme corner case in which the LLM added a comment instead of valid string, invalidate the string and return an empty value",
        )
        self.skip_whitespaces()
        if self.get_char_at() not in [":", ","]:
            return ""

    # A fallout of the previous special case in the while loop,
    # we need to update the index only if we had a closing quote
    if char != rstring_delimiter:
        # if stream_stable = True, unclosed strings do not trim trailing whitespace characters
        if not self.stream_stable:
            self.log(
                "While parsing a string, we missed the closing quote, ignoring",
            )
            string_acc = string_acc.rstrip()
    else:
        self.index += 1

    if not self.stream_stable and (missing_quotes or (string_acc and string_acc[-1] == "\n")):
        # Clean the whitespaces for some corner cases
        string_acc = string_acc.rstrip()

    return string_acc
  517. def _only_whitespace_until(self: "JSONParser", end: int) -> bool:
  518. for j in range(1, end):
  519. c = self.get_char_at(j)
  520. if c is not None and not c.isspace():
  521. return False
  522. return True