parse_object.py 17 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346
  1. from typing import TYPE_CHECKING, Any
  2. from .utils.constants import MISSING_VALUE, STRING_DELIMITERS, JSONReturnType
  3. from .utils.json_context import ContextValues
  4. from .utils.pattern_properties import match_pattern_properties
  5. if TYPE_CHECKING:
  6. from .json_parser import JSONParser
  7. from .schema_repair import SchemaRepairer
def parse_object(
    self: "JSONParser",
    schema: dict[str, Any] | bool | None = None,
    path: str = "$",
) -> JSONReturnType:
    """Parse the members of a JSON object, repairing malformed input on the way.

    Parsing starts at ``self.index`` and consumes ``<key> ':' <value>`` members
    until a ``}`` or the end of the input is reached.
    NOTE(review): the loop starts directly on the members, so the opening ``{``
    is presumably consumed by the caller -- confirm against the call site.

    Args:
        schema: Optional schema fragment (dict or bool). ``None`` and ``True``
            disable schema guidance. Otherwise the schema is resolved through
            ``self.schema_repairer`` (when one is set) and only used if the
            repairer reports it as an object schema.
        path: Location prefix used in error messages and repair log entries;
            child locations are built as ``f"{path}.{key}"``.

    Returns:
        The parsed ``dict``. In non-strict mode, when no member was collected
        although more than two characters were consumed, the result of
        ``self.parse_array()`` re-run from the starting index is returned
        instead.

    Raises:
        ValueError: If the resolved schema is ``False``; if a ``properties`` or
            matched ``patternProperties`` entry is not a dict/bool/None; if
            required properties are missing when the object is finalized and
            the repairer is not in ``"salvage"`` mode; or, in strict mode, on
            an empty key, a duplicate key inside an array context, a missing
            ``:``, an empty value (no schema repairer active) or an empty
            object that still consumed extra characters.
    """
    # <object> ::= '{' [ <member> *(', ' <member>) ] '}' ; A sequence of 'members'
    obj: dict[str, JSONReturnType] = {}
    start_index = self.index
    # Remember whether this object is itself the value of a parent object's key;
    # this decides below whether a duplicate key may split the object in two.
    parsing_object_value = self.context.current == ContextValues.OBJECT_VALUE
    # Only activate schema-guided parsing if a repairer is available and schema looks object-like.
    schema_repairer: SchemaRepairer | None = None
    properties: dict[str, Any] = {}
    pattern_properties: dict[str, Any] = {}
    additional_properties: object | None = None
    required: set[str] = set()
    if schema is not None and schema is not True:
        repairer = self.schema_repairer
        if repairer is not None:
            schema = repairer.resolve_schema(schema)
            if schema is False:
                raise ValueError("Schema does not allow any values.")
            if schema is not True and repairer.is_object_schema(schema):
                schema_repairer = repairer
                # Malformed (non-dict) keyword values are treated as absent.
                properties = schema.get("properties", {})
                if not isinstance(properties, dict):
                    properties = {}
                pattern_properties = schema.get("patternProperties", {})
                if not isinstance(pattern_properties, dict):
                    pattern_properties = {}
                additional_properties = schema.get("additionalProperties", None)
                required = set(schema.get("required", []))

    def finalize_obj() -> dict[str, JSONReturnType]:
        """Apply schema post-processing to ``obj`` and return it.

        Without an active schema repairer ``obj`` is returned untouched.
        Otherwise missing required keys raise ``ValueError`` (unless the
        repairer is in ``"salvage"`` mode), and every non-required entry of
        ``properties`` that is absent from ``obj`` and whose schema is a dict
        with a ``"default"`` gets a copy of that default inserted.
        """
        if schema_repairer is None:
            return obj
        # Local alias of the closed-over repairer (already checked for None above).
        schema_repairer_local = schema_repairer
        # In salvage mode defer required-field enforcement to SchemaRepairer._repair_object,
        # so salvage-only required fills can run in one place.
        missing_required = [key for key in required if key not in obj]
        if missing_required and schema_repairer_local.schema_repair_mode != "salvage":
            raise ValueError(f"Missing required properties at {path}: {', '.join(missing_required)}")
        for key, prop_schema in properties.items():
            # Present keys keep their parsed value; required keys are never defaulted here.
            if key in obj or key in required:
                continue
            if isinstance(prop_schema, dict) and "default" in prop_schema:
                obj[key] = schema_repairer_local._copy_json_value(prop_schema["default"], f"{path}.{key}", "default")
                schema_repairer_local._log("Inserted default value for missing property", f"{path}.{key}")
        return obj

    # Stop when you either find the closing brace or you have iterated over the entire string
    while (self.get_char_at() or "}") != "}":
        # This is what we expect to find:
        # <member> ::= <string> ': ' <json>
        # Skip filler whitespaces
        self.skip_whitespaces()
        # Sometimes LLMs do weird things: if we find a ":" this early, log it, skip it and move on
        if self.get_char_at() == ":":
            self.log(
                "While parsing an object we found a : before a key, ignoring",
            )
            self.index += 1
        # We are now searching for the string key
        # Context is used in the string parser to manage the lack of quotes
        self.context.set(ContextValues.OBJECT_KEY)
        # Save this index in case we find a duplicate key and need to roll back
        rollback_index = self.index
        # <member> starts with a <string>
        key = ""
        while self.get_char_at():
            # The rollback index needs to be updated here in case the key is empty
            rollback_index = self.index
            if self.get_char_at() == "[" and key == "":
                # Is this an array?
                # Need to check if the previous parsed value contained in obj is an array and in that case parse and merge the two
                prev_key = list(obj.keys())[-1] if obj else None
                if prev_key and isinstance(obj[prev_key], list) and not self.strict:
                    # If the previous key's value is an array, parse the new array and merge
                    self.index += 1
                    new_array = self.parse_array()
                    if isinstance(new_array, list):
                        # Merge and flatten the arrays
                        prev_value = obj[prev_key]
                        if isinstance(prev_value, list):
                            # Lengths of the nested lists already stored; a single shared
                            # non-zero length marks the previous value as a table of rows.
                            list_lengths = [len(item) for item in prev_value if isinstance(item, list)]
                            expected_len = (
                                list_lengths[0]
                                if list_lengths and all(length == list_lengths[0] for length in list_lengths)
                                else None
                            )
                            if expected_len:
                                # Matrix-style JSON: list of uniform-length rows.
                                # Repair a missing inner "[" by regrouping trailing scalar cells into rows.
                                tail = []
                                while prev_value and not isinstance(prev_value[-1], list):
                                    tail.append(prev_value.pop())
                                if tail:
                                    # Cells were popped back-to-front; restore input order.
                                    tail.reverse()
                                    if len(tail) % expected_len == 0:
                                        self.log(
                                            "While parsing an object we found row values without an inner array, grouping them into rows",
                                        )
                                        for i in range(0, len(tail), expected_len):
                                            prev_value.append(tail[i : i + expected_len])
                                    else:
                                        # Cannot be split into whole rows: put the cells back as they were.
                                        prev_value.extend(tail)
                                # Keep incoming rows as rows instead of flattening them into the table.
                                if new_array:
                                    if all(isinstance(item, list) for item in new_array):
                                        self.log(
                                            "While parsing an object we found additional rows, appending them without flattening",
                                        )
                                        prev_value.extend(new_array)
                                    else:
                                        prev_value.append(new_array)
                            else:
                                # Fallback to legacy merge behavior when the previous value is not a uniform row list:
                                # unwrap a single nested list, otherwise extend with the new items as-is.
                                prev_value.extend(
                                    new_array[0]
                                    if len(new_array) == 1 and isinstance(new_array[0], list)
                                    else new_array
                                )
                    # Consume an optional separator after the merged array, then look for the next key.
                    self.skip_whitespaces()
                    if self.get_char_at() == ",":
                        self.index += 1
                    self.skip_whitespaces()
                    continue
            raw_key = self.parse_string()
            assert isinstance(raw_key, str)
            key = raw_key
            if key == "":
                self.skip_whitespaces()
            # Accept a non-empty key, or an empty one that is directly followed by ':' or '}'.
            if key != "" or (key == "" and self.get_char_at() in [":", "}"]):
                # Empty keys now trigger in strict mode, otherwise we keep repairing as before
                if key == "" and self.strict:
                    self.log(
                        "Empty key found in strict mode while parsing object, raising an error",
                    )
                    raise ValueError("Empty key found in strict mode while parsing object.")
                break
        # A repeated key while somewhere inside an array may mean a missing object boundary.
        if ContextValues.ARRAY in self.context.context and key in obj:
            if self.strict:
                self.log("Duplicate key found in strict mode while parsing object, raising an error")
                raise ValueError("Duplicate key found in strict mode while parsing object.")
            # Only split objects on duplicates when this object started as a direct array item.
            # Nested object values should keep standard duplicate-key overwrite behavior.
            if not parsing_object_value:
                # Keep regular duplicate-key overwrite only for clearly valid duplicate members.
                # If key parsing started from malformed prefix characters, or there's no ':' after the key,
                # treat this as a likely missing object boundary and split.
                # NOTE(review): the arguments to get_char_at below look like offsets relative to
                # self.index (cf. get_char_at(-1) further down) -- confirm against its definition.
                lookback_idx = rollback_index - self.index - 1
                prev_non_whitespace = self.get_char_at(lookback_idx)
                while prev_non_whitespace and prev_non_whitespace.isspace():
                    lookback_idx -= 1
                    prev_non_whitespace = self.get_char_at(lookback_idx)
                key_start_char = self.get_char_at(rollback_index - self.index)
                next_non_whitespace = self.get_char_at(self.scroll_whitespaces())
                # "Normal" means: quoted key, preceded by a comma, followed by a colon.
                is_normal_duplicate_member = (
                    key_start_char in STRING_DELIMITERS and prev_non_whitespace == "," and next_non_whitespace == ":"
                )
                if is_normal_duplicate_member:
                    self.log(
                        "While parsing an object we found a duplicate key with a normal comma separator, keeping duplicate-key overwrite behavior",
                    )
                else:
                    self.log(
                        "While parsing an object we found a duplicate key, closing the object here and rolling back the index",
                    )
                    self.index = rollback_index - 1
                    # add an opening curly brace to make this work: it is inserted at rollback_index,
                    # right where the duplicate key started
                    self.json_str = self.json_str[: self.index + 1] + "{" + self.json_str[self.index + 1 :]
                    break
        # Skip filler whitespaces
        self.skip_whitespaces()
        # We reached the end here; the loop condition will terminate the loop
        if (self.get_char_at() or "}") == "}":
            continue
        self.skip_whitespaces()
        # An extreme case of missing ":" after a key
        if self.get_char_at() != ":":
            if self.strict:
                self.log(
                    "Missing ':' after key in strict mode while parsing object, raising an error",
                )
                raise ValueError("Missing ':' after key in strict mode while parsing object.")
            self.log(
                "While parsing an object we missed a : after a key",
            )
        # Advance one character (the ':' when it was present) and switch from key to value context.
        self.index += 1
        self.context.reset()
        self.context.set(ContextValues.OBJECT_VALUE)
        # The value can be any valid json; strict mode will refuse repaired empties
        self.skip_whitespaces()
        # Default used for the corner case of a lone comma (no value present)
        value: JSONReturnType = ""
        prop_schema: dict[str, Any] | bool | None = None
        extra_schemas: list[dict[str, Any] | bool | None] = []
        drop_property = False
        if schema_repairer is not None:
            # Pick the schema for this key: properties first, then patternProperties,
            # then additionalProperties.
            if key in properties:
                schema_value = properties[key]
                # Schema entries must be dict/bool; reject invalid metadata early.
                if schema_value is not None and not isinstance(schema_value, (dict, bool)):
                    raise ValueError("Schema must be an object.")
                prop_schema = schema_value
            else:
                matched: list[Any] = []
                unsupported_patterns: list[str] = []
                if pattern_properties:
                    matched, unsupported_patterns = match_pattern_properties(pattern_properties, key)
                for pattern in unsupported_patterns:
                    self.log(
                        f"Skipped unsupported patternProperties regex '{pattern}' while parsing object key '{key}'",
                    )
                if matched:
                    # patternProperties can stack: apply the first schema, then any extras in order.
                    primary_schema = matched[0]
                    if primary_schema is not None and not isinstance(primary_schema, (dict, bool)):
                        raise ValueError("Schema must be an object.")
                    prop_schema = primary_schema
                    for extra_schema in matched[1:]:
                        if extra_schema is not None and not isinstance(extra_schema, (dict, bool)):
                            raise ValueError("Schema must be an object.")
                        extra_schemas.append(extra_schema)
                else:
                    if additional_properties is False:
                        # Schema forbids unknown keys: parse but drop this property.
                        drop_property = True
                    elif isinstance(additional_properties, dict):
                        prop_schema = additional_properties
                    else:
                        # No usable additionalProperties constraint: accept any value.
                        prop_schema = True
        char = self.get_char_at()
        key_path = f"{path}.{key}"
        if char in [",", "}"]:
            self.log(
                f"While parsing an object value we found a stray {char}, ignoring it",
            )
            if schema_repairer is not None:
                # Missing value: fill according to schema (defaults/const/enum/type).
                value = schema_repairer.repair_value(MISSING_VALUE, prop_schema, key_path)
        else:
            # Schema-aware parsing guides repairs inside nested values.
            value = self.parse_json(prop_schema, key_path) if schema_repairer is not None else self.parse_json()
        if schema_repairer is not None and extra_schemas:
            # Apply any additional pattern schemas in order.
            for extra_schema in extra_schemas:
                value = schema_repairer.repair_value(value, extra_schema, key_path)
        # Without a schema, an empty value that does not end on a string delimiter
        # (i.e. is not an explicit empty string) is an error in strict mode.
        if schema_repairer is None and value == "" and self.strict and self.get_char_at(-1) not in STRING_DELIMITERS:
            self.log(
                "Parsed value is empty in strict mode while parsing object, raising an error",
            )
            raise ValueError("Parsed value is empty in strict mode while parsing object.")
        # Reset context since our job is done
        self.context.reset()
        if schema_repairer is None or not drop_property:
            obj[key] = value
        else:
            # Keep parsing but omit forbidden properties to respect the schema.
            schema_repairer._log("Dropped extra property not covered by schema", key_path)
        # Step over the member separator, or over a stray quote character left after the value.
        if self.get_char_at() in [",", "'", '"']:
            self.index += 1
        if self.get_char_at() == "]" and ContextValues.ARRAY in self.context.context:
            self.log(
                "While parsing an object we found a closing array bracket, closing the object here and rolling back the index"
            )
            # Step back one position; the unconditional increment after the loop
            # then leaves the index on the "]" again.
            self.index -= 1
            break
        # Remove trailing spaces
        self.skip_whitespaces()
    # Unconditionally advance one position past where the loop stopped (the closing "}" when present).
    self.index += 1
    # If the object is empty but also isn't just {}
    # NOTE(review): obj also stays empty when every parsed key was dropped because of
    # additionalProperties=False -- confirm the array fallback is wanted in that case.
    if not obj and self.index - start_index > 2:
        if self.strict:
            self.log(
                "Parsed object is empty but contains extra characters in strict mode, raising an error",
            )
            raise ValueError("Parsed object is empty but contains extra characters in strict mode.")
        self.log("Parsed object is empty, we will try to parse this as an array instead")
        self.index = start_index
        return self.parse_array()
    # Check if there are more key-value pairs after the closing brace
    # This handles cases like '{"key": "value"}, "key2": "value2"}'
    # But only if we're not in a nested context
    if not self.context.empty:
        # Sometimes there could be an extra closing brace that closes the object twice
        # So we check the context to see if the next one in the stack is an object or not
        # If not we skip it
        if self.get_char_at() == "}" and self.context.current not in [
            ContextValues.OBJECT_KEY,
            ContextValues.OBJECT_VALUE,
        ]:
            self.log(
                "Found an extra closing brace that shouldn't be there, skipping it",
            )
            self.index += 1
        # NOTE(review): this path returns obj directly instead of finalize_obj(), so the
        # required-property check and default insertion are not run here -- confirm
        # that this is intentional for nested objects.
        return obj
    self.skip_whitespaces()
    if self.get_char_at() != ",":
        return finalize_obj()
    self.index += 1
    self.skip_whitespaces()
    if self.get_char_at() not in STRING_DELIMITERS:
        return finalize_obj()
    if not self.strict:
        self.log(
            "Found a comma and string delimiter after object closing brace, checking for additional key-value pairs",
        )
        # NOTE(review): the trailing fragment is parsed with the same schema, so its own
        # finalization may see required keys as missing even though they are present in
        # obj -- verify against SchemaRepairer behavior.
        additional_obj = self.parse_object(schema, path)
        if isinstance(additional_obj, dict):
            obj.update(additional_obj)
    return finalize_obj()
  314. if isinstance(additional_obj, dict):
  315. obj.update(additional_obj)
  316. return finalize_obj()