validator.py 22 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666
  1. """Notebook format validators."""
  2. # Copyright (c) IPython Development Team.
  3. # Distributed under the terms of the Modified BSD License.
  4. from __future__ import annotations
  5. import json
  6. import pprint
  7. import warnings
  8. from copy import deepcopy
  9. from pathlib import Path
  10. from textwrap import dedent
  11. from typing import Any, Optional
  12. from ._imports import import_item
  13. from .corpus.words import generate_corpus_id
  14. from .json_compat import ValidationError, _validator_for_name, get_current_validator
  15. from .reader import get_version
  16. from .warnings import DuplicateCellId, MissingIDFieldWarning
  17. validators = {}
  18. _deprecated = object()
  19. __all__ = [
  20. "ValidationError",
  21. "get_validator",
  22. "isvalid",
  23. "NotebookValidationError",
  24. "better_validation_error",
  25. "normalize",
  26. "validate",
  27. "iter_validate",
  28. ]
  29. def _relax_additional_properties(obj):
  30. """relax any `additionalProperties`"""
  31. if isinstance(obj, dict):
  32. for key, value in obj.items():
  33. value = ( # noqa: PLW2901
  34. True if key == "additionalProperties" else _relax_additional_properties(value)
  35. )
  36. obj[key] = value
  37. elif isinstance(obj, list):
  38. for i, value in enumerate(obj):
  39. obj[i] = _relax_additional_properties(value)
  40. return obj
  41. def _allow_undefined(schema):
  42. schema["definitions"]["cell"]["oneOf"].append({"$ref": "#/definitions/unrecognized_cell"})
  43. schema["definitions"]["output"]["oneOf"].append({"$ref": "#/definitions/unrecognized_output"})
  44. return schema
  45. def get_validator(version=None, version_minor=None, relax_add_props=False, name=None):
  46. """Load the JSON schema into a Validator"""
  47. if version is None:
  48. from . import current_nbformat
  49. version = current_nbformat
  50. v = import_item("nbformat.v%s" % version)
  51. current_minor = getattr(v, "nbformat_minor", 0)
  52. if version_minor is None:
  53. version_minor = current_minor
  54. current_validator = _validator_for_name(name) if name else get_current_validator()
  55. version_tuple = (current_validator.name, version, version_minor)
  56. if version_tuple not in validators:
  57. try:
  58. schema_json = _get_schema_json(v, version=version, version_minor=version_minor)
  59. except AttributeError:
  60. return None
  61. if current_minor < version_minor:
  62. # notebook from the future, relax all `additionalProperties: False` requirements
  63. schema_json = _relax_additional_properties(schema_json)
  64. # and allow undefined cell types and outputs
  65. schema_json = _allow_undefined(schema_json)
  66. validators[version_tuple] = current_validator(schema_json)
  67. if relax_add_props:
  68. try:
  69. schema_json = _get_schema_json(v, version=version, version_minor=version_minor)
  70. except AttributeError:
  71. return None
  72. # this allows properties to be added for intermediate
  73. # representations while validating for all other kinds of errors
  74. schema_json = _relax_additional_properties(schema_json)
  75. validators[version_tuple] = current_validator(schema_json)
  76. return validators[version_tuple]
  77. def _get_schema_json(v, version=None, version_minor=None):
  78. """
  79. Gets the json schema from a given imported library and nbformat version.
  80. """
  81. if (version, version_minor) in v.nbformat_schema:
  82. schema_path = str(Path(v.__file__).parent / v.nbformat_schema[(version, version_minor)])
  83. elif version_minor > v.nbformat_minor:
  84. # load the latest schema
  85. schema_path = str(Path(v.__file__).parent / v.nbformat_schema[(None, None)])
  86. else:
  87. msg = "Cannot find appropriate nbformat schema file."
  88. raise AttributeError(msg)
  89. with Path(schema_path).open(encoding="utf8") as f:
  90. schema_json = json.load(f)
  91. return schema_json # noqa: RET504
  92. def isvalid(nbjson, ref=None, version=None, version_minor=None):
  93. """Checks whether the given notebook JSON conforms to the current
  94. notebook format schema. Returns True if the JSON is valid, and
  95. False otherwise.
  96. To see the individual errors that were encountered, please use the
  97. `validate` function instead.
  98. """
  99. orig = deepcopy(nbjson)
  100. try:
  101. with warnings.catch_warnings():
  102. warnings.filterwarnings("ignore", category=DeprecationWarning)
  103. warnings.filterwarnings("ignore", category=MissingIDFieldWarning)
  104. validate(nbjson, ref, version, version_minor, repair_duplicate_cell_ids=False)
  105. except ValidationError:
  106. return False
  107. else:
  108. return True
  109. finally:
  110. if nbjson != orig:
  111. raise AssertionError
  112. def _format_as_index(indices):
  113. """
  114. (from jsonschema._utils.format_as_index, copied to avoid relying on private API)
  115. Construct a single string containing indexing operations for the indices.
  116. For example, [1, 2, "foo"] -> [1][2]["foo"]
  117. """
  118. if not indices:
  119. return ""
  120. return "[%s]" % "][".join(repr(index) for index in indices)
  121. _ITEM_LIMIT = 16
  122. _STR_LIMIT = 64
  123. def _truncate_obj(obj):
  124. """Truncate objects for use in validation tracebacks
  125. Cell and output lists are squashed, as are long strings, lists, and dicts.
  126. """
  127. if isinstance(obj, dict):
  128. truncated_dict = {k: _truncate_obj(v) for k, v in list(obj.items())[:_ITEM_LIMIT]}
  129. if isinstance(truncated_dict.get("cells"), list):
  130. truncated_dict["cells"] = ["...%i cells..." % len(obj["cells"])]
  131. if isinstance(truncated_dict.get("outputs"), list):
  132. truncated_dict["outputs"] = ["...%i outputs..." % len(obj["outputs"])]
  133. if len(obj) > _ITEM_LIMIT:
  134. truncated_dict["..."] = "%i keys truncated" % (len(obj) - _ITEM_LIMIT)
  135. return truncated_dict
  136. if isinstance(obj, list):
  137. truncated_list = [_truncate_obj(item) for item in obj[:_ITEM_LIMIT]]
  138. if len(obj) > _ITEM_LIMIT:
  139. truncated_list.append("...%i items truncated..." % (len(obj) - _ITEM_LIMIT))
  140. return truncated_list
  141. if isinstance(obj, str):
  142. truncated_str = obj[:_STR_LIMIT]
  143. if len(obj) > _STR_LIMIT:
  144. truncated_str += "..."
  145. return truncated_str
  146. return obj
  147. class NotebookValidationError(ValidationError): # type:ignore[misc]
  148. """Schema ValidationError with truncated representation
  149. to avoid massive verbose tracebacks.
  150. """
  151. def __init__(self, original, ref=None):
  152. """Initialize the error class."""
  153. self.original = original
  154. self.ref = getattr(self.original, "ref", ref)
  155. self.message = self.original.message
  156. def __getattr__(self, key):
  157. """Get an attribute from the error."""
  158. return getattr(self.original, key)
  159. def __unicode__(self):
  160. """Custom str for validation errors
  161. avoids dumping full schema and notebook to logs
  162. """
  163. error = self.original
  164. instance = _truncate_obj(error.instance)
  165. return "\n".join(
  166. [
  167. error.message,
  168. "",
  169. "Failed validating {!r} in {}{}:".format(
  170. error.validator,
  171. self.ref or "notebook",
  172. _format_as_index(list(error.relative_schema_path)[:-1]),
  173. ),
  174. "",
  175. "On instance%s:" % _format_as_index(error.relative_path),
  176. pprint.pformat(instance, width=78),
  177. ]
  178. )
  179. __str__ = __unicode__
  180. def better_validation_error(error, version, version_minor):
  181. """Get better ValidationError on oneOf failures
  182. oneOf errors aren't informative.
  183. if it's a cell type or output_type error,
  184. try validating directly based on the type for a better error message
  185. """
  186. if not len(error.schema_path):
  187. return error
  188. key = error.schema_path[-1]
  189. ref = None
  190. if key.endswith("Of"):
  191. if isinstance(error.instance, dict):
  192. if "cell_type" in error.instance:
  193. ref = error.instance["cell_type"] + "_cell"
  194. elif "output_type" in error.instance:
  195. ref = error.instance["output_type"]
  196. if ref:
  197. try:
  198. validate(
  199. error.instance,
  200. ref,
  201. version=version,
  202. version_minor=version_minor,
  203. )
  204. except ValidationError as sub_error:
  205. # keep extending relative path
  206. error.relative_path.extend(sub_error.relative_path)
  207. sub_error.relative_path = error.relative_path
  208. better = better_validation_error(sub_error, version, version_minor)
  209. if better.ref is None:
  210. better.ref = ref
  211. return better
  212. except Exception: # noqa: S110
  213. # if it fails for some reason,
  214. # let the original error through
  215. pass
  216. return NotebookValidationError(error, ref)
  217. def normalize(
  218. nbdict: Any,
  219. version: Optional[int] = None,
  220. version_minor: Optional[int] = None,
  221. *,
  222. relax_add_props: bool = False,
  223. strip_invalid_metadata: bool = False,
  224. ) -> tuple[int, Any]:
  225. """
  226. Normalise a notebook prior to validation.
  227. This tries to implement a couple of normalisation steps to standardise
  228. notebooks and make validation easier.
  229. You should in general not rely on this function and make sure the notebooks
  230. that reach nbformat are already in a normal form. If not you likely have a bug,
  231. and may have security issues.
  232. Parameters
  233. ----------
  234. nbdict : dict
  235. notebook document
  236. version : int
  237. version_minor : int
  238. relax_add_props : bool
  239. Whether to allow extra property in the Json schema validating the
  240. notebook.
  241. strip_invalid_metadata : bool
  242. Whether to strip metadata that does not exist in the Json schema when
  243. validating the notebook.
  244. Returns
  245. -------
  246. changes : int
  247. number of changes in the notebooks
  248. notebook : dict
  249. deep-copy of the original object with relevant changes.
  250. """
  251. nbdict = deepcopy(nbdict)
  252. nbdict_version, nbdict_version_minor = get_version(nbdict)
  253. if version is None:
  254. version = nbdict_version
  255. if version_minor is None:
  256. version_minor = nbdict_version_minor
  257. return _normalize(
  258. nbdict,
  259. version,
  260. version_minor,
  261. True,
  262. relax_add_props=relax_add_props,
  263. strip_invalid_metadata=strip_invalid_metadata,
  264. )
  265. def _normalize(
  266. nbdict: Any,
  267. version: int,
  268. version_minor: int,
  269. repair_duplicate_cell_ids: bool,
  270. relax_add_props: bool,
  271. strip_invalid_metadata: bool,
  272. ) -> tuple[int, Any]:
  273. """
  274. Private normalisation routine.
  275. This function attempts to normalize the `nbdict` passed to it.
  276. As `_normalize()` is currently used both in `validate()` (for
  277. historical reasons), and in the `normalize()` public function,
  278. `_normalize()` does currently mutate `nbdict`.
  279. Ideally, once `validate()` stops calling `_normalize()`, `_normalize()`
  280. may stop mutating `nbdict`.
  281. """
  282. changes = 0
  283. if (version, version_minor) >= (4, 5):
  284. # if we support cell ids ensure default ids are provided
  285. for cell in nbdict["cells"]:
  286. if "id" not in cell:
  287. warnings.warn(
  288. "Cell is missing an id field, this will become"
  289. " a hard error in future nbformat versions. You may want"
  290. " to use `normalize()` on your notebooks before validations"
  291. " (available since nbformat 5.1.4). Previous versions of nbformat"
  292. " are fixing this issue transparently, and will stop doing so"
  293. " in the future.",
  294. MissingIDFieldWarning,
  295. stacklevel=3,
  296. )
  297. # Generate cell ids if any are missing
  298. if repair_duplicate_cell_ids:
  299. cell["id"] = generate_corpus_id()
  300. changes += 1
  301. # if we support cell ids check for uniqueness when validating the whole notebook
  302. seen_ids = set()
  303. for cell in nbdict["cells"]:
  304. if "id" not in cell:
  305. continue
  306. cell_id = cell["id"]
  307. if cell_id in seen_ids:
  308. # Best effort to repair if we find a duplicate id
  309. if repair_duplicate_cell_ids:
  310. new_id = generate_corpus_id()
  311. cell["id"] = new_id
  312. changes += 1
  313. warnings.warn(
  314. f"Non-unique cell id {cell_id!r} detected. Corrected to {new_id!r}.",
  315. DuplicateCellId,
  316. stacklevel=3,
  317. )
  318. else:
  319. msg = f"Non-unique cell id '{cell_id}' detected."
  320. raise ValidationError(msg)
  321. seen_ids.add(cell_id)
  322. if strip_invalid_metadata:
  323. changes += _strip_invalida_metadata(
  324. nbdict, version, version_minor, relax_add_props=relax_add_props
  325. )
  326. return changes, nbdict
  327. def _dep_warn(field):
  328. warnings.warn(
  329. dedent(
  330. f"""`{field}` kwargs of validate has been deprecated for security
  331. reasons, and will be removed soon.
  332. Please explicitly use the `n_changes, new_notebook = nbformat.validator.normalize(old_notebook, ...)` if you wish to
  333. normalise your notebook. `normalize` is available since nbformat 5.5.0
  334. """
  335. ),
  336. DeprecationWarning,
  337. stacklevel=3,
  338. )
  339. def validate(
  340. nbdict: Any = None,
  341. ref: Optional[str] = None,
  342. version: Optional[int] = None,
  343. version_minor: Optional[int] = None,
  344. relax_add_props: bool = False,
  345. nbjson: Any = None,
  346. repair_duplicate_cell_ids: bool = _deprecated, # type: ignore[assignment]
  347. strip_invalid_metadata: bool = _deprecated, # type: ignore[assignment]
  348. ) -> None:
  349. """Checks whether the given notebook dict-like object
  350. conforms to the relevant notebook format schema.
  351. Parameters
  352. ----------
  353. nbdict : dict
  354. notebook document
  355. ref : optional, str
  356. reference to the subset of the schema we want to validate against.
  357. for example ``"markdown_cell"``, `"code_cell"` ....
  358. version : int
  359. version_minor : int
  360. relax_add_props : bool
  361. Whether to allow extra properties in the JSON schema validating the notebook.
  362. When True, all known fields are validated, but unknown fields are ignored.
  363. nbjson
  364. repair_duplicate_cell_ids : bool
  365. Deprecated since 5.5.0 - will be removed in the future.
  366. strip_invalid_metadata : bool
  367. Deprecated since 5.5.0 - will be removed in the future.
  368. Returns
  369. -------
  370. None
  371. Raises
  372. ------
  373. ValidationError if not valid.
  374. Notes
  375. -----
  376. Prior to Nbformat 5.5.0 the `validate` and `isvalid` method would silently
  377. try to fix invalid notebook and mutate arguments. This behavior is deprecated
  378. and will be removed in a near future.
  379. Please explicitly call `normalize` if you need to normalize notebooks.
  380. """
  381. assert isinstance(ref, str) or ref is None
  382. if strip_invalid_metadata is _deprecated:
  383. strip_invalid_metadata = False
  384. else:
  385. _dep_warn("strip_invalid_metadata")
  386. if repair_duplicate_cell_ids is _deprecated:
  387. repair_duplicate_cell_ids = True
  388. else:
  389. _dep_warn("repair_duplicate_cell_ids")
  390. # backwards compatibility for nbjson argument
  391. if nbdict is not None:
  392. pass
  393. elif nbjson is not None:
  394. nbdict = nbjson
  395. else:
  396. msg = "validate() missing 1 required argument: 'nbdict'"
  397. raise TypeError(msg)
  398. if ref is None:
  399. # if ref is not specified, we have a whole notebook, so we can get the version
  400. nbdict_version, nbdict_version_minor = get_version(nbdict)
  401. if version is None:
  402. version = nbdict_version
  403. if version_minor is None:
  404. version_minor = nbdict_version_minor
  405. # if ref is specified, and we don't have a version number, assume we're validating against 1.0
  406. elif version is None:
  407. version, version_minor = 1, 0
  408. if ref is None:
  409. assert isinstance(version, int)
  410. assert isinstance(version_minor, int)
  411. _normalize(
  412. nbdict,
  413. version,
  414. version_minor,
  415. repair_duplicate_cell_ids,
  416. relax_add_props=relax_add_props,
  417. strip_invalid_metadata=strip_invalid_metadata,
  418. )
  419. for error in iter_validate(
  420. nbdict,
  421. ref=ref,
  422. version=version,
  423. version_minor=version_minor,
  424. relax_add_props=relax_add_props,
  425. strip_invalid_metadata=strip_invalid_metadata,
  426. ):
  427. raise error
  428. def _get_errors(
  429. nbdict: Any, version: int, version_minor: int, relax_add_props: bool, *args: Any
  430. ) -> Any:
  431. validator = get_validator(version, version_minor, relax_add_props=relax_add_props)
  432. if not validator:
  433. msg = f"No schema for validating v{version}.{version_minor} notebooks"
  434. raise ValidationError(msg)
  435. iter_errors = validator.iter_errors(nbdict, *args)
  436. errors = list(iter_errors)
  437. # jsonschema gives the best error messages.
  438. if len(errors) and validator.name != "jsonschema":
  439. validator = get_validator(
  440. version=version,
  441. version_minor=version_minor,
  442. relax_add_props=relax_add_props,
  443. name="jsonschema",
  444. )
  445. return validator.iter_errors(nbdict, *args)
  446. return iter(errors)
  447. def _strip_invalida_metadata(
  448. nbdict: Any, version: int, version_minor: int, relax_add_props: bool
  449. ) -> int:
  450. """
  451. This function tries to extract metadata errors from the validator and fix
  452. them if necessary. This mostly mean stripping unknown keys from metadata
  453. fields, or removing metadata fields altogether.
  454. Parameters
  455. ----------
  456. nbdict : dict
  457. notebook document
  458. version : int
  459. version_minor : int
  460. relax_add_props : bool
  461. Whether to allow extra property in the Json schema validating the
  462. notebook.
  463. Returns
  464. -------
  465. int
  466. number of modifications
  467. """
  468. errors = _get_errors(nbdict, version, version_minor, relax_add_props)
  469. changes = 0
  470. if len(list(errors)) > 0:
  471. # jsonschema gives a better error tree.
  472. validator = get_validator(
  473. version=version,
  474. version_minor=version_minor,
  475. relax_add_props=relax_add_props,
  476. name="jsonschema",
  477. )
  478. if not validator:
  479. msg = f"No jsonschema for validating v{version}.{version_minor} notebooks"
  480. raise ValidationError(msg)
  481. errors = validator.iter_errors(nbdict)
  482. error_tree = validator.error_tree(errors)
  483. if "metadata" in error_tree:
  484. for key in error_tree["metadata"]:
  485. nbdict["metadata"].pop(key, None)
  486. changes += 1
  487. if "cells" in error_tree:
  488. number_of_cells = len(nbdict.get("cells", 0))
  489. for cell_idx in range(number_of_cells):
  490. # Cells don't report individual metadata keys as having failed validation
  491. # Instead it reports that it failed to validate against each cell-type definition.
  492. # We have to delve into why those definitions failed to uncover which metadata
  493. # keys are misbehaving.
  494. if "oneOf" in error_tree["cells"][cell_idx].errors:
  495. intended_cell_type = nbdict["cells"][cell_idx]["cell_type"]
  496. schemas_by_index = [
  497. ref["$ref"]
  498. for ref in error_tree["cells"][cell_idx].errors["oneOf"].schema["oneOf"]
  499. ]
  500. cell_type_definition_name = f"#/definitions/{intended_cell_type}_cell"
  501. if cell_type_definition_name in schemas_by_index:
  502. schema_index = schemas_by_index.index(cell_type_definition_name)
  503. for error in error_tree["cells"][cell_idx].errors["oneOf"].context:
  504. rel_path = error.relative_path
  505. error_for_intended_schema = error.schema_path[0] == schema_index
  506. is_top_level_metadata_key = (
  507. len(rel_path) == 2 and rel_path[0] == "metadata"
  508. )
  509. if error_for_intended_schema and is_top_level_metadata_key:
  510. nbdict["cells"][cell_idx]["metadata"].pop(rel_path[1], None)
  511. changes += 1
  512. return changes
  513. def iter_validate(
  514. nbdict=None,
  515. ref=None,
  516. version=None,
  517. version_minor=None,
  518. relax_add_props=False,
  519. nbjson=None,
  520. strip_invalid_metadata=False,
  521. ):
  522. """Checks whether the given notebook dict-like object conforms to the
  523. relevant notebook format schema.
  524. Returns a generator of all ValidationErrors if not valid.
  525. Notes
  526. -----
  527. To fix: For security reasons, this function should *never* mutate its `nbdict` argument, and
  528. should *never* try to validate a mutated or modified version of its notebook.
  529. """
  530. # backwards compatibility for nbjson argument
  531. if nbdict is not None:
  532. pass
  533. elif nbjson is not None:
  534. nbdict = nbjson
  535. else:
  536. msg = "iter_validate() missing 1 required argument: 'nbdict'"
  537. raise TypeError(msg)
  538. if version is None:
  539. version, version_minor = get_version(nbdict)
  540. if ref:
  541. try:
  542. errors = _get_errors(
  543. nbdict,
  544. version,
  545. version_minor,
  546. relax_add_props,
  547. {"$ref": "#/definitions/%s" % ref},
  548. )
  549. except ValidationError as e:
  550. yield e
  551. return
  552. else:
  553. if strip_invalid_metadata:
  554. _strip_invalida_metadata(nbdict, version, version_minor, relax_add_props)
  555. # Validate one more time to ensure that us removing metadata
  556. # didn't cause another complex validation issue in the schema.
  557. # Also to ensure that higher-level errors produced by individual metadata validation
  558. # failures are removed.
  559. try:
  560. errors = _get_errors(nbdict, version, version_minor, relax_add_props)
  561. except ValidationError as e:
  562. yield e
  563. return
  564. for error in errors:
  565. yield better_validation_error(error, version, version_minor)