sanitize.py 5.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179
  1. """
  2. NBConvert Preprocessor for sanitizing HTML rendering of notebooks.
  3. """
  4. import warnings
  5. from bleach import ALLOWED_ATTRIBUTES, ALLOWED_TAGS, clean
  6. from traitlets import Any, Bool, List, Set, Unicode
  7. from .base import Preprocessor
  8. _USE_BLEACH_CSS_SANITIZER = False
  9. _USE_BLEACH_STYLES = False
  10. try:
  11. # bleach[css] >=5.0
  12. from bleach.css_sanitizer import ALLOWED_CSS_PROPERTIES as ALLOWED_STYLES
  13. from bleach.css_sanitizer import CSSSanitizer
  14. _USE_BLEACH_CSS_SANITIZER = True
  15. _USE_BLEACH_STYLES = False
  16. except ImportError:
  17. try:
  18. # bleach <5
  19. from bleach import ALLOWED_STYLES # type:ignore[attr-defined, no-redef]
  20. _USE_BLEACH_CSS_SANITIZER = False
  21. _USE_BLEACH_STYLES = True
  22. warnings.warn(
  23. "Support for bleach <5 will be removed in a future version of nbconvert",
  24. DeprecationWarning,
  25. stacklevel=2,
  26. )
  27. except ImportError:
  28. warnings.warn(
  29. "The installed bleach/tinycss2 do not provide CSS sanitization, "
  30. "please upgrade to bleach >=5",
  31. UserWarning,
  32. stacklevel=2,
  33. )
  34. __all__ = ["SanitizeHTML"]
  35. class SanitizeHTML(Preprocessor):
  36. """A preprocessor to sanitize html."""
  37. # Bleach config.
  38. attributes = Any(
  39. config=True,
  40. default_value=ALLOWED_ATTRIBUTES,
  41. help="Allowed HTML tag attributes",
  42. )
  43. tags = List(
  44. Unicode(),
  45. config=True,
  46. default_value=ALLOWED_TAGS, # type:ignore[arg-type]
  47. help="List of HTML tags to allow",
  48. )
  49. styles = List(
  50. Unicode(),
  51. config=True,
  52. default_value=ALLOWED_STYLES, # type:ignore[arg-type]
  53. help="Allowed CSS styles if <style> tag is allowed",
  54. )
  55. strip = Bool(
  56. config=True,
  57. default_value=False,
  58. help="If True, remove unsafe markup entirely instead of escaping",
  59. )
  60. strip_comments = Bool(
  61. config=True,
  62. default_value=True,
  63. help="If True, strip comments from escaped HTML",
  64. )
  65. # Display data config.
  66. safe_output_keys = Set(
  67. config=True,
  68. default_value={
  69. "metadata", # Not a mimetype per-se, but expected and safe.
  70. "text/plain",
  71. "text/latex",
  72. "application/json",
  73. "image/png",
  74. "image/jpeg",
  75. },
  76. help="Cell output mimetypes to render without modification",
  77. )
  78. sanitized_output_types = Set(
  79. config=True,
  80. default_value={
  81. "text/html",
  82. "text/markdown",
  83. },
  84. help="Cell output types to display after escaping with Bleach.",
  85. )
  86. def preprocess_cell(self, cell, resources, cell_index):
  87. """
  88. Sanitize potentially-dangerous contents of the cell.
  89. Cell Types:
  90. raw:
  91. Sanitize literal HTML
  92. markdown:
  93. Sanitize literal HTML
  94. code:
  95. Sanitize outputs that could result in code execution
  96. """
  97. if cell.cell_type == "raw":
  98. # Sanitize all raw cells anyway.
  99. # Only ones with the text/html mimetype should be emitted
  100. # but erring on the side of safety maybe.
  101. cell.source = self.sanitize_html_tags(cell.source)
  102. return cell, resources
  103. if cell.cell_type == "markdown":
  104. cell.source = self.sanitize_html_tags(cell.source)
  105. return cell, resources
  106. if cell.cell_type == "code":
  107. cell.outputs = self.sanitize_code_outputs(cell.outputs)
  108. return cell, resources
  109. return None
  110. def sanitize_code_outputs(self, outputs):
  111. """
  112. Sanitize code cell outputs.
  113. Removes 'text/javascript' fields from display_data outputs, and
  114. runs `sanitize_html_tags` over 'text/html'.
  115. """
  116. for output in outputs:
  117. # These are always ascii, so nothing to escape.
  118. if output["output_type"] in ("stream", "error"):
  119. continue
  120. data = output.data
  121. to_remove = []
  122. for key in data:
  123. if key in self.safe_output_keys:
  124. continue
  125. if key in self.sanitized_output_types:
  126. self.log.info("Sanitizing %s", key)
  127. data[key] = self.sanitize_html_tags(data[key])
  128. else:
  129. # Mark key for removal. (Python doesn't allow deletion of
  130. # keys from a dict during iteration)
  131. to_remove.append(key)
  132. for key in to_remove:
  133. self.log.info("Removing %s", key)
  134. del data[key]
  135. return outputs
  136. def sanitize_html_tags(self, html_str):
  137. """
  138. Sanitize a string containing raw HTML tags.
  139. """
  140. kwargs = {
  141. "tags": self.tags,
  142. "attributes": self.attributes,
  143. "strip": self.strip,
  144. "strip_comments": self.strip_comments,
  145. }
  146. if _USE_BLEACH_CSS_SANITIZER:
  147. css_sanitizer = CSSSanitizer(allowed_css_properties=self.styles)
  148. kwargs.update(css_sanitizer=css_sanitizer)
  149. elif _USE_BLEACH_STYLES:
  150. kwargs.update(styles=self.styles)
  151. return clean(html_str, **kwargs)
  152. def _get_default_css_sanitizer():
  153. if _USE_BLEACH_CSS_SANITIZER:
  154. return CSSSanitizer(allowed_css_properties=ALLOWED_STYLES)
  155. return None