extractoutput.py 6.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150
  1. """A preprocessor that extracts all of the outputs from the
  2. notebook file. The extracted outputs are returned in the 'resources' dictionary.
  3. """
  4. # Copyright (c) IPython Development Team.
  5. # Distributed under the terms of the Modified BSD License.
  6. import json
  7. import os
  8. import sys
  9. from binascii import a2b_base64
  10. from mimetypes import guess_extension
  11. from textwrap import dedent
  12. from traitlets import Set, Unicode
  13. from .base import Preprocessor
  14. def guess_extension_without_jpe(mimetype):
  15. """
  16. This function fixes a problem with '.jpe' extensions
  17. of jpeg images which are then not recognised by latex.
  18. For any other case, the function works in the same way
  19. as mimetypes.guess_extension
  20. """
  21. ext = guess_extension(mimetype)
  22. if ext == ".jpe":
  23. ext = ".jpeg"
  24. return ext
  25. def platform_utf_8_encode(data):
  26. """Encode data based on platform."""
  27. if isinstance(data, str):
  28. if sys.platform == "win32":
  29. data = data.replace("\n", "\r\n")
  30. data = data.encode("utf-8")
  31. return data
  32. class ExtractOutputPreprocessor(Preprocessor):
  33. """
  34. Extracts all of the outputs from the notebook file. The extracted
  35. outputs are returned in the 'resources' dictionary.
  36. """
  37. output_filename_template = Unicode("{unique_key}_{cell_index}_{index}{extension}").tag(
  38. config=True
  39. )
  40. extract_output_types = Set({"image/png", "image/jpeg", "image/svg+xml", "application/pdf"}).tag(
  41. config=True
  42. )
  43. def preprocess_cell(self, cell, resources, cell_index):
  44. """
  45. Apply a transformation on each cell,
  46. Parameters
  47. ----------
  48. cell : NotebookNode cell
  49. Notebook cell being processed
  50. resources : dictionary
  51. Additional resources used in the conversion process. Allows
  52. preprocessors to pass variables into the Jinja engine.
  53. cell_index : int
  54. Index of the cell being processed (see base.py)
  55. """
  56. # Get the unique key from the resource dict if it exists. If it does not
  57. # exist, use 'output' as the default. Also, get files directory if it
  58. # has been specified
  59. unique_key = resources.get("unique_key", "output")
  60. output_files_dir = resources.get("output_files_dir", None)
  61. # Make sure outputs key exists
  62. if not isinstance(resources["outputs"], dict):
  63. resources["outputs"] = {}
  64. # Loop through all of the outputs in the cell
  65. for index, out in enumerate(cell.get("outputs", [])):
  66. if out.output_type not in {"display_data", "execute_result"}:
  67. continue
  68. if "text/html" in out.data:
  69. out["data"]["text/html"] = dedent(out["data"]["text/html"])
  70. # Get the output in data formats that the template needs extracted
  71. for mime_type in self.extract_output_types:
  72. if mime_type in out.data:
  73. data = out.data[mime_type]
  74. # Binary files are base64-encoded, SVG is already XML
  75. if mime_type in {"image/png", "image/jpeg", "application/pdf"}:
  76. # data is b64-encoded as text (str, unicode),
  77. # we want the original bytes
  78. data = a2b_base64(data)
  79. elif mime_type == "application/json" or not isinstance(data, str):
  80. # Data is either JSON-like and was parsed into a Python
  81. # object according to the spec, or data is for sure
  82. # JSON. In the latter case we want to go extra sure that
  83. # we enclose a scalar string value into extra quotes by
  84. # serializing it properly.
  85. if isinstance(data, bytes):
  86. # We need to guess the encoding in this
  87. # instance. Some modules that return raw data like
  88. # svg can leave the data in byte form instead of str
  89. data = data.decode("utf-8")
  90. data = platform_utf_8_encode(json.dumps(data))
  91. else:
  92. # All other text_type data will fall into this path
  93. data = platform_utf_8_encode(data)
  94. ext = guess_extension_without_jpe(mime_type)
  95. if ext is None:
  96. ext = "." + mime_type.rsplit("/")[-1]
  97. if out.metadata.get("filename", ""):
  98. filename = out.metadata["filename"]
  99. if not filename.endswith(ext):
  100. filename += ext
  101. else:
  102. filename = self.output_filename_template.format(
  103. unique_key=unique_key, cell_index=cell_index, index=index, extension=ext
  104. )
  105. # On the cell, make the figure available via
  106. # cell.outputs[i].metadata.filenames['mime/type']
  107. # where
  108. # cell.outputs[i].data['mime/type'] contains the data
  109. if output_files_dir is not None:
  110. filename = os.path.join(output_files_dir, filename)
  111. out.metadata.setdefault("filenames", {})
  112. out.metadata["filenames"][mime_type] = filename
  113. if filename in resources["outputs"]:
  114. msg = (
  115. "Your outputs have filename metadata associated "
  116. "with them. Nbconvert saves these outputs to "
  117. "external files using this filename metadata. "
  118. "Filenames need to be unique across the notebook, "
  119. f"or images will be overwritten. The filename {filename} is "
  120. "associated with more than one output. The second "
  121. "output associated with this filename is in cell "
  122. f"{cell_index}."
  123. )
  124. raise ValueError(msg)
  125. # In the resources, make the figure available via
  126. # resources['outputs']['filename'] = data
  127. resources["outputs"][filename] = data
  128. return cell, resources