extractattachments.py 4.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130
  1. """
  2. Module that extracts attachments from notebooks into their own files
  3. """
  4. # Copyright (c) Jupyter Development Team.
  5. # Distributed under the terms of the Modified BSD License.
  6. import os
  7. from base64 import b64decode
  8. from traitlets import Bool, Unicode
  9. from .base import Preprocessor
  10. class ExtractAttachmentsPreprocessor(Preprocessor):
  11. """
  12. Extracts attachments from all (markdown and raw) cells in a notebook.
  13. The extracted attachments are stored in a directory ('attachments' by default).
  14. https://nbformat.readthedocs.io/en/latest/format_description.html#cell-attachments
  15. """
  16. attachments_directory_template = Unicode(
  17. "{notebook_name}_attachments",
  18. help="Directory to place attachments if use_separate_dir is True",
  19. ).tag(config=True)
  20. use_separate_dir = Bool(
  21. False,
  22. help="Whether to use output_files_dir (which ExtractOutput also uses) or "
  23. "create a separate directory for attachments",
  24. ).tag(config=True)
  25. def __init__(self, **kw):
  26. """
  27. Public constructor
  28. """
  29. super().__init__(**kw)
  30. # directory path,
  31. self.path_name = "" # will be set in self.preprocess, needs resources
  32. # Where extracted attachments are stored in resources
  33. self.resources_item_key = (
  34. "attachments" # Here as a default, in case someone doesn't want to call preprocess
  35. )
  36. # Add condition and configurability here
  37. def preprocess(self, nb, resources):
  38. """
  39. Determine some settings and apply preprocessor to notebook
  40. """
  41. if self.use_separate_dir:
  42. self.path_name = self.attachments_directory_template.format(
  43. notebook_name=resources["unique_key"]
  44. )
  45. # Initialize resources for attachments
  46. resources["attachment_files_dir"] = self.path_name
  47. resources["attachments"] = {}
  48. self.resources_item_key = "attachments"
  49. else:
  50. # Use same resources as ExtractOutput
  51. self.path_name = resources["output_files_dir"]
  52. self.resources_item_key = "outputs"
  53. # Make sure key exists
  54. if not isinstance(resources[self.resources_item_key], dict):
  55. resources[self.resources_item_key] = {}
  56. nb, resources = super().preprocess(nb, resources)
  57. return nb, resources
  58. def preprocess_cell(self, cell, resources, index):
  59. """
  60. Extract attachments to individual files and
  61. change references to them.
  62. E.g.
  63. '![image.png](attachment:021fdd80.png)'
  64. becomes
  65. '![image.png]({path_name}/021fdd80.png)'
  66. Assumes self.path_name and self.resources_item_key is set properly (usually in preprocess).
  67. """
  68. if "attachments" in cell:
  69. for fname in cell.attachments:
  70. self.log.debug("Encountered attachment %s", fname)
  71. # Sanitize: use only the basename to prevent path traversal
  72. safe_fname = os.path.basename(fname)
  73. if not safe_fname:
  74. self.log.warning(
  75. "Attachment filename '%s' is invalid (empty basename), skipping",
  76. fname,
  77. )
  78. continue
  79. if safe_fname != fname:
  80. self.log.warning(
  81. "Attachment filename '%s' contained path components, using basename '%s'",
  82. fname,
  83. safe_fname,
  84. )
  85. # Add file for writer
  86. # Right now I don't know of a situation where there would be multiple
  87. # mime types under same filename, and I can't index into it without the mimetype.
  88. # So I only read the first one.
  89. for mimetype in cell.attachments[fname]:
  90. # convert to bytes and decode
  91. data = cell.attachments[fname][mimetype].encode("utf-8")
  92. decoded = b64decode(data)
  93. break
  94. # FilesWriter wants path to be in attachment filename here
  95. new_filename = os.path.join(self.path_name, safe_fname)
  96. if new_filename in resources[self.resources_item_key]:
  97. self.log.warning(
  98. "Attachment filename '%s' (from '%s') overwrites a previous "
  99. "attachment with the same name",
  100. safe_fname,
  101. fname,
  102. )
  103. resources[self.resources_item_key][new_filename] = decoded
  104. # Edit the reference to the attachment
  105. # os.path.join on windows uses "\\" separator,
  106. # but files like markdown still want "/"
  107. if os.path.sep != "/":
  108. new_filename = new_filename.replace(os.path.sep, "/")
  109. cell.source = cell.source.replace("attachment:" + fname, new_filename)
  110. return cell, resources