| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130 |
- """
- Module that extracts attachments from notebooks into their own files
- """
- # Copyright (c) Jupyter Development Team.
- # Distributed under the terms of the Modified BSD License.
- import os
- from base64 import b64decode
- from traitlets import Bool, Unicode
- from .base import Preprocessor
class ExtractAttachmentsPreprocessor(Preprocessor):
    """
    Extracts attachments from all (markdown and raw) cells in a notebook.

    Every attachment is base64-decoded and stored in ``resources`` under a
    path-qualified filename, and each ``attachment:<name>`` reference in the
    cell source is rewritten to point at that file.

    By default (``use_separate_dir`` is False) the files are placed in
    ``resources["output_files_dir"]`` and recorded in ``resources["outputs"]``.
    When ``use_separate_dir`` is True they are placed in a directory named
    after ``attachments_directory_template`` and recorded in
    ``resources["attachments"]``.

    https://nbformat.readthedocs.io/en/latest/format_description.html#cell-attachments
    """

    attachments_directory_template = Unicode(
        "{notebook_name}_attachments",
        help="Directory to place attachments if use_separate_dir is True",
    ).tag(config=True)

    use_separate_dir = Bool(
        False,
        help="Whether to use output_files_dir (which ExtractOutput also uses) or "
        "create a separate directory for attachments",
    ).tag(config=True)

    def __init__(self, **kw):
        """
        Public constructor

        Sets defaults for ``path_name`` and ``resources_item_key`` so that
        ``preprocess_cell`` has values to work with even when ``preprocess``
        has not been called first.
        """
        super().__init__(**kw)

        # Directory path prefix for extracted files; the real value needs
        # ``resources`` and is therefore set in ``preprocess``.
        self.path_name = ""

        # Key in ``resources`` under which extracted attachments are stored.
        # Kept as a default in case someone doesn't want to call preprocess.
        self.resources_item_key = "attachments"

    def preprocess(self, nb, resources):
        """
        Determine some settings and apply preprocessor to notebook

        Parameters
        ----------
        nb :
            Notebook being converted.
        resources :
            Dict-like resources. ``resources["unique_key"]`` is read when
            ``use_separate_dir`` is True, ``resources["output_files_dir"]``
            otherwise.

        Returns
        -------
        tuple
            ``(nb, resources)`` as returned by the parent ``preprocess``.
        """
        if self.use_separate_dir:
            self.path_name = self.attachments_directory_template.format(
                notebook_name=resources["unique_key"]
            )
            # Initialize resources for attachments
            resources["attachment_files_dir"] = self.path_name
            resources["attachments"] = {}
            self.resources_item_key = "attachments"
        else:
            # Use same resources as ExtractOutput
            self.path_name = resources["output_files_dir"]
            self.resources_item_key = "outputs"

        # Make sure the key exists and holds a dict. ``.get`` is used so that
        # a missing key on a plain dict is repaired instead of raising KeyError.
        if not isinstance(resources.get(self.resources_item_key), dict):
            resources[self.resources_item_key] = {}

        nb, resources = super().preprocess(nb, resources)
        return nb, resources

    def preprocess_cell(self, cell, resources, index):
        """
        Extract attachments to individual files and
        change references to them.

        E.g.
        '![image](attachment:image.png)'
        becomes
        '![image]({path_name}/image.png)'

        Assumes self.path_name and self.resources_item_key is set properly
        (usually in preprocess).

        Parameters
        ----------
        cell :
            Notebook cell; only cells containing an "attachments" entry are
            modified.
        resources :
            Dict-like resources; ``resources[self.resources_item_key]`` must
            be a dict and receives ``{filename: decoded bytes}`` entries.
        index :
            Cell index (unused here).

        Returns
        -------
        tuple
            ``(cell, resources)``
        """
        if "attachments" not in cell:
            return cell, resources

        for fname in cell.attachments:
            self.log.debug("Encountered attachment %s", fname)

            # Sanitize: use only the basename to prevent path traversal.
            # "." and ".." survive basename() but name directories, never files.
            safe_fname = os.path.basename(fname)
            if safe_fname in ("", ".", ".."):
                self.log.warning(
                    "Attachment filename '%s' is invalid (no usable basename), skipping",
                    fname,
                )
                continue
            if safe_fname != fname:
                self.log.warning(
                    "Attachment filename '%s' contained path components, using basename '%s'",
                    fname,
                    safe_fname,
                )

            # Right now I don't know of a situation where there would be multiple
            # mime types under same filename, so only the first one is read.
            mime_bundle = cell.attachments[fname]
            if not mime_bundle:
                # Without this guard there would be no data for this file, or
                # worse, the previous attachment's data would be reused.
                self.log.warning(
                    "Attachment '%s' has no data (no mimetype entries), skipping",
                    fname,
                )
                continue
            encoded = next(iter(mime_bundle.values()))
            # convert to bytes and decode
            decoded = b64decode(encoded.encode("utf-8"))

            # FilesWriter wants path to be in attachment filename here
            new_filename = os.path.join(self.path_name, safe_fname)
            if new_filename in resources[self.resources_item_key]:
                self.log.warning(
                    "Attachment filename '%s' (from '%s') overwrites a previous "
                    "attachment with the same name",
                    safe_fname,
                    fname,
                )
            resources[self.resources_item_key][new_filename] = decoded

            # Edit the reference to the attachment.
            # os.path.join on windows uses "\\" separator,
            # but files like markdown still want "/"
            if os.path.sep != "/":
                new_filename = new_filename.replace(os.path.sep, "/")
            cell.source = cell.source.replace("attachment:" + fname, new_filename)

        return cell, resources
|