yichael
/
image-match


			
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130
"""
Module that extracts attachments from notebooks into their own files.
"""

# Copyright (c) Jupyter Development Team.
# Distributed under the terms of the Modified BSD License.

import os
from base64 import b64decode

from traitlets import Bool, Unicode

from .base import Preprocessor


class ExtractAttachmentsPreprocessor(Preprocessor):
    """
    Extracts attachments from notebook cells into entries of ``resources``.

    Every cell that carries an ``attachments`` mapping has each attachment
    base64-decoded and stored in ``resources`` under a path-like key, and
    ``attachment:<name>`` references in the cell source are rewritten to
    point at that path.

    By default the attachments share ``resources["outputs"]`` and the
    directory ``resources["output_files_dir"]`` with ExtractOutput. When
    ``use_separate_dir`` is True they are stored in ``resources["attachments"]``
    under a directory named by ``attachments_directory_template``.

    https://nbformat.readthedocs.io/en/latest/format_description.html#cell-attachments
    """

    # Template for the attachment directory name; ``{notebook_name}`` is
    # filled with ``resources["unique_key"]`` in ``preprocess``.
    attachments_directory_template = Unicode(
        "{notebook_name}_attachments",
        help="Directory to place attachments if use_separate_dir is True",
    ).tag(config=True)

    # Selects between the shared output directory (False) and a dedicated
    # attachment directory (True).
    use_separate_dir = Bool(
        False,
        help="Whether to use output_files_dir (which ExtractOutput also uses) or "
        "create a separate directory for attachments",
    ).tag(config=True)

    def __init__(self, **kw):
        """
        Public constructor

        Parameters
        ----------
        **kw
            Keyword arguments forwarded unchanged to the parent constructor.
        """
        super().__init__(**kw)
        # Directory prefix for extracted attachments; the real value needs
        # ``resources`` and is therefore only set in ``preprocess``.
        self.path_name = ""
        # Key in ``resources`` under which extracted attachments are stored.
        # Set here as a default, in case someone doesn't call ``preprocess``.
        self.resources_item_key = "attachments"

    def preprocess(self, nb, resources):
        """
        Determine some settings and apply preprocessor to notebook

        Sets ``self.path_name`` and ``self.resources_item_key`` according to
        ``use_separate_dir``, makes sure the target entry in ``resources`` is
        a dict, and then delegates to the parent ``preprocess``.

        Parameters
        ----------
        nb
            Notebook being converted; passed through to the parent class.
        resources : dict
            Conversion resources. ``unique_key`` is read when
            ``use_separate_dir`` is True, ``output_files_dir`` otherwise.

        Returns
        -------
        tuple
            Whatever the parent ``preprocess`` returns for ``(nb, resources)``.
        """
        if self.use_separate_dir:
            self.path_name = self.attachments_directory_template.format(
                notebook_name=resources["unique_key"]
            )
            # Initialize resources for attachments
            resources["attachment_files_dir"] = self.path_name
            resources["attachments"] = {}
            self.resources_item_key = "attachments"
        else:
            # Use same resources as ExtractOutput
            self.path_name = resources["output_files_dir"]
            self.resources_item_key = "outputs"

        # Make sure the key exists and holds a dict. ``get`` is used so that a
        # plain dict without the key is initialized instead of raising KeyError.
        if not isinstance(resources.get(self.resources_item_key), dict):
            resources[self.resources_item_key] = {}

        return super().preprocess(nb, resources)

    def preprocess_cell(self, cell, resources, index):
        """
        Extract attachments to individual files and
        change references to them.
        E.g.
        '![image.png](attachment:021fdd80.png)'
        becomes
        '![image.png]({path_name}/021fdd80.png)'
        Assumes self.path_name and self.resources_item_key is set properly (usually in preprocess).

        Parameters
        ----------
        cell
            Notebook cell; read via ``cell.attachments`` and ``cell.source``,
            and ``cell.source`` is rewritten in place.
        resources : dict
            Conversion resources; decoded attachment bytes are stored in
            ``resources[self.resources_item_key]``.
        index : int
            Cell index; not used by this method.

        Returns
        -------
        tuple
            The ``(cell, resources)`` pair after processing.
        """
        if "attachments" not in cell:
            return cell, resources

        for fname in cell.attachments:
            self.log.debug("Encountered attachment %s", fname)

            # Sanitize: use only the basename to prevent path traversal
            safe_fname = os.path.basename(fname)
            if not safe_fname:
                self.log.warning(
                    "Attachment filename '%s' is invalid (empty basename), skipping",
                    fname,
                )
                continue
            if safe_fname != fname:
                self.log.warning(
                    "Attachment filename '%s' contained path components, using basename '%s'",
                    fname,
                    safe_fname,
                )

            # An attachment without any mime entry has nothing to extract.
            # Skipping it explicitly avoids reusing the bytes decoded for a
            # previous attachment (or hitting an unbound local on the first one).
            mime_bundle = cell.attachments[fname]
            if not mime_bundle:
                self.log.warning("Attachment '%s' contains no data, skipping", fname)
                continue

            # Right now I don't know of a situation where there would be multiple
            # mime types under same filename, so only the first entry is read.
            first_mimetype = next(iter(mime_bundle))
            # convert to bytes and decode
            decoded = b64decode(mime_bundle[first_mimetype].encode("utf-8"))

            # FilesWriter wants path to be in attachment filename here
            new_filename = os.path.join(self.path_name, safe_fname)
            if new_filename in resources[self.resources_item_key]:
                self.log.warning(
                    "Attachment filename '%s' (from '%s') overwrites a previous "
                    "attachment with the same name",
                    safe_fname,
                    fname,
                )
            resources[self.resources_item_key][new_filename] = decoded

            # Edit the reference to the attachment.
            # os.path.join on windows uses "\\" separator,
            # but files like markdown still want "/"
            if os.path.sep != "/":
                new_filename = new_filename.replace(os.path.sep, "/")
            cell.source = cell.source.replace("attachment:" + fname, new_filename)

        return cell, resources