| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516 |
- """
- .. codeauthor:: Tsuyoshi Hombashi <tsuyoshi.hombashi@gmail.com>
- """
- import ntpath
- import os.path
- import posixpath
- import re
- import warnings
- from collections.abc import Sequence
- from pathlib import Path, PurePath
- from re import Pattern
- from typing import Final, Optional
- from ._base import AbstractSanitizer, AbstractValidator, BaseFile, BaseValidator
- from ._common import findall_to_str, is_nt_abspath, to_str, validate_pathtype
- from ._const import _NTFS_RESERVED_FILE_NAMES, DEFAULT_MIN_LEN, INVALID_CHAR_ERR_MSG_TMPL, Platform
- from ._filename import FileNameSanitizer, FileNameValidator
- from ._types import PathType, PlatformType
- from .error import ErrorAttrKey, ErrorReason, InvalidCharError, ReservedNameError, ValidationError
- from .handler import ReservedNameHandler, ValidationErrorHandler
# Character class built from ``BaseFile._INVALID_PATH_CHARS``; every character is
# escaped so that it is matched literally. Used for non-Windows targets.
_RE_INVALID_PATH: Final = re.compile(
    "[" + re.escape(BaseFile._INVALID_PATH_CHARS) + "]",
    flags=re.UNICODE,
)

# Same construction from ``BaseFile._INVALID_WIN_PATH_CHARS``; used when the
# target platform is Windows or universal.
_RE_INVALID_WIN_PATH: Final = re.compile(
    "[" + re.escape(BaseFile._INVALID_WIN_PATH_CHARS) + "]",
    flags=re.UNICODE,
)
class FilePathSanitizer(AbstractSanitizer):
    """Sanitizer that rewrites a value into a file path for a target platform.

    The path is split into a drive and a tail, invalid characters are replaced
    in the tail, and every path entry is then sanitized as a file name.
    """

    def __init__(
        self,
        max_len: int = -1,
        fs_encoding: Optional[str] = None,
        platform: Optional[PlatformType] = None,
        null_value_handler: Optional[ValidationErrorHandler] = None,
        reserved_name_handler: Optional[ValidationErrorHandler] = None,
        additional_reserved_names: Optional[Sequence[str]] = None,
        normalize: bool = True,
        validate_after_sanitize: bool = False,
        validator: Optional[AbstractValidator] = None,
    ) -> None:
        """Set up the sanitizer and its helper objects.

        Args:
            max_len: Maximum length forwarded to the base class and to the
                default validator.
            fs_encoding: Filesystem encoding forwarded to the base class,
                the default validator and the file name sanitizer.
            platform: Target platform forwarded to the base class and the
                default validator.
            null_value_handler: Handler invoked with the ``NULL_NAME`` error
                when the input or the sanitized result is a null name.
            reserved_name_handler: Handler forwarded to the base class and to
                the per-entry file name sanitizer.
            additional_reserved_names: Extra reserved names forwarded to the
                base class, the default validator and the file name sanitizer.
            normalize: When true, a non-empty path tail is passed through
                ``os.path.normpath`` during sanitization.
            validate_after_sanitize: When true, the sanitized path is validated
                once more and any validation error propagates to the caller.
            validator: Validator to use instead of the default
                ``FilePathValidator``; ignored when falsy.
        """
        # A caller-supplied validator takes precedence over the default one.
        fpath_validator = validator or FilePathValidator(
            platform=platform,
            min_len=DEFAULT_MIN_LEN,
            max_len=max_len,
            fs_encoding=fs_encoding,
            check_reserved=True,
            additional_reserved_names=additional_reserved_names,
        )

        super().__init__(
            platform=platform,
            max_len=max_len,
            fs_encoding=fs_encoding,
            validator=fpath_validator,
            null_value_handler=null_value_handler,
            reserved_name_handler=reserved_name_handler,
            additional_reserved_names=additional_reserved_names,
            validate_after_sanitize=validate_after_sanitize,
        )

        self._sanitize_regexp = self._get_sanitize_regexp()
        self.__normalize = normalize

        # Each path entry is delegated to a file name sanitizer that uses the
        # max_len/platform values resolved by the base class.
        self.__fname_sanitizer = FileNameSanitizer(
            platform=self.platform,
            max_len=self.max_len,
            fs_encoding=fs_encoding,
            null_value_handler=null_value_handler,
            reserved_name_handler=reserved_name_handler,
            additional_reserved_names=additional_reserved_names,
            validate_after_sanitize=validate_after_sanitize,
        )

        self.__split_drive = (
            ntpath.splitdrive
            if self._is_windows(include_universal=True)
            else posixpath.splitdrive
        )

    def sanitize(self, value: PathType, replacement_text: str = "") -> PathType:
        """Return a sanitized version of ``value``.

        Args:
            value: Path to sanitize.
            replacement_text: Text substituted for each invalid character.

        Returns:
            A ``Path`` when ``value`` is a ``PurePath`` instance, otherwise the
            sanitized path (or whatever the null value handler returned).

        Raises:
            ValidationError: If the type check of ``value`` fails for a reason
                other than ``NULL_NAME``, if ``value`` is a null ``PurePath``,
                or if post-sanitize validation is enabled and fails.
        """
        windows_like = self._is_windows(include_universal=True)

        try:
            validate_pathtype(value, allow_whitespaces=not windows_like)
        except ValidationError as err:
            # Only a null name given as a non-PurePath value is recoverable.
            if err.reason != ErrorReason.NULL_NAME or isinstance(value, PurePath):
                raise
            return self._null_value_handler(err)  # type: ignore

        # The drive is kept aside untouched; only the tail is cleaned up.
        drive, tail = self.__split_drive(to_str(value))
        tail = self._sanitize_regexp.sub(replacement_text, tail)
        if self.__normalize and tail:
            tail = os.path.normpath(tail)

        entries: list[str] = [drive] if drive else []
        for raw_entry in tail.replace("\\", "/").split("/"):
            if raw_entry in _NTFS_RESERVED_FILE_NAMES:
                cleaned_entry = f"{raw_entry}_"
            else:
                cleaned_entry = str(
                    self.__fname_sanitizer.sanitize(raw_entry, replacement_text=replacement_text)
                )

            if cleaned_entry:
                entries.append(cleaned_entry)
            elif not entries:
                # Keep a leading empty entry so that a leading separator
                # survives the join below.
                entries.append("")

        result = self.__get_path_separator().join(entries)

        try:
            self._validator.validate(result)
        except ValidationError as err:
            # Other validation failures are deliberately ignored at this point.
            if err.reason == ErrorReason.NULL_NAME:
                result = self._null_value_handler(err)

        if self._validate_after_sanitize:
            self._validator.validate(result)

        return Path(result) if isinstance(value, PurePath) else result  # type: ignore

    def _get_sanitize_regexp(self) -> Pattern[str]:
        """Return the invalid-character pattern for the target platform."""
        windows_like = self._is_windows(include_universal=True)
        return _RE_INVALID_WIN_PATH if windows_like else _RE_INVALID_PATH

    def __get_path_separator(self) -> str:
        """Return the separator used to join the sanitized path entries."""
        return "\\" if self._is_windows() else "/"
class FilePathValidator(BaseValidator):
    """Validator that checks whether a value is a valid file path.

    Length, reserved keywords, per-entry file names and platform-specific
    invalid characters are checked by :py:meth:`validate`.
    """

    # Matches a path that is exactly "/" followed by an NTFS reserved file name
    # (case-insensitive).
    _RE_NTFS_RESERVED: Final = re.compile(
        "|".join(f"^/{re.escape(pattern)}$" for pattern in _NTFS_RESERVED_FILE_NAMES),
        re.IGNORECASE,
    )
    _MACOS_RESERVED_FILE_PATHS: Final = ("/", ":")

    @property
    def reserved_keywords(self) -> tuple[str, ...]:
        """Reserved keywords of the base class plus platform-specific paths."""
        common_keywords = super().reserved_keywords

        if any([self._is_universal(), self._is_posix(), self._is_macos()]):
            return common_keywords + self._MACOS_RESERVED_FILE_PATHS

        if self._is_linux():
            return common_keywords + ("/",)

        return common_keywords

    def __init__(
        self,
        min_len: int = DEFAULT_MIN_LEN,
        max_len: int = -1,
        fs_encoding: Optional[str] = None,
        platform: Optional[PlatformType] = None,
        check_reserved: bool = True,
        additional_reserved_names: Optional[Sequence[str]] = None,
    ) -> None:
        """Set up the validator and the per-entry file name validator.

        Args:
            min_len: Minimum byte length of the path (without drive).
            max_len: Maximum byte length of the path (without drive).
            fs_encoding: Filesystem encoding used to compute byte lengths.
            platform: Target platform of the path.
            check_reserved: Forwarded to the base class and to the file name
                validator.
            additional_reserved_names: Extra reserved names forwarded to the
                base class and to the file name validator.
        """
        super().__init__(
            min_len=min_len,
            max_len=max_len,
            fs_encoding=fs_encoding,
            check_reserved=check_reserved,
            additional_reserved_names=additional_reserved_names,
            platform=platform,
        )

        # The file name validator receives the max_len resolved by the base class.
        self.__fname_validator = FileNameValidator(
            min_len=min_len,
            max_len=self.max_len,
            fs_encoding=fs_encoding,
            check_reserved=check_reserved,
            additional_reserved_names=additional_reserved_names,
            platform=platform,
        )

        if self._is_windows(include_universal=True):
            self.__split_drive = ntpath.splitdrive
        else:
            self.__split_drive = posixpath.splitdrive

    def validate(self, value: PathType) -> None:
        """Validate ``value`` as a file path for the target platform.

        Args:
            value: Path to validate.

        Raises:
            ValidationError: If the type check fails, the absolute path format
                does not fit the platform, or the byte length of the path
                (without drive) is outside ``[min_len, max_len]``.
            InvalidCharError: If the path contains characters that are invalid
                for the target platform.
            ReservedNameError: If a Windows/universal path matches an NTFS
                reserved name.
        """
        validate_pathtype(value, allow_whitespaces=not self._is_windows(include_universal=True))
        self.validate_abspath(value)

        _drive, tail = self.__split_drive(value)
        if not tail:
            # Nothing but a drive (or nothing at all) is left to check.
            return

        unicode_filepath = to_str(tail)
        byte_ct = len(unicode_filepath.encode(self._fs_encoding))
        err_kwargs = {
            ErrorAttrKey.REASON: ErrorReason.INVALID_LENGTH,
            ErrorAttrKey.PLATFORM: self.platform,
            ErrorAttrKey.FS_ENCODING: self._fs_encoding,
            ErrorAttrKey.BYTE_COUNT: byte_ct,
            ErrorAttrKey.VALUE: unicode_filepath,
        }

        if byte_ct > self.max_len:
            raise ValidationError(
                [
                    f"file path is too long: expected<={self.max_len:d} bytes, actual={byte_ct:d} bytes"
                ],
                **err_kwargs,
            )
        if byte_ct < self.min_len:
            raise ValidationError(
                [
                    f"file path is too short: expected>={self.min_len:d} bytes, actual={byte_ct:d} bytes"
                ],
                **err_kwargs,
            )

        self._validate_reserved_keywords(unicode_filepath)

        # Both separator styles are treated as entry boundaries.
        unicode_filepath = unicode_filepath.replace("\\", "/")
        for entry in unicode_filepath.split("/"):
            # Empty entries and the relative markers are not file names.
            if not entry or entry in (".", ".."):
                continue

            self.__fname_validator.validate(entry)

        if self._is_windows(include_universal=True):
            self.__validate_win_filepath(unicode_filepath)
        else:
            self.__validate_unix_filepath(unicode_filepath)

    def validate_abspath(self, value: PathType) -> None:
        """Check that an absolute ``value`` fits the target platform.

        Relative paths pass without error.

        Args:
            value: Path to check.

        Raises:
            ValidationError: With reason ``MALFORMED_ABS_PATH`` when the path is
                absolute in a format that the target platform does not accept.
        """
        is_posix_abs = posixpath.isabs(value)
        is_nt_abs = is_nt_abspath(to_str(value))

        # An absolute path in the platform's own format is accepted as is.
        if any([self._is_windows() and is_nt_abs, self._is_posix() and is_posix_abs]):
            return

        if self._is_universal() and any([is_nt_abs, is_posix_abs]):
            # No absolute path format is valid on every platform, so the
            # universal platform rejects absolute paths outright.
            raise ValidationError(
                "platform-independent absolute file path is not supported",
                platform=self.platform,
                reason=ErrorReason.MALFORMED_ABS_PATH,
            )

        err_object = ValidationError(
            description=(
                f"an invalid absolute file path ({value!r}) for the platform ({self.platform.value})."
                + " to avoid the error, specify an appropriate platform corresponding to"
                + " the path format or 'auto'."
            ),
            platform=self.platform,
            reason=ErrorReason.MALFORMED_ABS_PATH,
        )

        # POSIX-style absolute path given for a Windows(-compatible) target.
        if self._is_windows(include_universal=True) and is_posix_abs:
            raise err_object

        # Windows-style absolute path (with a drive) given for a non-Windows target.
        if not self._is_windows():
            drive, _tail = ntpath.splitdrive(value)
            if drive and is_nt_abs:
                raise err_object

    def __validate_unix_filepath(self, unicode_filepath: str) -> None:
        """Raise ``InvalidCharError`` if the path matches ``_RE_INVALID_PATH``."""
        match = _RE_INVALID_PATH.findall(unicode_filepath)
        if match:
            raise InvalidCharError(
                INVALID_CHAR_ERR_MSG_TMPL.format(invalid=findall_to_str(match)),
                value=unicode_filepath,
            )

    def __validate_win_filepath(self, unicode_filepath: str) -> None:
        """Check Windows-specific invalid characters and NTFS reserved names.

        Raises:
            InvalidCharError: If the path matches ``_RE_INVALID_WIN_PATH``.
            ReservedNameError: If the path without its drive matches
                ``_RE_NTFS_RESERVED``.
        """
        match = _RE_INVALID_WIN_PATH.findall(unicode_filepath)
        if match:
            raise InvalidCharError(
                INVALID_CHAR_ERR_MSG_TMPL.format(invalid=findall_to_str(match)),
                platform=Platform.WINDOWS,
                value=unicode_filepath,
            )

        _drive, value = self.__split_drive(unicode_filepath)
        if value:
            match_reserved = self._RE_NTFS_RESERVED.search(value)
            if match_reserved:
                reserved_name = match_reserved.group()
                raise ReservedNameError(
                    f"'{reserved_name}' is a reserved name",
                    reusable_name=False,
                    reserved_name=reserved_name,
                    platform=self.platform,
                )
def validate_filepath(
    file_path: PathType,
    platform: Optional[PlatformType] = None,
    min_len: int = DEFAULT_MIN_LEN,
    max_len: Optional[int] = None,
    fs_encoding: Optional[str] = None,
    check_reserved: bool = True,
    additional_reserved_names: Optional[Sequence[str]] = None,
) -> None:
    """Verify that ``file_path`` is a valid file path; raise otherwise.

    Args:
        file_path (PathType):
            File path to be validated.
        platform (Optional[PlatformType], optional):
            Target platform name of the file path.

            .. include:: platform.txt
        min_len (int, optional):
            Minimum byte length of the ``file_path``. The value must be greater or equal to one.
            Defaults to ``1``.
        max_len (Optional[int], optional):
            Maximum byte length of the ``file_path``. If the value is |None| or minus,
            automatically determined by the ``platform``:

                - ``Linux``: 4096
                - ``macOS``: 1024
                - ``Windows``: 260
                - ``universal``: 260
        fs_encoding (Optional[str], optional):
            Filesystem encoding that is used to calculate the byte length of the file path.
            If |None|, get the encoding from the execution environment.
        check_reserved (bool, optional):
            If |True|, check the reserved names of the ``platform``.
            Defaults to |True|.
        additional_reserved_names (Optional[Sequence[str]], optional):
            Additional reserved names to check.

    Raises:
        ValidationError (ErrorReason.INVALID_CHARACTER):
            If the ``file_path`` includes invalid char(s):
            |invalid_file_path_chars|.
            The following characters are also invalid for Windows platforms:
            |invalid_win_file_path_chars|
        ValidationError (ErrorReason.INVALID_LENGTH):
            If the byte length of the ``file_path`` is greater than ``max_len``
            or less than ``min_len``.
        ValidationError:
            If ``file_path`` includes invalid values.

    Example:
        :ref:`example-validate-file-path`

    See Also:
        `Naming Files, Paths, and Namespaces - Win32 apps | Microsoft Docs
        <https://docs.microsoft.com/en-us/windows/win32/fileio/naming-a-file>`__
    """
    # The validator expects -1 (rather than None) for "decide from the platform".
    effective_max_len = max_len if max_len is not None else -1

    validator = FilePathValidator(
        min_len=min_len,
        max_len=effective_max_len,
        fs_encoding=fs_encoding,
        platform=platform,
        check_reserved=check_reserved,
        additional_reserved_names=additional_reserved_names,
    )
    validator.validate(file_path)
def is_valid_filepath(
    file_path: PathType,
    platform: Optional[PlatformType] = None,
    min_len: int = DEFAULT_MIN_LEN,
    max_len: Optional[int] = None,
    fs_encoding: Optional[str] = None,
    check_reserved: bool = True,
    additional_reserved_names: Optional[Sequence[str]] = None,
) -> bool:
    """Report whether ``file_path`` is a valid file path.

    The arguments are forwarded to a ``FilePathValidator``; a ``max_len`` of
    |None| is passed on as ``-1``.

    Args:
        file_path:
            A filepath to be checked.
        platform:
            Target platform name of the file path.

    Returns:
        The result of ``FilePathValidator.is_valid`` for ``file_path``.

    Example:
        :ref:`example-is-valid-filepath`

    See Also:
        :py:func:`.validate_filepath()`
    """
    # The validator expects -1 (rather than None) for "decide from the platform".
    effective_max_len = max_len if max_len is not None else -1

    validator = FilePathValidator(
        min_len=min_len,
        max_len=effective_max_len,
        fs_encoding=fs_encoding,
        platform=platform,
        check_reserved=check_reserved,
        additional_reserved_names=additional_reserved_names,
    )
    return validator.is_valid(file_path)
def sanitize_filepath(
    file_path: PathType,
    replacement_text: str = "",
    platform: Optional[PlatformType] = None,
    max_len: Optional[int] = None,
    fs_encoding: Optional[str] = None,
    check_reserved: Optional[bool] = None,
    null_value_handler: Optional[ValidationErrorHandler] = None,
    reserved_name_handler: Optional[ValidationErrorHandler] = None,
    additional_reserved_names: Optional[Sequence[str]] = None,
    normalize: bool = True,
    validate_after_sanitize: bool = False,
) -> PathType:
    """Make a valid file path from a string.

    To make a valid file path, the function does the following:

        - Replace invalid characters for a file path within the ``file_path``
          with the ``replacement_text``. Invalid characters are as follows:

            - unprintable characters
            - |invalid_file_path_chars|
            - for Windows (or universal) only: |invalid_win_file_path_chars|

        - Replace a value if a sanitized value is a reserved name by operating systems
          with a specified handler by ``reserved_name_handler``.

    Args:
        file_path:
            File path to sanitize.
        replacement_text:
            Replacement text for invalid characters.
            Defaults to ``""``.
        platform:
            Target platform name of the file path.

            .. include:: platform.txt
        max_len:
            Maximum byte length of the file path.
            Truncate the path if the value length exceeds the `max_len`.
            If the value is |None| or minus, ``max_len`` will automatically determined by the ``platform``:

                - ``Linux``: 4096
                - ``macOS``: 1024
                - ``Windows``: 260
                - ``universal``: 260
        fs_encoding:
            Filesystem encoding that is used to calculate the byte length of the file path.
            If |None|, get the encoding from the execution environment.
        check_reserved:
            [Deprecated] Use 'reserved_name_handler' instead.
            Passing any non-|None| value emits a ``DeprecationWarning``;
            passing |False| replaces ``reserved_name_handler`` with
            :py:meth:`~.handler.ReservedNameHandler.as_is`.
        null_value_handler:
            Function called when a value after sanitization is an empty string.
            You can specify predefined handlers:

                - :py:func:`.handler.NullValueHandler.return_null_string`
                - :py:func:`.handler.NullValueHandler.return_timestamp`
                - :py:func:`.handler.raise_error`

            Defaults to :py:func:`.handler.NullValueHandler.return_null_string` that just return ``""``.
        reserved_name_handler:
            Function called when a value after sanitization is one of the reserved names.
            You can specify predefined handlers:

                - :py:meth:`~.handler.ReservedNameHandler.add_leading_underscore`
                - :py:meth:`~.handler.ReservedNameHandler.add_trailing_underscore`
                - :py:meth:`~.handler.ReservedNameHandler.as_is`
                - :py:func:`~.handler.raise_error`

            Defaults to :py:func:`.handler.add_trailing_underscore`.
        additional_reserved_names:
            Additional reserved names to sanitize.
            Case insensitive.
        normalize:
            If |True|, normalize the file path.
        validate_after_sanitize:
            Execute validation after sanitization to the file path.

    Returns:
        Same type as the argument (str or PathLike object):
            Sanitized filepath.

    Raises:
        ValueError:
            If the ``file_path`` is an invalid file path.

    Example:
        :ref:`example-sanitize-file-path`
    """
    if check_reserved is not None:
        # stacklevel=2 attributes the warning to the caller of this function;
        # otherwise it would point at this module and be hidden by the default
        # DeprecationWarning filter.
        warnings.warn(
            "'check_reserved' is deprecated. Use 'reserved_name_handler' instead.",
            DeprecationWarning,
            stacklevel=2,
        )

        if check_reserved is False:
            # Legacy way of turning reserved-name handling off.
            reserved_name_handler = ReservedNameHandler.as_is

    return FilePathSanitizer(
        platform=platform,
        # The sanitizer expects -1 (rather than None) for "decide from the platform".
        max_len=-1 if max_len is None else max_len,
        fs_encoding=fs_encoding,
        normalize=normalize,
        null_value_handler=null_value_handler,
        reserved_name_handler=reserved_name_handler,
        additional_reserved_names=additional_reserved_names,
        validate_after_sanitize=validate_after_sanitize,
    ).sanitize(file_path, replacement_text)
|