_filepath.py 18 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516
  1. """
  2. .. codeauthor:: Tsuyoshi Hombashi <tsuyoshi.hombashi@gmail.com>
  3. """
  4. import ntpath
  5. import os.path
  6. import posixpath
  7. import re
  8. import warnings
  9. from collections.abc import Sequence
  10. from pathlib import Path, PurePath
  11. from re import Pattern
  12. from typing import Final, Optional
  13. from ._base import AbstractSanitizer, AbstractValidator, BaseFile, BaseValidator
  14. from ._common import findall_to_str, is_nt_abspath, to_str, validate_pathtype
  15. from ._const import _NTFS_RESERVED_FILE_NAMES, DEFAULT_MIN_LEN, INVALID_CHAR_ERR_MSG_TMPL, Platform
  16. from ._filename import FileNameSanitizer, FileNameValidator
  17. from ._types import PathType, PlatformType
  18. from .error import ErrorAttrKey, ErrorReason, InvalidCharError, ReservedNameError, ValidationError
  19. from .handler import ReservedNameHandler, ValidationErrorHandler
  20. _RE_INVALID_PATH: Final = re.compile(f"[{re.escape(BaseFile._INVALID_PATH_CHARS):s}]", re.UNICODE)
  21. _RE_INVALID_WIN_PATH: Final = re.compile(
  22. f"[{re.escape(BaseFile._INVALID_WIN_PATH_CHARS):s}]", re.UNICODE
  23. )
  24. class FilePathSanitizer(AbstractSanitizer):
  25. def __init__(
  26. self,
  27. max_len: int = -1,
  28. fs_encoding: Optional[str] = None,
  29. platform: Optional[PlatformType] = None,
  30. null_value_handler: Optional[ValidationErrorHandler] = None,
  31. reserved_name_handler: Optional[ValidationErrorHandler] = None,
  32. additional_reserved_names: Optional[Sequence[str]] = None,
  33. normalize: bool = True,
  34. validate_after_sanitize: bool = False,
  35. validator: Optional[AbstractValidator] = None,
  36. ) -> None:
  37. if validator:
  38. fpath_validator = validator
  39. else:
  40. fpath_validator = FilePathValidator(
  41. min_len=DEFAULT_MIN_LEN,
  42. max_len=max_len,
  43. fs_encoding=fs_encoding,
  44. check_reserved=True,
  45. additional_reserved_names=additional_reserved_names,
  46. platform=platform,
  47. )
  48. super().__init__(
  49. max_len=max_len,
  50. fs_encoding=fs_encoding,
  51. validator=fpath_validator,
  52. null_value_handler=null_value_handler,
  53. reserved_name_handler=reserved_name_handler,
  54. additional_reserved_names=additional_reserved_names,
  55. platform=platform,
  56. validate_after_sanitize=validate_after_sanitize,
  57. )
  58. self._sanitize_regexp = self._get_sanitize_regexp()
  59. self.__fname_sanitizer = FileNameSanitizer(
  60. max_len=self.max_len,
  61. fs_encoding=fs_encoding,
  62. null_value_handler=null_value_handler,
  63. reserved_name_handler=reserved_name_handler,
  64. additional_reserved_names=additional_reserved_names,
  65. platform=self.platform,
  66. validate_after_sanitize=validate_after_sanitize,
  67. )
  68. self.__normalize = normalize
  69. if self._is_windows(include_universal=True):
  70. self.__split_drive = ntpath.splitdrive
  71. else:
  72. self.__split_drive = posixpath.splitdrive
  73. def sanitize(self, value: PathType, replacement_text: str = "") -> PathType:
  74. try:
  75. validate_pathtype(value, allow_whitespaces=not self._is_windows(include_universal=True))
  76. except ValidationError as e:
  77. if e.reason == ErrorReason.NULL_NAME:
  78. if isinstance(value, PurePath):
  79. raise
  80. return self._null_value_handler(e) # type: ignore
  81. raise
  82. unicode_filepath = to_str(value)
  83. drive, unicode_filepath = self.__split_drive(unicode_filepath)
  84. unicode_filepath = self._sanitize_regexp.sub(replacement_text, unicode_filepath)
  85. if self.__normalize and unicode_filepath:
  86. unicode_filepath = os.path.normpath(unicode_filepath)
  87. sanitized_path = unicode_filepath
  88. sanitized_entries: list[str] = []
  89. if drive:
  90. sanitized_entries.append(drive)
  91. for entry in sanitized_path.replace("\\", "/").split("/"):
  92. if entry in _NTFS_RESERVED_FILE_NAMES:
  93. sanitized_entries.append(f"{entry}_")
  94. continue
  95. sanitized_entry = str(
  96. self.__fname_sanitizer.sanitize(entry, replacement_text=replacement_text)
  97. )
  98. if not sanitized_entry:
  99. if not sanitized_entries:
  100. sanitized_entries.append("")
  101. continue
  102. sanitized_entries.append(sanitized_entry)
  103. sanitized_path = self.__get_path_separator().join(sanitized_entries)
  104. try:
  105. self._validator.validate(sanitized_path)
  106. except ValidationError as e:
  107. if e.reason == ErrorReason.NULL_NAME:
  108. sanitized_path = self._null_value_handler(e)
  109. if self._validate_after_sanitize:
  110. self._validator.validate(sanitized_path)
  111. if isinstance(value, PurePath):
  112. return Path(sanitized_path) # type: ignore
  113. return sanitized_path # type: ignore
  114. def _get_sanitize_regexp(self) -> Pattern[str]:
  115. if self._is_windows(include_universal=True):
  116. return _RE_INVALID_WIN_PATH
  117. return _RE_INVALID_PATH
  118. def __get_path_separator(self) -> str:
  119. if self._is_windows():
  120. return "\\"
  121. return "/"
  122. class FilePathValidator(BaseValidator):
  123. _RE_NTFS_RESERVED: Final = re.compile(
  124. "|".join(f"^/{re.escape(pattern)}$" for pattern in _NTFS_RESERVED_FILE_NAMES),
  125. re.IGNORECASE,
  126. )
  127. _MACOS_RESERVED_FILE_PATHS: Final = ("/", ":")
  128. @property
  129. def reserved_keywords(self) -> tuple[str, ...]:
  130. common_keywords = super().reserved_keywords
  131. if any([self._is_universal(), self._is_posix(), self._is_macos()]):
  132. return common_keywords + self._MACOS_RESERVED_FILE_PATHS
  133. if self._is_linux():
  134. return common_keywords + ("/",)
  135. return common_keywords
  136. def __init__(
  137. self,
  138. min_len: int = DEFAULT_MIN_LEN,
  139. max_len: int = -1,
  140. fs_encoding: Optional[str] = None,
  141. platform: Optional[PlatformType] = None,
  142. check_reserved: bool = True,
  143. additional_reserved_names: Optional[Sequence[str]] = None,
  144. ) -> None:
  145. super().__init__(
  146. min_len=min_len,
  147. max_len=max_len,
  148. fs_encoding=fs_encoding,
  149. check_reserved=check_reserved,
  150. additional_reserved_names=additional_reserved_names,
  151. platform=platform,
  152. )
  153. self.__fname_validator = FileNameValidator(
  154. min_len=min_len,
  155. max_len=self.max_len,
  156. fs_encoding=fs_encoding,
  157. check_reserved=check_reserved,
  158. additional_reserved_names=additional_reserved_names,
  159. platform=platform,
  160. )
  161. if self._is_windows(include_universal=True):
  162. self.__split_drive = ntpath.splitdrive
  163. else:
  164. self.__split_drive = posixpath.splitdrive
  165. def validate(self, value: PathType) -> None:
  166. validate_pathtype(value, allow_whitespaces=not self._is_windows(include_universal=True))
  167. self.validate_abspath(value)
  168. _drive, tail = self.__split_drive(value)
  169. if not tail:
  170. return
  171. unicode_filepath = to_str(tail)
  172. byte_ct = len(unicode_filepath.encode(self._fs_encoding))
  173. err_kwargs = {
  174. ErrorAttrKey.REASON: ErrorReason.INVALID_LENGTH,
  175. ErrorAttrKey.PLATFORM: self.platform,
  176. ErrorAttrKey.FS_ENCODING: self._fs_encoding,
  177. ErrorAttrKey.BYTE_COUNT: byte_ct,
  178. ErrorAttrKey.VALUE: unicode_filepath,
  179. }
  180. if byte_ct > self.max_len:
  181. raise ValidationError(
  182. [
  183. f"file path is too long: expected<={self.max_len:d} bytes, actual={byte_ct:d} bytes"
  184. ],
  185. **err_kwargs,
  186. )
  187. if byte_ct < self.min_len:
  188. raise ValidationError(
  189. [
  190. "file path is too short: expected>={:d} bytes, actual={:d} bytes".format(
  191. self.min_len, byte_ct
  192. )
  193. ],
  194. **err_kwargs,
  195. )
  196. self._validate_reserved_keywords(unicode_filepath)
  197. unicode_filepath = unicode_filepath.replace("\\", "/")
  198. for entry in unicode_filepath.split("/"):
  199. if not entry or entry in (".", ".."):
  200. continue
  201. self.__fname_validator.validate(entry)
  202. if self._is_windows(include_universal=True):
  203. self.__validate_win_filepath(unicode_filepath)
  204. else:
  205. self.__validate_unix_filepath(unicode_filepath)
  206. def validate_abspath(self, value: PathType) -> None:
  207. is_posix_abs = posixpath.isabs(value)
  208. is_nt_abs = is_nt_abspath(to_str(value))
  209. if any([self._is_windows() and is_nt_abs, self._is_posix() and is_posix_abs]):
  210. return
  211. if self._is_universal() and any([is_nt_abs, is_posix_abs]):
  212. ValidationError(
  213. "platform-independent absolute file path is not supported",
  214. platform=self.platform,
  215. reason=ErrorReason.MALFORMED_ABS_PATH,
  216. )
  217. err_object = ValidationError(
  218. description=(
  219. f"an invalid absolute file path ({value!r}) for the platform ({self.platform.value})."
  220. + " to avoid the error, specify an appropriate platform corresponding to"
  221. + " the path format or 'auto'."
  222. ),
  223. platform=self.platform,
  224. reason=ErrorReason.MALFORMED_ABS_PATH,
  225. )
  226. if self._is_windows(include_universal=True) and is_posix_abs:
  227. raise err_object
  228. if not self._is_windows():
  229. drive, _tail = ntpath.splitdrive(value)
  230. if drive and is_nt_abs:
  231. raise err_object
  232. def __validate_unix_filepath(self, unicode_filepath: str) -> None:
  233. match = _RE_INVALID_PATH.findall(unicode_filepath)
  234. if match:
  235. raise InvalidCharError(
  236. INVALID_CHAR_ERR_MSG_TMPL.format(invalid=findall_to_str(match)),
  237. value=unicode_filepath,
  238. )
  239. def __validate_win_filepath(self, unicode_filepath: str) -> None:
  240. match = _RE_INVALID_WIN_PATH.findall(unicode_filepath)
  241. if match:
  242. raise InvalidCharError(
  243. INVALID_CHAR_ERR_MSG_TMPL.format(invalid=findall_to_str(match)),
  244. platform=Platform.WINDOWS,
  245. value=unicode_filepath,
  246. )
  247. _drive, value = self.__split_drive(unicode_filepath)
  248. if value:
  249. match_reserved = self._RE_NTFS_RESERVED.search(value)
  250. if match_reserved:
  251. reserved_name = match_reserved.group()
  252. raise ReservedNameError(
  253. f"'{reserved_name}' is a reserved name",
  254. reusable_name=False,
  255. reserved_name=reserved_name,
  256. platform=self.platform,
  257. )
  258. def validate_filepath(
  259. file_path: PathType,
  260. platform: Optional[PlatformType] = None,
  261. min_len: int = DEFAULT_MIN_LEN,
  262. max_len: Optional[int] = None,
  263. fs_encoding: Optional[str] = None,
  264. check_reserved: bool = True,
  265. additional_reserved_names: Optional[Sequence[str]] = None,
  266. ) -> None:
  267. """Verifying whether the ``file_path`` is a valid file path or not.
  268. Args:
  269. file_path (PathType):
  270. File path to be validated.
  271. platform (Optional[PlatformType], optional):
  272. Target platform name of the file path.
  273. .. include:: platform.txt
  274. min_len (int, optional):
  275. Minimum byte length of the ``file_path``. The value must be greater or equal to one.
  276. Defaults to ``1``.
  277. max_len (Optional[int], optional):
  278. Maximum byte length of the ``file_path``. If the value is |None| or minus,
  279. automatically determined by the ``platform``:
  280. - ``Linux``: 4096
  281. - ``macOS``: 1024
  282. - ``Windows``: 260
  283. - ``universal``: 260
  284. fs_encoding (Optional[str], optional):
  285. Filesystem encoding that is used to calculate the byte length of the file path.
  286. If |None|, get the encoding from the execution environment.
  287. check_reserved (bool, optional):
  288. If |True|, check the reserved names of the ``platform``.
  289. Defaults to |True|.
  290. additional_reserved_names (Optional[Sequence[str]], optional):
  291. Additional reserved names to check.
  292. Raises:
  293. ValidationError (ErrorReason.INVALID_CHARACTER):
  294. If the ``file_path`` includes invalid char(s):
  295. |invalid_file_path_chars|.
  296. The following characters are also invalid for Windows platforms:
  297. |invalid_win_file_path_chars|
  298. ValidationError (ErrorReason.INVALID_LENGTH):
  299. If the ``file_path`` is longer than ``max_len`` characters.
  300. ValidationError:
  301. If ``file_path`` includes invalid values.
  302. Example:
  303. :ref:`example-validate-file-path`
  304. See Also:
  305. `Naming Files, Paths, and Namespaces - Win32 apps | Microsoft Docs
  306. <https://docs.microsoft.com/en-us/windows/win32/fileio/naming-a-file>`__
  307. """
  308. FilePathValidator(
  309. platform=platform,
  310. min_len=min_len,
  311. max_len=-1 if max_len is None else max_len,
  312. fs_encoding=fs_encoding,
  313. check_reserved=check_reserved,
  314. additional_reserved_names=additional_reserved_names,
  315. ).validate(file_path)
  316. def is_valid_filepath(
  317. file_path: PathType,
  318. platform: Optional[PlatformType] = None,
  319. min_len: int = DEFAULT_MIN_LEN,
  320. max_len: Optional[int] = None,
  321. fs_encoding: Optional[str] = None,
  322. check_reserved: bool = True,
  323. additional_reserved_names: Optional[Sequence[str]] = None,
  324. ) -> bool:
  325. """Check whether the ``file_path`` is a valid name or not.
  326. Args:
  327. file_path:
  328. A filepath to be checked.
  329. platform:
  330. Target platform name of the file path.
  331. Example:
  332. :ref:`example-is-valid-filepath`
  333. See Also:
  334. :py:func:`.validate_filepath()`
  335. """
  336. return FilePathValidator(
  337. platform=platform,
  338. min_len=min_len,
  339. max_len=-1 if max_len is None else max_len,
  340. fs_encoding=fs_encoding,
  341. check_reserved=check_reserved,
  342. additional_reserved_names=additional_reserved_names,
  343. ).is_valid(file_path)
  344. def sanitize_filepath(
  345. file_path: PathType,
  346. replacement_text: str = "",
  347. platform: Optional[PlatformType] = None,
  348. max_len: Optional[int] = None,
  349. fs_encoding: Optional[str] = None,
  350. check_reserved: Optional[bool] = None,
  351. null_value_handler: Optional[ValidationErrorHandler] = None,
  352. reserved_name_handler: Optional[ValidationErrorHandler] = None,
  353. additional_reserved_names: Optional[Sequence[str]] = None,
  354. normalize: bool = True,
  355. validate_after_sanitize: bool = False,
  356. ) -> PathType:
  357. """Make a valid file path from a string.
  358. To make a valid file path, the function does the following:
  359. - Replace invalid characters for a file path within the ``file_path``
  360. with the ``replacement_text``. Invalid characters are as follows:
  361. - unprintable characters
  362. - |invalid_file_path_chars|
  363. - for Windows (or universal) only: |invalid_win_file_path_chars|
  364. - Replace a value if a sanitized value is a reserved name by operating systems
  365. with a specified handler by ``reserved_name_handler``.
  366. Args:
  367. file_path:
  368. File path to sanitize.
  369. replacement_text:
  370. Replacement text for invalid characters.
  371. Defaults to ``""``.
  372. platform:
  373. Target platform name of the file path.
  374. .. include:: platform.txt
  375. max_len:
  376. Maximum byte length of the file path.
  377. Truncate the path if the value length exceeds the `max_len`.
  378. If the value is |None| or minus, ``max_len`` will automatically determined by the ``platform``:
  379. - ``Linux``: 4096
  380. - ``macOS``: 1024
  381. - ``Windows``: 260
  382. - ``universal``: 260
  383. fs_encoding:
  384. Filesystem encoding that is used to calculate the byte length of the file path.
  385. If |None|, get the encoding from the execution environment.
  386. check_reserved:
  387. [Deprecated] Use 'reserved_name_handler' instead.
  388. null_value_handler:
  389. Function called when a value after sanitization is an empty string.
  390. You can specify predefined handlers:
  391. - :py:func:`.handler.NullValueHandler.return_null_string`
  392. - :py:func:`.handler.NullValueHandler.return_timestamp`
  393. - :py:func:`.handler.raise_error`
  394. Defaults to :py:func:`.handler.NullValueHandler.return_null_string` that just return ``""``.
  395. reserved_name_handler:
  396. Function called when a value after sanitization is one of the reserved names.
  397. You can specify predefined handlers:
  398. - :py:meth:`~.handler.ReservedNameHandler.add_leading_underscore`
  399. - :py:meth:`~.handler.ReservedNameHandler.add_trailing_underscore`
  400. - :py:meth:`~.handler.ReservedNameHandler.as_is`
  401. - :py:func:`~.handler.raise_error`
  402. Defaults to :py:func:`.handler.add_trailing_underscore`.
  403. additional_reserved_names:
  404. Additional reserved names to sanitize.
  405. Case insensitive.
  406. normalize:
  407. If |True|, normalize the the file path.
  408. validate_after_sanitize:
  409. Execute validation after sanitization to the file path.
  410. Returns:
  411. Same type as the argument (str or PathLike object):
  412. Sanitized filepath.
  413. Raises:
  414. ValueError:
  415. If the ``file_path`` is an invalid file path.
  416. Example:
  417. :ref:`example-sanitize-file-path`
  418. """
  419. if check_reserved is not None:
  420. warnings.warn(
  421. "'check_reserved' is deprecated. Use 'reserved_name_handler' instead.",
  422. DeprecationWarning,
  423. )
  424. if check_reserved is False:
  425. reserved_name_handler = ReservedNameHandler.as_is
  426. return FilePathSanitizer(
  427. platform=platform,
  428. max_len=-1 if max_len is None else max_len,
  429. fs_encoding=fs_encoding,
  430. normalize=normalize,
  431. null_value_handler=null_value_handler,
  432. reserved_name_handler=reserved_name_handler,
  433. additional_reserved_names=additional_reserved_names,
  434. validate_after_sanitize=validate_after_sanitize,
  435. ).sanitize(file_path, replacement_text)