_filename.py 17 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478
  1. """
  2. .. codeauthor:: Tsuyoshi Hombashi <tsuyoshi.hombashi@gmail.com>
  3. """
  4. import itertools
  5. import posixpath
  6. import re
  7. import warnings
  8. from collections.abc import Sequence
  9. from pathlib import Path, PurePath
  10. from re import Pattern
  11. from typing import Final, Optional
  12. from ._base import AbstractSanitizer, AbstractValidator, BaseFile, BaseValidator
  13. from ._common import findall_to_str, is_nt_abspath, to_str, truncate_str, validate_pathtype
  14. from ._const import DEFAULT_MIN_LEN, INVALID_CHAR_ERR_MSG_TMPL, Platform
  15. from ._types import PathType, PlatformType
  16. from .error import ErrorAttrKey, ErrorReason, InvalidCharError, ValidationError
  17. from .handler import ReservedNameHandler, ValidationErrorHandler
  18. _DEFAULT_MAX_FILENAME_LEN: Final = 255
  19. _RE_INVALID_FILENAME: Final = re.compile(
  20. f"[{re.escape(BaseFile._INVALID_FILENAME_CHARS):s}]", re.UNICODE
  21. )
  22. _RE_INVALID_WIN_FILENAME: Final = re.compile(
  23. f"[{re.escape(BaseFile._INVALID_WIN_FILENAME_CHARS):s}]", re.UNICODE
  24. )
  25. class FileNameSanitizer(AbstractSanitizer):
  26. def __init__(
  27. self,
  28. max_len: int = _DEFAULT_MAX_FILENAME_LEN,
  29. fs_encoding: Optional[str] = None,
  30. platform: Optional[PlatformType] = None,
  31. null_value_handler: Optional[ValidationErrorHandler] = None,
  32. reserved_name_handler: Optional[ValidationErrorHandler] = None,
  33. additional_reserved_names: Optional[Sequence[str]] = None,
  34. validate_after_sanitize: bool = False,
  35. validator: Optional[AbstractValidator] = None,
  36. ) -> None:
  37. if validator:
  38. fname_validator = validator
  39. else:
  40. fname_validator = FileNameValidator(
  41. min_len=DEFAULT_MIN_LEN,
  42. max_len=max_len,
  43. fs_encoding=fs_encoding,
  44. check_reserved=True,
  45. additional_reserved_names=additional_reserved_names,
  46. platform=platform,
  47. )
  48. super().__init__(
  49. max_len=max_len,
  50. fs_encoding=fs_encoding,
  51. null_value_handler=null_value_handler,
  52. reserved_name_handler=reserved_name_handler,
  53. additional_reserved_names=additional_reserved_names,
  54. platform=platform,
  55. validate_after_sanitize=validate_after_sanitize,
  56. validator=fname_validator,
  57. )
  58. self._sanitize_regexp = self._get_sanitize_regexp()
  59. def sanitize(self, value: PathType, replacement_text: str = "") -> PathType:
  60. try:
  61. validate_pathtype(value, allow_whitespaces=not self._is_windows(include_universal=True))
  62. except ValidationError as e:
  63. if e.reason == ErrorReason.NULL_NAME:
  64. if isinstance(value, PurePath):
  65. raise
  66. return self._null_value_handler(e) # type: ignore
  67. raise
  68. sanitized_filename = self._sanitize_regexp.sub(replacement_text, str(value))
  69. sanitized_filename = truncate_str(sanitized_filename, self._fs_encoding, self.max_len)
  70. try:
  71. self._validator.validate(sanitized_filename)
  72. except ValidationError as e:
  73. if e.reason == ErrorReason.RESERVED_NAME:
  74. replacement_word = self._reserved_name_handler(e)
  75. if e.reserved_name != replacement_word:
  76. sanitized_filename = re.sub(
  77. re.escape(e.reserved_name), replacement_word, sanitized_filename
  78. )
  79. elif e.reason == ErrorReason.INVALID_CHARACTER and self._is_windows(
  80. include_universal=True
  81. ):
  82. # Do not start a file or directory name with a space
  83. sanitized_filename = sanitized_filename.lstrip(" ")
  84. # Do not end a file or directory name with a space or a period
  85. sanitized_filename = sanitized_filename.rstrip(" ")
  86. if sanitized_filename not in (".", ".."):
  87. sanitized_filename = sanitized_filename.rstrip(" .")
  88. elif e.reason == ErrorReason.NULL_NAME:
  89. sanitized_filename = self._null_value_handler(e)
  90. if self._validate_after_sanitize:
  91. try:
  92. self._validator.validate(sanitized_filename)
  93. except ValidationError as e:
  94. raise ValidationError(
  95. description=str(e),
  96. reason=ErrorReason.INVALID_AFTER_SANITIZE,
  97. platform=self.platform,
  98. )
  99. if isinstance(value, PurePath):
  100. return Path(sanitized_filename) # type: ignore
  101. return sanitized_filename # type: ignore
  102. def _get_sanitize_regexp(self) -> Pattern[str]:
  103. if self._is_windows(include_universal=True):
  104. return _RE_INVALID_WIN_FILENAME
  105. return _RE_INVALID_FILENAME
  106. class FileNameValidator(BaseValidator):
  107. _WINDOWS_RESERVED_FILE_NAMES: Final = (
  108. ("CON", "PRN", "AUX", "CLOCK$", "NUL")
  109. + tuple(f"{name:s}{num:d}" for name, num in itertools.product(("COM", "LPT"), range(0, 10)))
  110. + tuple(
  111. f"{name:s}{ssd:s}"
  112. for name, ssd in itertools.product(
  113. ("COM", "LPT"),
  114. ("\N{SUPERSCRIPT ONE}", "\N{SUPERSCRIPT TWO}", "\N{SUPERSCRIPT THREE}"),
  115. )
  116. )
  117. )
  118. _MACOS_RESERVED_FILE_NAMES: Final = (":",)
  119. @property
  120. def reserved_keywords(self) -> tuple[str, ...]:
  121. common_keywords = super().reserved_keywords
  122. if self._is_universal():
  123. word_set = set(
  124. common_keywords
  125. + self._WINDOWS_RESERVED_FILE_NAMES
  126. + self._MACOS_RESERVED_FILE_NAMES
  127. )
  128. elif self._is_windows():
  129. word_set = set(common_keywords + self._WINDOWS_RESERVED_FILE_NAMES)
  130. elif self._is_posix() or self._is_macos():
  131. word_set = set(common_keywords + self._MACOS_RESERVED_FILE_NAMES)
  132. else:
  133. word_set = set(common_keywords)
  134. return tuple(sorted(word_set))
  135. def __init__(
  136. self,
  137. min_len: int = DEFAULT_MIN_LEN,
  138. max_len: int = _DEFAULT_MAX_FILENAME_LEN,
  139. fs_encoding: Optional[str] = None,
  140. platform: Optional[PlatformType] = None,
  141. check_reserved: bool = True,
  142. additional_reserved_names: Optional[Sequence[str]] = None,
  143. ) -> None:
  144. super().__init__(
  145. min_len=min_len,
  146. max_len=max_len,
  147. fs_encoding=fs_encoding,
  148. check_reserved=check_reserved,
  149. additional_reserved_names=additional_reserved_names,
  150. platform=platform,
  151. )
  152. def validate(self, value: PathType) -> None:
  153. validate_pathtype(value, allow_whitespaces=not self._is_windows(include_universal=True))
  154. unicode_filename = to_str(value)
  155. byte_ct = len(unicode_filename.encode(self._fs_encoding))
  156. self.validate_abspath(unicode_filename)
  157. err_kwargs = {
  158. ErrorAttrKey.REASON: ErrorReason.INVALID_LENGTH,
  159. ErrorAttrKey.PLATFORM: self.platform,
  160. ErrorAttrKey.FS_ENCODING: self._fs_encoding,
  161. ErrorAttrKey.BYTE_COUNT: byte_ct,
  162. ErrorAttrKey.VALUE: unicode_filename,
  163. }
  164. if byte_ct > self.max_len:
  165. raise ValidationError(
  166. [
  167. f"filename is too long: expected<={self.max_len:d} bytes, actual={byte_ct:d} bytes"
  168. ],
  169. **err_kwargs,
  170. )
  171. if byte_ct < self.min_len:
  172. raise ValidationError(
  173. [
  174. f"filename is too short: expected>={self.min_len:d} bytes, actual={byte_ct:d} bytes"
  175. ],
  176. **err_kwargs,
  177. )
  178. self._validate_reserved_keywords(unicode_filename)
  179. self.__validate_universal_filename(unicode_filename)
  180. if self._is_windows(include_universal=True):
  181. self.__validate_win_filename(unicode_filename)
  182. def validate_abspath(self, value: str) -> None:
  183. err = ValidationError(
  184. description=f"found an absolute path ({value!r}), expected a filename",
  185. platform=self.platform,
  186. reason=ErrorReason.FOUND_ABS_PATH,
  187. )
  188. if self._is_windows(include_universal=True):
  189. if is_nt_abspath(value):
  190. raise err
  191. if posixpath.isabs(value):
  192. raise err
  193. def __validate_universal_filename(self, unicode_filename: str) -> None:
  194. match = _RE_INVALID_FILENAME.findall(unicode_filename)
  195. if match:
  196. raise InvalidCharError(
  197. INVALID_CHAR_ERR_MSG_TMPL.format(
  198. invalid=findall_to_str(match),
  199. ),
  200. platform=Platform.UNIVERSAL,
  201. value=unicode_filename,
  202. )
  203. def __validate_win_filename(self, unicode_filename: str) -> None:
  204. match = _RE_INVALID_WIN_FILENAME.findall(unicode_filename)
  205. if match:
  206. raise InvalidCharError(
  207. INVALID_CHAR_ERR_MSG_TMPL.format(
  208. invalid=findall_to_str(match),
  209. ),
  210. platform=Platform.WINDOWS,
  211. value=unicode_filename,
  212. )
  213. if unicode_filename in (".", ".."):
  214. return
  215. KB2829981_err_tmpl = "{}. Refer: https://learn.microsoft.com/en-us/troubleshoot/windows-client/shell-experience/file-folder-name-whitespace-characters" # noqa: E501
  216. err_kwargs = {
  217. ErrorAttrKey.PLATFORM: Platform.WINDOWS,
  218. ErrorAttrKey.VALUE: unicode_filename,
  219. }
  220. if unicode_filename[-1] in (" ", "."):
  221. raise InvalidCharError(
  222. INVALID_CHAR_ERR_MSG_TMPL.format(invalid=re.escape(unicode_filename[-1])),
  223. description=KB2829981_err_tmpl.format(
  224. "Do not end a file or directory name with a space or a period"
  225. ),
  226. **err_kwargs,
  227. )
  228. if unicode_filename[0] in (" "):
  229. raise InvalidCharError(
  230. INVALID_CHAR_ERR_MSG_TMPL.format(invalid=re.escape(unicode_filename[0])),
  231. description=KB2829981_err_tmpl.format(
  232. "Do not start a file or directory name with a space"
  233. ),
  234. **err_kwargs,
  235. )
  236. def validate_filename(
  237. filename: PathType,
  238. platform: Optional[PlatformType] = None,
  239. min_len: int = DEFAULT_MIN_LEN,
  240. max_len: int = _DEFAULT_MAX_FILENAME_LEN,
  241. fs_encoding: Optional[str] = None,
  242. check_reserved: bool = True,
  243. additional_reserved_names: Optional[Sequence[str]] = None,
  244. ) -> None:
  245. """Verifying whether the ``filename`` is a valid file name or not.
  246. Args:
  247. filename:
  248. Filename to validate.
  249. platform:
  250. Target platform name of the filename.
  251. .. include:: platform.txt
  252. min_len:
  253. Minimum byte length of the ``filename``. The value must be greater or equal to one.
  254. Defaults to ``1``.
  255. max_len:
  256. Maximum byte length of the ``filename``. The value must be lower than:
  257. - ``Linux``: 4096
  258. - ``macOS``: 1024
  259. - ``Windows``: 260
  260. - ``universal``: 260
  261. Defaults to ``255``.
  262. fs_encoding:
  263. Filesystem encoding that is used to calculate the byte length of the filename.
  264. If |None|, get the encoding from the execution environment.
  265. check_reserved:
  266. If |True|, check the reserved names of the ``platform``.
  267. additional_reserved_names:
  268. Additional reserved names to check.
  269. Case insensitive.
  270. Raises:
  271. ValidationError (ErrorReason.INVALID_LENGTH):
  272. If the ``filename`` is longer than ``max_len`` characters.
  273. ValidationError (ErrorReason.INVALID_CHARACTER):
  274. If the ``filename`` includes invalid character(s) for a filename:
  275. |invalid_filename_chars|.
  276. The following characters are also invalid for Windows platforms:
  277. |invalid_win_filename_chars|.
  278. ValidationError (ErrorReason.RESERVED_NAME):
  279. If the ``filename`` equals the reserved name by OS.
  280. Windows reserved name is as follows:
  281. ``"CON"``, ``"PRN"``, ``"AUX"``, ``"NUL"``, ``"COM[1-9]"``, ``"LPT[1-9]"``.
  282. Example:
  283. :ref:`example-validate-filename`
  284. See Also:
  285. `Naming Files, Paths, and Namespaces - Win32 apps | Microsoft Docs
  286. <https://docs.microsoft.com/en-us/windows/win32/fileio/naming-a-file>`__
  287. """
  288. FileNameValidator(
  289. platform=platform,
  290. min_len=min_len,
  291. max_len=max_len,
  292. fs_encoding=fs_encoding,
  293. check_reserved=check_reserved,
  294. additional_reserved_names=additional_reserved_names,
  295. ).validate(filename)
  296. def is_valid_filename(
  297. filename: PathType,
  298. platform: Optional[PlatformType] = None,
  299. min_len: int = DEFAULT_MIN_LEN,
  300. max_len: Optional[int] = None,
  301. fs_encoding: Optional[str] = None,
  302. check_reserved: bool = True,
  303. additional_reserved_names: Optional[Sequence[str]] = None,
  304. ) -> bool:
  305. """Check whether the ``filename`` is a valid name or not.
  306. Args:
  307. filename:
  308. A filename to be checked.
  309. platform:
  310. Target platform name of the filename.
  311. Example:
  312. :ref:`example-is-valid-filename`
  313. See Also:
  314. :py:func:`.validate_filename()`
  315. """
  316. return FileNameValidator(
  317. platform=platform,
  318. min_len=min_len,
  319. max_len=-1 if max_len is None else max_len,
  320. fs_encoding=fs_encoding,
  321. check_reserved=check_reserved,
  322. additional_reserved_names=additional_reserved_names,
  323. ).is_valid(filename)
  324. def sanitize_filename(
  325. filename: PathType,
  326. replacement_text: str = "",
  327. platform: Optional[PlatformType] = None,
  328. max_len: Optional[int] = _DEFAULT_MAX_FILENAME_LEN,
  329. fs_encoding: Optional[str] = None,
  330. check_reserved: Optional[bool] = None,
  331. null_value_handler: Optional[ValidationErrorHandler] = None,
  332. reserved_name_handler: Optional[ValidationErrorHandler] = None,
  333. additional_reserved_names: Optional[Sequence[str]] = None,
  334. validate_after_sanitize: bool = False,
  335. ) -> PathType:
  336. """Make a valid filename from a string.
  337. To make a valid filename, the function does the following:
  338. - Replace invalid characters as file names included in the ``filename``
  339. with the ``replacement_text``. Invalid characters are:
  340. - unprintable characters
  341. - |invalid_filename_chars|
  342. - for Windows (or universal) only: |invalid_win_filename_chars|
  343. - Replace a value if a sanitized value is a reserved name by operating systems
  344. with a specified handler by ``reserved_name_handler``.
  345. Args:
  346. filename: Filename to sanitize.
  347. replacement_text:
  348. Replacement text for invalid characters. Defaults to ``""``.
  349. platform:
  350. Target platform name of the filename.
  351. .. include:: platform.txt
  352. max_len:
  353. Maximum byte length of the ``filename``.
  354. Truncate the name length if the ``filename`` length exceeds this value.
  355. Defaults to ``255``.
  356. fs_encoding:
  357. Filesystem encoding that is used to calculate the byte length of the filename.
  358. If |None|, get the encoding from the execution environment.
  359. check_reserved:
  360. [Deprecated] Use 'reserved_name_handler' instead.
  361. null_value_handler:
  362. Function called when a value after sanitization is an empty string.
  363. You can specify predefined handlers:
  364. - :py:func:`~.handler.NullValueHandler.return_null_string`
  365. - :py:func:`~.handler.NullValueHandler.return_timestamp`
  366. - :py:func:`~.handler.raise_error`
  367. Defaults to :py:func:`.handler.NullValueHandler.return_null_string` that just return ``""``.
  368. reserved_name_handler:
  369. Function called when a value after sanitization is a reserved name.
  370. You can specify predefined handlers:
  371. - :py:meth:`~.handler.ReservedNameHandler.add_leading_underscore`
  372. - :py:meth:`~.handler.ReservedNameHandler.add_trailing_underscore`
  373. - :py:meth:`~.handler.ReservedNameHandler.as_is`
  374. - :py:func:`~.handler.raise_error`
  375. Defaults to :py:func:`.handler.add_trailing_underscore`.
  376. additional_reserved_names:
  377. Additional reserved names to sanitize.
  378. Case insensitive.
  379. validate_after_sanitize:
  380. Execute validation after sanitization to the file name.
  381. Returns:
  382. Same type as the ``filename`` (str or PathLike object):
  383. Sanitized filename.
  384. Raises:
  385. ValueError:
  386. If the ``filename`` is an invalid filename.
  387. Example:
  388. :ref:`example-sanitize-filename`
  389. """
  390. if check_reserved is not None:
  391. warnings.warn(
  392. "'check_reserved' is deprecated. Use 'reserved_name_handler' instead.",
  393. DeprecationWarning,
  394. )
  395. if check_reserved is False:
  396. reserved_name_handler = ReservedNameHandler.as_is
  397. return FileNameSanitizer(
  398. platform=platform,
  399. max_len=-1 if max_len is None else max_len,
  400. fs_encoding=fs_encoding,
  401. null_value_handler=null_value_handler,
  402. reserved_name_handler=reserved_name_handler,
  403. additional_reserved_names=additional_reserved_names,
  404. validate_after_sanitize=validate_after_sanitize,
  405. ).sanitize(filename, replacement_text)