filelist.py 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431
  1. """distutils.filelist
  2. Provides the FileList class, used for poking about the filesystem
  3. and building lists of files.
  4. """
  5. from __future__ import annotations
  6. import fnmatch
  7. import functools
  8. import os
  9. import re
  10. from collections.abc import Iterable
  11. from typing import Literal, overload
  12. from ._log import log
  13. from .errors import DistutilsInternalError, DistutilsTemplateError
  14. from .util import convert_path
  15. class FileList:
  16. """A list of files built by on exploring the filesystem and filtered by
  17. applying various patterns to what we find there.
  18. Instance attributes:
  19. dir
  20. directory from which files will be taken -- only used if
  21. 'allfiles' not supplied to constructor
  22. files
  23. list of filenames currently being built/filtered/manipulated
  24. allfiles
  25. complete list of files under consideration (ie. without any
  26. filtering applied)
  27. """
  28. def __init__(self, warn: object = None, debug_print: object = None) -> None:
  29. # ignore argument to FileList, but keep them for backwards
  30. # compatibility
  31. self.allfiles: Iterable[str] | None = None
  32. self.files: list[str] = []
  33. def set_allfiles(self, allfiles: Iterable[str]) -> None:
  34. self.allfiles = allfiles
  35. def findall(self, dir: str | os.PathLike[str] = os.curdir) -> None:
  36. self.allfiles = findall(dir)
  37. def debug_print(self, msg: object) -> None:
  38. """Print 'msg' to stdout if the global DEBUG (taken from the
  39. DISTUTILS_DEBUG environment variable) flag is true.
  40. """
  41. from distutils.debug import DEBUG
  42. if DEBUG:
  43. print(msg)
  44. # Collection methods
  45. def append(self, item: str) -> None:
  46. self.files.append(item)
  47. def extend(self, items: Iterable[str]) -> None:
  48. self.files.extend(items)
  49. def sort(self) -> None:
  50. # Not a strict lexical sort!
  51. sortable_files = sorted(map(os.path.split, self.files))
  52. self.files = []
  53. for sort_tuple in sortable_files:
  54. self.files.append(os.path.join(*sort_tuple))
  55. # Other miscellaneous utility methods
  56. def remove_duplicates(self) -> None:
  57. # Assumes list has been sorted!
  58. for i in range(len(self.files) - 1, 0, -1):
  59. if self.files[i] == self.files[i - 1]:
  60. del self.files[i]
  61. # "File template" methods
  62. def _parse_template_line(self, line):
  63. words = line.split()
  64. action = words[0]
  65. patterns = dir = dir_pattern = None
  66. if action in ('include', 'exclude', 'global-include', 'global-exclude'):
  67. if len(words) < 2:
  68. raise DistutilsTemplateError(
  69. f"'{action}' expects <pattern1> <pattern2> ..."
  70. )
  71. patterns = [convert_path(w) for w in words[1:]]
  72. elif action in ('recursive-include', 'recursive-exclude'):
  73. if len(words) < 3:
  74. raise DistutilsTemplateError(
  75. f"'{action}' expects <dir> <pattern1> <pattern2> ..."
  76. )
  77. dir = convert_path(words[1])
  78. patterns = [convert_path(w) for w in words[2:]]
  79. elif action in ('graft', 'prune'):
  80. if len(words) != 2:
  81. raise DistutilsTemplateError(
  82. f"'{action}' expects a single <dir_pattern>"
  83. )
  84. dir_pattern = convert_path(words[1])
  85. else:
  86. raise DistutilsTemplateError(f"unknown action '{action}'")
  87. return (action, patterns, dir, dir_pattern)
  88. def process_template_line(self, line: str) -> None: # noqa: C901
  89. # Parse the line: split it up, make sure the right number of words
  90. # is there, and return the relevant words. 'action' is always
  91. # defined: it's the first word of the line. Which of the other
  92. # three are defined depends on the action; it'll be either
  93. # patterns, (dir and patterns), or (dir_pattern).
  94. (action, patterns, dir, dir_pattern) = self._parse_template_line(line)
  95. # OK, now we know that the action is valid and we have the
  96. # right number of words on the line for that action -- so we
  97. # can proceed with minimal error-checking.
  98. if action == 'include':
  99. self.debug_print("include " + ' '.join(patterns))
  100. for pattern in patterns:
  101. if not self.include_pattern(pattern, anchor=True):
  102. log.warning("warning: no files found matching '%s'", pattern)
  103. elif action == 'exclude':
  104. self.debug_print("exclude " + ' '.join(patterns))
  105. for pattern in patterns:
  106. if not self.exclude_pattern(pattern, anchor=True):
  107. log.warning(
  108. "warning: no previously-included files found matching '%s'",
  109. pattern,
  110. )
  111. elif action == 'global-include':
  112. self.debug_print("global-include " + ' '.join(patterns))
  113. for pattern in patterns:
  114. if not self.include_pattern(pattern, anchor=False):
  115. log.warning(
  116. (
  117. "warning: no files found matching '%s' "
  118. "anywhere in distribution"
  119. ),
  120. pattern,
  121. )
  122. elif action == 'global-exclude':
  123. self.debug_print("global-exclude " + ' '.join(patterns))
  124. for pattern in patterns:
  125. if not self.exclude_pattern(pattern, anchor=False):
  126. log.warning(
  127. (
  128. "warning: no previously-included files matching "
  129. "'%s' found anywhere in distribution"
  130. ),
  131. pattern,
  132. )
  133. elif action == 'recursive-include':
  134. self.debug_print("recursive-include {} {}".format(dir, ' '.join(patterns)))
  135. for pattern in patterns:
  136. if not self.include_pattern(pattern, prefix=dir):
  137. msg = "warning: no files found matching '%s' under directory '%s'"
  138. log.warning(msg, pattern, dir)
  139. elif action == 'recursive-exclude':
  140. self.debug_print("recursive-exclude {} {}".format(dir, ' '.join(patterns)))
  141. for pattern in patterns:
  142. if not self.exclude_pattern(pattern, prefix=dir):
  143. log.warning(
  144. (
  145. "warning: no previously-included files matching "
  146. "'%s' found under directory '%s'"
  147. ),
  148. pattern,
  149. dir,
  150. )
  151. elif action == 'graft':
  152. self.debug_print("graft " + dir_pattern)
  153. if not self.include_pattern(None, prefix=dir_pattern):
  154. log.warning("warning: no directories found matching '%s'", dir_pattern)
  155. elif action == 'prune':
  156. self.debug_print("prune " + dir_pattern)
  157. if not self.exclude_pattern(None, prefix=dir_pattern):
  158. log.warning(
  159. ("no previously-included directories found matching '%s'"),
  160. dir_pattern,
  161. )
  162. else:
  163. raise DistutilsInternalError(
  164. f"this cannot happen: invalid action '{action}'"
  165. )
  166. # Filtering/selection methods
  167. @overload
  168. def include_pattern(
  169. self,
  170. pattern: str,
  171. anchor: bool = True,
  172. prefix: str | None = None,
  173. is_regex: Literal[False] = False,
  174. ) -> bool: ...
  175. @overload
  176. def include_pattern(
  177. self,
  178. pattern: str | re.Pattern[str],
  179. anchor: bool = True,
  180. prefix: str | None = None,
  181. *,
  182. is_regex: Literal[True],
  183. ) -> bool: ...
  184. @overload
  185. def include_pattern(
  186. self,
  187. pattern: str | re.Pattern[str],
  188. anchor: bool,
  189. prefix: str | None,
  190. is_regex: Literal[True],
  191. ) -> bool: ...
  192. def include_pattern(
  193. self,
  194. pattern: str | re.Pattern,
  195. anchor: bool = True,
  196. prefix: str | None = None,
  197. is_regex: bool = False,
  198. ) -> bool:
  199. """Select strings (presumably filenames) from 'self.files' that
  200. match 'pattern', a Unix-style wildcard (glob) pattern. Patterns
  201. are not quite the same as implemented by the 'fnmatch' module: '*'
  202. and '?' match non-special characters, where "special" is platform-
  203. dependent: slash on Unix; colon, slash, and backslash on
  204. DOS/Windows; and colon on Mac OS.
  205. If 'anchor' is true (the default), then the pattern match is more
  206. stringent: "*.py" will match "foo.py" but not "foo/bar.py". If
  207. 'anchor' is false, both of these will match.
  208. If 'prefix' is supplied, then only filenames starting with 'prefix'
  209. (itself a pattern) and ending with 'pattern', with anything in between
  210. them, will match. 'anchor' is ignored in this case.
  211. If 'is_regex' is true, 'anchor' and 'prefix' are ignored, and
  212. 'pattern' is assumed to be either a string containing a regex or a
  213. regex object -- no translation is done, the regex is just compiled
  214. and used as-is.
  215. Selected strings will be added to self.files.
  216. Return True if files are found, False otherwise.
  217. """
  218. # XXX docstring lying about what the special chars are?
  219. files_found = False
  220. pattern_re = translate_pattern(pattern, anchor, prefix, is_regex)
  221. self.debug_print(f"include_pattern: applying regex r'{pattern_re.pattern}'")
  222. # delayed loading of allfiles list
  223. if self.allfiles is None:
  224. self.findall()
  225. for name in self.allfiles:
  226. if pattern_re.search(name):
  227. self.debug_print(" adding " + name)
  228. self.files.append(name)
  229. files_found = True
  230. return files_found
  231. @overload
  232. def exclude_pattern(
  233. self,
  234. pattern: str,
  235. anchor: bool = True,
  236. prefix: str | None = None,
  237. is_regex: Literal[False] = False,
  238. ) -> bool: ...
  239. @overload
  240. def exclude_pattern(
  241. self,
  242. pattern: str | re.Pattern[str],
  243. anchor: bool = True,
  244. prefix: str | None = None,
  245. *,
  246. is_regex: Literal[True],
  247. ) -> bool: ...
  248. @overload
  249. def exclude_pattern(
  250. self,
  251. pattern: str | re.Pattern[str],
  252. anchor: bool,
  253. prefix: str | None,
  254. is_regex: Literal[True],
  255. ) -> bool: ...
  256. def exclude_pattern(
  257. self,
  258. pattern: str | re.Pattern,
  259. anchor: bool = True,
  260. prefix: str | None = None,
  261. is_regex: bool = False,
  262. ) -> bool:
  263. """Remove strings (presumably filenames) from 'files' that match
  264. 'pattern'. Other parameters are the same as for
  265. 'include_pattern()', above.
  266. The list 'self.files' is modified in place.
  267. Return True if files are found, False otherwise.
  268. """
  269. files_found = False
  270. pattern_re = translate_pattern(pattern, anchor, prefix, is_regex)
  271. self.debug_print(f"exclude_pattern: applying regex r'{pattern_re.pattern}'")
  272. for i in range(len(self.files) - 1, -1, -1):
  273. if pattern_re.search(self.files[i]):
  274. self.debug_print(" removing " + self.files[i])
  275. del self.files[i]
  276. files_found = True
  277. return files_found
  278. # Utility functions
  279. def _find_all_simple(path):
  280. """
  281. Find all files under 'path'
  282. """
  283. all_unique = _UniqueDirs.filter(os.walk(path, followlinks=True))
  284. results = (
  285. os.path.join(base, file) for base, dirs, files in all_unique for file in files
  286. )
  287. return filter(os.path.isfile, results)
  288. class _UniqueDirs(set):
  289. """
  290. Exclude previously-seen dirs from walk results,
  291. avoiding infinite recursion.
  292. Ref https://bugs.python.org/issue44497.
  293. """
  294. def __call__(self, walk_item):
  295. """
  296. Given an item from an os.walk result, determine
  297. if the item represents a unique dir for this instance
  298. and if not, prevent further traversal.
  299. """
  300. base, dirs, files = walk_item
  301. stat = os.stat(base)
  302. candidate = stat.st_dev, stat.st_ino
  303. found = candidate in self
  304. if found:
  305. del dirs[:]
  306. self.add(candidate)
  307. return not found
  308. @classmethod
  309. def filter(cls, items):
  310. return filter(cls(), items)
  311. def findall(dir: str | os.PathLike[str] = os.curdir):
  312. """
  313. Find all files under 'dir' and return the list of full filenames.
  314. Unless dir is '.', return full filenames with dir prepended.
  315. """
  316. files = _find_all_simple(dir)
  317. if dir == os.curdir:
  318. make_rel = functools.partial(os.path.relpath, start=dir)
  319. files = map(make_rel, files)
  320. return list(files)
  321. def glob_to_re(pattern):
  322. """Translate a shell-like glob pattern to a regular expression; return
  323. a string containing the regex. Differs from 'fnmatch.translate()' in
  324. that '*' does not match "special characters" (which are
  325. platform-specific).
  326. """
  327. pattern_re = fnmatch.translate(pattern)
  328. # '?' and '*' in the glob pattern become '.' and '.*' in the RE, which
  329. # IMHO is wrong -- '?' and '*' aren't supposed to match slash in Unix,
  330. # and by extension they shouldn't match such "special characters" under
  331. # any OS. So change all non-escaped dots in the RE to match any
  332. # character except the special characters (currently: just os.sep).
  333. sep = os.sep
  334. if os.sep == '\\':
  335. # we're using a regex to manipulate a regex, so we need
  336. # to escape the backslash twice
  337. sep = r'\\\\'
  338. escaped = rf'\1[^{sep}]'
  339. pattern_re = re.sub(r'((?<!\\)(\\\\)*)\.', escaped, pattern_re)
  340. return pattern_re
  341. def translate_pattern(pattern, anchor=True, prefix=None, is_regex=False):
  342. """Translate a shell-like wildcard pattern to a compiled regular
  343. expression. Return the compiled regex. If 'is_regex' true,
  344. then 'pattern' is directly compiled to a regex (if it's a string)
  345. or just returned as-is (assumes it's a regex object).
  346. """
  347. if is_regex:
  348. if isinstance(pattern, str):
  349. return re.compile(pattern)
  350. else:
  351. return pattern
  352. # ditch start and end characters
  353. start, _, end = glob_to_re('_').partition('_')
  354. if pattern:
  355. pattern_re = glob_to_re(pattern)
  356. assert pattern_re.startswith(start) and pattern_re.endswith(end)
  357. else:
  358. pattern_re = ''
  359. if prefix is not None:
  360. prefix_re = glob_to_re(prefix)
  361. assert prefix_re.startswith(start) and prefix_re.endswith(end)
  362. prefix_re = prefix_re[len(start) : len(prefix_re) - len(end)]
  363. sep = os.sep
  364. if os.sep == '\\':
  365. sep = r'\\'
  366. pattern_re = pattern_re[len(start) : len(pattern_re) - len(end)]
  367. pattern_re = rf'{start}\A{prefix_re}{sep}.*{pattern_re}{end}'
  368. else: # no prefix -- respect anchor flag
  369. if anchor:
  370. pattern_re = rf'{start}\A{pattern_re[len(start) :]}'
  371. return re.compile(pattern_re)