local.py 17 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518
  1. import datetime
  2. import io
  3. import logging
  4. import os
  5. import os.path as osp
  6. import shutil
  7. import stat
  8. import tempfile
  9. from functools import lru_cache
  10. from fsspec import AbstractFileSystem
  11. from fsspec.compression import compr
  12. from fsspec.core import get_compression
  13. from fsspec.utils import isfilelike, stringify_path
  14. logger = logging.getLogger("fsspec.local")
  15. class LocalFileSystem(AbstractFileSystem):
  16. """Interface to files on local storage
  17. Parameters
  18. ----------
  19. auto_mkdir: bool
  20. Whether, when opening a file, the directory containing it should
  21. be created (if it doesn't already exist). This is assumed by pyarrow
  22. code.
  23. """
  24. root_marker = "/"
  25. protocol = "file", "local"
  26. local_file = True
  27. def __init__(self, auto_mkdir=False, **kwargs):
  28. super().__init__(**kwargs)
  29. self.auto_mkdir = auto_mkdir
  30. @property
  31. def fsid(self):
  32. return "local"
  33. def mkdir(self, path, create_parents=True, **kwargs):
  34. path = self._strip_protocol(path)
  35. if self.exists(path):
  36. raise FileExistsError(path)
  37. if create_parents:
  38. self.makedirs(path, exist_ok=True)
  39. else:
  40. os.mkdir(path, **kwargs)
  41. def makedirs(self, path, exist_ok=False):
  42. path = self._strip_protocol(path)
  43. os.makedirs(path, exist_ok=exist_ok)
  44. def rmdir(self, path):
  45. path = self._strip_protocol(path)
  46. os.rmdir(path)
  47. def ls(self, path, detail=False, **kwargs):
  48. path = self._strip_protocol(path)
  49. path_info = self.info(path)
  50. infos = []
  51. if path_info["type"] == "directory":
  52. with os.scandir(path) as it:
  53. for f in it:
  54. try:
  55. # Only get the info if requested since it is a bit expensive (the stat call inside)
  56. # The strip_protocol is also used in info() and calls make_path_posix to always return posix paths
  57. info = self.info(f) if detail else self._strip_protocol(f.path)
  58. infos.append(info)
  59. except FileNotFoundError:
  60. pass
  61. else:
  62. infos = [path_info] if detail else [path_info["name"]]
  63. return infos
  64. def info(self, path, **kwargs):
  65. if isinstance(path, os.DirEntry):
  66. # scandir DirEntry
  67. out = path.stat(follow_symlinks=False)
  68. link = path.is_symlink()
  69. if path.is_dir(follow_symlinks=False):
  70. t = "directory"
  71. elif path.is_file(follow_symlinks=False):
  72. t = "file"
  73. else:
  74. t = "other"
  75. size = out.st_size
  76. if link:
  77. try:
  78. out2 = path.stat(follow_symlinks=True)
  79. size = out2.st_size
  80. except OSError:
  81. size = 0
  82. path = self._strip_protocol(path.path)
  83. else:
  84. # str or path-like
  85. path = self._strip_protocol(path)
  86. out = os.stat(path, follow_symlinks=False)
  87. link = stat.S_ISLNK(out.st_mode)
  88. if link:
  89. out = os.stat(path, follow_symlinks=True)
  90. size = out.st_size
  91. if stat.S_ISDIR(out.st_mode):
  92. t = "directory"
  93. elif stat.S_ISREG(out.st_mode):
  94. t = "file"
  95. else:
  96. t = "other"
  97. # Check for the 'st_birthtime' attribute, which is not always present; fallback to st_ctime
  98. created_time = getattr(out, "st_birthtime", out.st_ctime)
  99. result = {
  100. "name": path,
  101. "size": size,
  102. "type": t,
  103. "created": created_time,
  104. "islink": link,
  105. }
  106. for field in ["mode", "uid", "gid", "mtime", "ino", "nlink"]:
  107. result[field] = getattr(out, f"st_{field}")
  108. if link:
  109. result["destination"] = os.readlink(path)
  110. return result
  111. def lexists(self, path, **kwargs):
  112. return osp.lexists(path)
  113. def cp_file(self, path1, path2, **kwargs):
  114. path1 = self._strip_protocol(path1)
  115. path2 = self._strip_protocol(path2)
  116. if self.auto_mkdir:
  117. self.makedirs(self._parent(path2), exist_ok=True)
  118. if self.isfile(path1):
  119. shutil.copyfile(path1, path2)
  120. elif self.isdir(path1):
  121. self.mkdirs(path2, exist_ok=True)
  122. else:
  123. raise FileNotFoundError(path1)
  124. def isfile(self, path):
  125. path = self._strip_protocol(path)
  126. return os.path.isfile(path)
  127. def isdir(self, path):
  128. path = self._strip_protocol(path)
  129. return os.path.isdir(path)
  130. def get_file(self, path1, path2, callback=None, **kwargs):
  131. if isfilelike(path2):
  132. with open(path1, "rb") as f:
  133. shutil.copyfileobj(f, path2)
  134. else:
  135. return self.cp_file(path1, path2, **kwargs)
  136. def put_file(self, path1, path2, callback=None, **kwargs):
  137. return self.cp_file(path1, path2, **kwargs)
  138. def mv(self, path1, path2, recursive: bool = True, **kwargs):
  139. """Move files/directories
  140. For the specific case of local, all ops on directories are recursive and
  141. the recursive= kwarg is ignored.
  142. """
  143. path1 = self._strip_protocol(path1)
  144. path2 = self._strip_protocol(path2)
  145. if self.auto_mkdir:
  146. self.makedirs(self._parent(path2), exist_ok=True)
  147. shutil.move(path1, path2)
  148. def link(self, src, dst, **kwargs):
  149. src = self._strip_protocol(src)
  150. dst = self._strip_protocol(dst)
  151. os.link(src, dst, **kwargs)
  152. def symlink(self, src, dst, **kwargs):
  153. src = self._strip_protocol(src)
  154. dst = self._strip_protocol(dst)
  155. os.symlink(src, dst, **kwargs)
  156. def islink(self, path) -> bool:
  157. return os.path.islink(self._strip_protocol(path))
  158. def rm_file(self, path):
  159. os.remove(self._strip_protocol(path))
  160. def rm(self, path, recursive=False, maxdepth=None):
  161. if not isinstance(path, list):
  162. path = [path]
  163. for p in path:
  164. p = self._strip_protocol(p)
  165. if self.isdir(p):
  166. if not recursive:
  167. raise ValueError("Cannot delete directory, set recursive=True")
  168. if osp.abspath(p) == os.getcwd():
  169. raise ValueError("Cannot delete current working directory")
  170. shutil.rmtree(p)
  171. else:
  172. os.remove(p)
  173. def unstrip_protocol(self, name):
  174. name = self._strip_protocol(name) # normalise for local/win/...
  175. return f"file://{name}"
  176. def _open(self, path, mode="rb", block_size=None, **kwargs):
  177. path = self._strip_protocol(path)
  178. if self.auto_mkdir and "w" in mode:
  179. self.makedirs(self._parent(path), exist_ok=True)
  180. return LocalFileOpener(path, mode, fs=self, **kwargs)
  181. def touch(self, path, truncate=True, **kwargs):
  182. path = self._strip_protocol(path)
  183. if self.auto_mkdir:
  184. self.makedirs(self._parent(path), exist_ok=True)
  185. if self.exists(path):
  186. os.utime(path, None)
  187. else:
  188. open(path, "a").close()
  189. if truncate:
  190. os.truncate(path, 0)
  191. def created(self, path):
  192. info = self.info(path=path)
  193. return datetime.datetime.fromtimestamp(
  194. info["created"], tz=datetime.timezone.utc
  195. )
  196. def modified(self, path):
  197. info = self.info(path=path)
  198. return datetime.datetime.fromtimestamp(info["mtime"], tz=datetime.timezone.utc)
  199. @classmethod
  200. def _parent(cls, path):
  201. path = cls._strip_protocol(path)
  202. if os.sep == "/":
  203. # posix native
  204. return path.rsplit("/", 1)[0] or "/"
  205. else:
  206. # NT
  207. path_ = path.rsplit("/", 1)[0]
  208. if len(path_) <= 3:
  209. if path_[1:2] == ":":
  210. # nt root (something like c:/)
  211. return path_[0] + ":/"
  212. # More cases may be required here
  213. return path_
  214. @classmethod
  215. def _strip_protocol(cls, path):
  216. path = stringify_path(path)
  217. if path.startswith("file://"):
  218. path = path[7:]
  219. elif path.startswith("file:"):
  220. path = path[5:]
  221. elif path.startswith("local://"):
  222. path = path[8:]
  223. elif path.startswith("local:"):
  224. path = path[6:]
  225. path = make_path_posix(path)
  226. if os.sep != "/":
  227. # This code-path is a stripped down version of
  228. # > drive, path = ntpath.splitdrive(path)
  229. if path[1:2] == ":":
  230. # Absolute drive-letter path, e.g. X:\Windows
  231. # Relative path with drive, e.g. X:Windows
  232. drive, path = path[:2], path[2:]
  233. elif path[:2] == "//":
  234. # UNC drives, e.g. \\server\share or \\?\UNC\server\share
  235. # Device drives, e.g. \\.\device or \\?\device
  236. if (index1 := path.find("/", 2)) == -1 or (
  237. index2 := path.find("/", index1 + 1)
  238. ) == -1:
  239. drive, path = path, ""
  240. else:
  241. drive, path = path[:index2], path[index2:]
  242. else:
  243. # Relative path, e.g. Windows
  244. drive = ""
  245. path = path.rstrip("/") or cls.root_marker
  246. return drive + path
  247. else:
  248. return path.rstrip("/") or cls.root_marker
  249. def _isfilestore(self):
  250. # Inheriting from DaskFileSystem makes this False (S3, etc. were)
  251. # the original motivation. But we are a posix-like file system.
  252. # See https://github.com/dask/dask/issues/5526
  253. return True
  254. def chmod(self, path, mode):
  255. path = stringify_path(path)
  256. return os.chmod(path, mode)
  257. def make_path_posix(path):
  258. """Make path generic and absolute for current OS"""
  259. if not isinstance(path, str):
  260. if isinstance(path, (list, set, tuple)):
  261. return type(path)(make_path_posix(p) for p in path)
  262. else:
  263. path = stringify_path(path)
  264. if not isinstance(path, str):
  265. raise TypeError(f"could not convert {path!r} to string")
  266. if os.sep == "/":
  267. # Native posix
  268. if path.startswith("/"):
  269. # most common fast case for posix
  270. return path
  271. elif path.startswith("~"):
  272. return osp.expanduser(path)
  273. elif path.startswith("./"):
  274. path = path[2:]
  275. elif path == ".":
  276. path = ""
  277. return f"{os.getcwd()}/{path}"
  278. else:
  279. # NT handling
  280. if path[0:1] == "/" and path[2:3] == ":":
  281. # path is like "/c:/local/path"
  282. path = path[1:]
  283. if path[1:2] == ":":
  284. # windows full path like "C:\\local\\path"
  285. if len(path) <= 3:
  286. # nt root (something like c:/)
  287. return path[0] + ":/"
  288. path = path.replace("\\", "/")
  289. return path
  290. elif path[0:1] == "~":
  291. return make_path_posix(osp.expanduser(path))
  292. elif path.startswith(("\\\\", "//")):
  293. # windows UNC/DFS-style paths
  294. return "//" + path[2:].replace("\\", "/")
  295. elif path.startswith(("\\", "/")):
  296. # windows relative path with root
  297. path = path.replace("\\", "/")
  298. return f"{osp.splitdrive(os.getcwd())[0]}{path}"
  299. else:
  300. path = path.replace("\\", "/")
  301. if path.startswith("./"):
  302. path = path[2:]
  303. elif path == ".":
  304. path = ""
  305. return f"{make_path_posix(os.getcwd())}/{path}"
  306. def trailing_sep(path):
  307. """Return True if the path ends with a path separator.
  308. A forward slash is always considered a path separator, even on Operating
  309. Systems that normally use a backslash.
  310. """
  311. # TODO: if all incoming paths were posix-compliant then separator would
  312. # always be a forward slash, simplifying this function.
  313. # See https://github.com/fsspec/filesystem_spec/pull/1250
  314. return path.endswith(os.sep) or (os.altsep is not None and path.endswith(os.altsep))
  315. @lru_cache(maxsize=1)
  316. def get_umask(mask: int = 0o666) -> int:
  317. """Get the current umask.
  318. Follows https://stackoverflow.com/a/44130549 to get the umask.
  319. Temporarily sets the umask to the given value, and then resets it to the
  320. original value.
  321. """
  322. value = os.umask(mask)
  323. os.umask(value)
  324. return value
  325. class LocalFileOpener(io.IOBase):
  326. def __init__(
  327. self, path, mode, autocommit=True, fs=None, compression=None, **kwargs
  328. ):
  329. logger.debug("open file: %s", path)
  330. self.path = path
  331. self.mode = mode
  332. self.fs = fs
  333. self.f = None
  334. self.autocommit = autocommit
  335. self.compression = get_compression(path, compression)
  336. self.blocksize = io.DEFAULT_BUFFER_SIZE
  337. self._open()
  338. def _open(self):
  339. if self.f is None or self.f.closed:
  340. if self.autocommit or "w" not in self.mode:
  341. self.f = open(self.path, mode=self.mode)
  342. if self.compression:
  343. compress = compr[self.compression]
  344. self.f = compress(self.f, mode=self.mode)
  345. else:
  346. # TODO: check if path is writable?
  347. i, name = tempfile.mkstemp()
  348. os.close(i) # we want normal open and normal buffered file
  349. self.temp = name
  350. self.f = open(name, mode=self.mode)
  351. if "w" not in self.mode:
  352. self.size = self.f.seek(0, 2)
  353. self.f.seek(0)
  354. self.f.size = self.size
  355. def _fetch_range(self, start, end):
  356. # probably only used by cached FS
  357. if "r" not in self.mode:
  358. raise ValueError
  359. self._open()
  360. self.f.seek(start)
  361. return self.f.read(end - start)
  362. def __setstate__(self, state):
  363. self.f = None
  364. loc = state.pop("loc", None)
  365. self.__dict__.update(state)
  366. if "r" in state["mode"]:
  367. self.f = None
  368. self._open()
  369. self.f.seek(loc)
  370. def __getstate__(self):
  371. d = self.__dict__.copy()
  372. d.pop("f")
  373. if "r" in self.mode:
  374. d["loc"] = self.f.tell()
  375. else:
  376. if not self.f.closed:
  377. raise ValueError("Cannot serialise open write-mode local file")
  378. return d
  379. def commit(self):
  380. if self.autocommit:
  381. raise RuntimeError("Can only commit if not already set to autocommit")
  382. try:
  383. shutil.move(self.temp, self.path)
  384. except PermissionError as e:
  385. # shutil.move raises PermissionError if os.rename
  386. # and the default copy2 fallback with shutil.copystats fail.
  387. # The file should be there nonetheless, but without copied permissions.
  388. # If it doesn't exist, there was no permission to create the file.
  389. if not os.path.exists(self.path):
  390. raise e
  391. else:
  392. # If PermissionError is not raised, permissions can be set.
  393. try:
  394. mask = 0o666
  395. os.chmod(self.path, mask & ~get_umask(mask))
  396. except RuntimeError:
  397. pass
  398. def discard(self):
  399. if self.autocommit:
  400. raise RuntimeError("Cannot discard if set to autocommit")
  401. os.remove(self.temp)
  402. def readable(self) -> bool:
  403. return True
  404. def writable(self) -> bool:
  405. return "r" not in self.mode
  406. def read(self, *args, **kwargs):
  407. return self.f.read(*args, **kwargs)
  408. def write(self, *args, **kwargs):
  409. return self.f.write(*args, **kwargs)
  410. def tell(self, *args, **kwargs):
  411. return self.f.tell(*args, **kwargs)
  412. def seek(self, *args, **kwargs):
  413. return self.f.seek(*args, **kwargs)
  414. def seekable(self, *args, **kwargs):
  415. return self.f.seekable(*args, **kwargs)
  416. def readline(self, *args, **kwargs):
  417. return self.f.readline(*args, **kwargs)
  418. def readlines(self, *args, **kwargs):
  419. return self.f.readlines(*args, **kwargs)
  420. def close(self):
  421. return self.f.close()
  422. def truncate(self, size=None) -> int:
  423. return self.f.truncate(size)
  424. @property
  425. def closed(self):
  426. return self.f.closed
  427. def fileno(self):
  428. return self.raw.fileno()
  429. def flush(self) -> None:
  430. self.f.flush()
  431. def __iter__(self):
  432. return self.f.__iter__()
  433. def __getattr__(self, item):
  434. return getattr(self.f, item)
  435. def __enter__(self):
  436. self._incontext = True
  437. return self
  438. def __exit__(self, exc_type, exc_value, traceback):
  439. self._incontext = False
  440. self.f.__exit__(exc_type, exc_value, traceback)