spec.py 76 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
72278227922802281
  1. from __future__ import annotations
  2. import io
  3. import json
  4. import logging
  5. import os
  6. import threading
  7. import warnings
  8. import weakref
  9. from errno import ESPIPE
  10. from glob import has_magic
  11. from hashlib import sha256
  12. from typing import Any, ClassVar
  13. from .callbacks import DEFAULT_CALLBACK
  14. from .config import apply_config, conf
  15. from .dircache import DirCache
  16. from .transaction import Transaction
  17. from .utils import (
  18. _unstrip_protocol,
  19. glob_translate,
  20. isfilelike,
  21. other_paths,
  22. read_block,
  23. stringify_path,
  24. tokenize,
  25. )
  26. logger = logging.getLogger("fsspec")
  27. def make_instance(cls, args, kwargs):
  28. return cls(*args, **kwargs)
class _Cached(type):
    """
    Metaclass for caching file system instances.

    Notes
    -----
    Instances are cached according to

    * The values of the class attributes listed in `_extra_tokenize_attributes`
    * The arguments passed to ``__init__``.

    This creates an additional reference to the filesystem, which prevents the
    filesystem from being garbage collected when all *user* references go away.
    A call to the :meth:`AbstractFileSystem.clear_instance_cache` must *also*
    be made for a filesystem instance to be garbage collected.
    """

    def __init__(cls, *args, **kwargs):
        """Give every class built with this metaclass its own cache and PID."""
        super().__init__(*args, **kwargs)
        # Note: we intentionally create a reference here, to avoid garbage
        # collecting instances when all other references are gone. To really
        # delete a FileSystem, the cache must be cleared.
        if conf.get("weakref_instance_cache"):  # pragma: no cover
            # debug option for analysing fork/spawn conditions
            cls._cache = weakref.WeakValueDictionary()
        else:
            cls._cache = {}
        # PID of the process that owns the cache; compared again in __call__.
        cls._pid = os.getpid()

    def __call__(cls, *args, **kwargs):
        """Return a cached instance for these arguments, or build a new one.

        The cache key (token) is derived from the class, the recorded PID,
        the calling thread's identifier, the positional arguments, the
        values of ``_extra_tokenize_attributes`` and the keyword arguments
        that remain after ``_strip_tokenize_options`` have been removed.
        """
        kwargs = apply_config(cls, kwargs)
        extra_tokens = tuple(
            getattr(cls, attr, None) for attr in cls._extra_tokenize_attributes
        )
        # Take these options out of kwargs so they do not influence the
        # token; they are handed back to the constructor further down.
        strip_tokenize_options = {
            k: kwargs.pop(k) for k in cls._strip_tokenize_options if k in kwargs
        }
        # Computed while "skip_instance_cache" (if given) is still in kwargs,
        # so that flag is part of the token.
        token = tokenize(
            cls, cls._pid, threading.get_ident(), *args, *extra_tokens, **kwargs
        )
        skip = kwargs.pop("skip_instance_cache", False)
        if os.getpid() != cls._pid:
            # Running in a different process than the one that filled the
            # cache: drop every cached instance and record the new PID.
            cls._cache.clear()
            cls._pid = os.getpid()
        if not skip and cls.cachable and token in cls._cache:
            cls._latest = token
            return cls._cache[token]
        else:
            obj = super().__call__(*args, **kwargs, **strip_tokenize_options)
            # Setting _fs_token here causes some static linters to complain.
            obj._fs_token_ = token
            # storage_options deliberately excludes the stripped options and
            # "skip_instance_cache", both of which were popped above.
            obj.storage_args = args
            obj.storage_options = kwargs

            if obj.async_impl and obj.mirror_sync_methods:
                # Imported lazily, only when an instance actually needs it.
                from .asyn import mirror_sync_methods

                mirror_sync_methods(obj)

            if cls.cachable and not skip:
                cls._latest = token
                cls._cache[token] = obj
            return obj
  84. class AbstractFileSystem(metaclass=_Cached):
  85. """
  86. An abstract super-class for pythonic file-systems
  87. Implementations are expected to be compatible with or, better, subclass
  88. from here.
  89. """
  90. cachable = True # this class can be cached, instances reused
  91. _cached = False
  92. blocksize = 2**22
  93. sep = "/"
  94. protocol: ClassVar[str | tuple[str, ...]] = "abstract"
  95. _latest = None
  96. async_impl = False
  97. mirror_sync_methods = False
  98. root_marker = "" # For some FSs, may require leading '/' or other character
  99. transaction_type = Transaction
  100. #: Extra *class attributes* that should be considered when hashing.
  101. _extra_tokenize_attributes = ()
  102. #: *storage options* that should not be considered when hashing.
  103. _strip_tokenize_options = ()
  104. # Set by _Cached metaclass
  105. storage_args: tuple[Any, ...]
  106. storage_options: dict[str, Any]
  107. def __init__(self, *args, **storage_options):
  108. """Create and configure file-system instance
  109. Instances may be cachable, so if similar enough arguments are seen
  110. a new instance is not required. The token attribute exists to allow
  111. implementations to cache instances if they wish.
  112. A reasonable default should be provided if there are no arguments.
  113. Subclasses should call this method.
  114. Parameters
  115. ----------
  116. use_listings_cache, listings_expiry_time, max_paths:
  117. passed to ``DirCache``, if the implementation supports
  118. directory listing caching. Pass use_listings_cache=False
  119. to disable such caching.
  120. skip_instance_cache: bool
  121. If this is a cachable implementation, pass True here to force
  122. creating a new instance even if a matching instance exists, and prevent
  123. storing this instance.
  124. asynchronous: bool
  125. loop: asyncio-compatible IOLoop or None
  126. """
  127. if self._cached:
  128. # reusing instance, don't change
  129. return
  130. self._cached = True
  131. self._intrans = False
  132. self._transaction = None
  133. self._invalidated_caches_in_transaction = []
  134. self.dircache = DirCache(**storage_options)
  135. if storage_options.pop("add_docs", None):
  136. warnings.warn("add_docs is no longer supported.", FutureWarning)
  137. if storage_options.pop("add_aliases", None):
  138. warnings.warn("add_aliases has been removed.", FutureWarning)
  139. # This is set in _Cached
  140. self._fs_token_ = None
  141. @property
  142. def fsid(self):
  143. """Persistent filesystem id that can be used to compare filesystems
  144. across sessions.
  145. """
  146. raise NotImplementedError
  147. @property
  148. def _fs_token(self):
  149. return self._fs_token_
  150. def __dask_tokenize__(self):
  151. return self._fs_token
  152. def __hash__(self):
  153. return int(self._fs_token, 16)
  154. def __eq__(self, other):
  155. return isinstance(other, type(self)) and self._fs_token == other._fs_token
  156. def __reduce__(self):
  157. return make_instance, (type(self), self.storage_args, self.storage_options)
  158. @classmethod
  159. def _strip_protocol(cls, path):
  160. """Turn path from fully-qualified to file-system-specific
  161. May require FS-specific handling, e.g., for relative paths or links.
  162. """
  163. if isinstance(path, list):
  164. return [cls._strip_protocol(p) for p in path]
  165. path = stringify_path(path)
  166. protos = (cls.protocol,) if isinstance(cls.protocol, str) else cls.protocol
  167. for protocol in protos:
  168. if path.startswith(protocol + "://"):
  169. path = path[len(protocol) + 3 :]
  170. elif path.startswith(protocol + "::"):
  171. path = path[len(protocol) + 2 :]
  172. path = path.rstrip("/")
  173. # use of root_marker to make minimum required path, e.g., "/"
  174. return path or cls.root_marker
  175. def unstrip_protocol(self, name: str) -> str:
  176. """Format FS-specific path to generic, including protocol"""
  177. protos = (self.protocol,) if isinstance(self.protocol, str) else self.protocol
  178. for protocol in protos:
  179. if name.startswith(f"{protocol}://"):
  180. return name
  181. return f"{protos[0]}://{name}"
  182. @staticmethod
  183. def _get_kwargs_from_urls(path):
  184. """If kwargs can be encoded in the paths, extract them here
  185. This should happen before instantiation of the class; incoming paths
  186. then should be amended to strip the options in methods.
  187. Examples may look like an sftp path "sftp://user@host:/my/path", where
  188. the user and host should become kwargs and later get stripped.
  189. """
  190. # by default, nothing happens
  191. return {}
  192. @classmethod
  193. def current(cls):
  194. """Return the most recently instantiated FileSystem
  195. If no instance has been created, then create one with defaults
  196. """
  197. if cls._latest in cls._cache:
  198. return cls._cache[cls._latest]
  199. return cls()
  200. @property
  201. def transaction(self):
  202. """A context within which files are committed together upon exit
  203. Requires the file class to implement `.commit()` and `.discard()`
  204. for the normal and exception cases.
  205. """
  206. if self._transaction is None:
  207. self._transaction = self.transaction_type(self)
  208. return self._transaction
  209. def start_transaction(self):
  210. """Begin write transaction for deferring files, non-context version"""
  211. self._intrans = True
  212. self._transaction = self.transaction_type(self)
  213. return self.transaction
  214. def end_transaction(self):
  215. """Finish write transaction, non-context version"""
  216. self.transaction.complete()
  217. self._transaction = None
  218. # The invalid cache must be cleared after the transaction is completed.
  219. for path in self._invalidated_caches_in_transaction:
  220. self.invalidate_cache(path)
  221. self._invalidated_caches_in_transaction.clear()
  222. def invalidate_cache(self, path=None):
  223. """
  224. Discard any cached directory information
  225. Parameters
  226. ----------
  227. path: string or None
  228. If None, clear all listings cached else listings at or under given
  229. path.
  230. """
  231. # Not necessary to implement invalidation mechanism, may have no cache.
  232. # But if have, you should call this method of parent class from your
  233. # subclass to ensure expiring caches after transacations correctly.
  234. # See the implementation of FTPFileSystem in ftp.py
  235. if self._intrans:
  236. self._invalidated_caches_in_transaction.append(path)
  237. def mkdir(self, path, create_parents=True, **kwargs):
  238. """
  239. Create directory entry at path
  240. For systems that don't have true directories, may create an for
  241. this instance only and not touch the real filesystem
  242. Parameters
  243. ----------
  244. path: str
  245. location
  246. create_parents: bool
  247. if True, this is equivalent to ``makedirs``
  248. kwargs:
  249. may be permissions, etc.
  250. """
  251. pass # not necessary to implement, may not have directories
  252. def makedirs(self, path, exist_ok=False):
  253. """Recursively make directories
  254. Creates directory at path and any intervening required directories.
  255. Raises exception if, for instance, the path already exists but is a
  256. file.
  257. Parameters
  258. ----------
  259. path: str
  260. leaf directory name
  261. exist_ok: bool (False)
  262. If False, will error if the target already exists
  263. """
  264. pass # not necessary to implement, may not have directories
  265. def rmdir(self, path):
  266. """Remove a directory, if empty"""
  267. pass # not necessary to implement, may not have directories
  268. def ls(self, path, detail=True, **kwargs):
  269. """List objects at path.
  270. This should include subdirectories and files at that location. The
  271. difference between a file and a directory must be clear when details
  272. are requested.
  273. The specific keys, or perhaps a FileInfo class, or similar, is TBD,
  274. but must be consistent across implementations.
  275. Must include:
  276. - full path to the entry (without protocol)
  277. - size of the entry, in bytes. If the value cannot be determined, will
  278. be ``None``.
  279. - type of entry, "file", "directory" or other
  280. Additional information
  281. may be present, appropriate to the file-system, e.g., generation,
  282. checksum, etc.
  283. May use refresh=True|False to allow use of self._ls_from_cache to
  284. check for a saved listing and avoid calling the backend. This would be
  285. common where listing may be expensive.
  286. Parameters
  287. ----------
  288. path: str
  289. detail: bool
  290. if True, gives a list of dictionaries, where each is the same as
  291. the result of ``info(path)``. If False, gives a list of paths
  292. (str).
  293. kwargs: may have additional backend-specific options, such as version
  294. information
  295. Returns
  296. -------
  297. List of strings if detail is False, or list of directory information
  298. dicts if detail is True.
  299. """
  300. raise NotImplementedError
  301. def _ls_from_cache(self, path):
  302. """Check cache for listing
  303. Returns listing, if found (may be empty list for a directly that exists
  304. but contains nothing), None if not in cache.
  305. """
  306. parent = self._parent(path)
  307. try:
  308. return self.dircache[path.rstrip("/")]
  309. except KeyError:
  310. pass
  311. try:
  312. files = [
  313. f
  314. for f in self.dircache[parent]
  315. if f["name"] == path
  316. or (f["name"] == path.rstrip("/") and f["type"] == "directory")
  317. ]
  318. if len(files) == 0:
  319. # parent dir was listed but did not contain this file
  320. raise FileNotFoundError(path)
  321. return files
  322. except KeyError:
  323. pass
  324. def walk(self, path, maxdepth=None, topdown=True, on_error="omit", **kwargs):
  325. """Return all files under the given path.
  326. List all files, recursing into subdirectories; output is iterator-style,
  327. like ``os.walk()``. For a simple list of files, ``find()`` is available.
  328. When topdown is True, the caller can modify the dirnames list in-place (perhaps
  329. using del or slice assignment), and walk() will
  330. only recurse into the subdirectories whose names remain in dirnames;
  331. this can be used to prune the search, impose a specific order of visiting,
  332. or even to inform walk() about directories the caller creates or renames before
  333. it resumes walk() again.
  334. Modifying dirnames when topdown is False has no effect. (see os.walk)
  335. Note that the "files" outputted will include anything that is not
  336. a directory, such as links.
  337. Parameters
  338. ----------
  339. path: str
  340. Root to recurse into
  341. maxdepth: int
  342. Maximum recursion depth. None means limitless, but not recommended
  343. on link-based file-systems.
  344. topdown: bool (True)
  345. Whether to walk the directory tree from the top downwards or from
  346. the bottom upwards.
  347. on_error: "omit", "raise", a callable
  348. if omit (default), path with exception will simply be empty;
  349. If raise, an underlying exception will be raised;
  350. if callable, it will be called with a single OSError instance as argument
  351. kwargs: passed to ``ls``
  352. """
  353. if maxdepth is not None and maxdepth < 1:
  354. raise ValueError("maxdepth must be at least 1")
  355. path = self._strip_protocol(path)
  356. full_dirs = {}
  357. dirs = {}
  358. files = {}
  359. detail = kwargs.pop("detail", False)
  360. try:
  361. listing = self.ls(path, detail=True, **kwargs)
  362. except (FileNotFoundError, OSError) as e:
  363. if on_error == "raise":
  364. raise
  365. if callable(on_error):
  366. on_error(e)
  367. return
  368. for info in listing:
  369. # each info name must be at least [path]/part , but here
  370. # we check also for names like [path]/part/
  371. pathname = info["name"].rstrip("/")
  372. name = pathname.rsplit("/", 1)[-1]
  373. if info["type"] == "directory" and pathname != path:
  374. # do not include "self" path
  375. full_dirs[name] = pathname
  376. dirs[name] = info
  377. elif pathname == path:
  378. # file-like with same name as give path
  379. files[""] = info
  380. else:
  381. files[name] = info
  382. if not detail:
  383. dirs = list(dirs)
  384. files = list(files)
  385. if topdown:
  386. # Yield before recursion if walking top down
  387. yield path, dirs, files
  388. if maxdepth is not None:
  389. maxdepth -= 1
  390. if maxdepth < 1:
  391. if not topdown:
  392. yield path, dirs, files
  393. return
  394. for d in dirs:
  395. yield from self.walk(
  396. full_dirs[d],
  397. maxdepth=maxdepth,
  398. detail=detail,
  399. topdown=topdown,
  400. **kwargs,
  401. )
  402. if not topdown:
  403. # Yield after recursion if walking bottom up
  404. yield path, dirs, files
  405. def find(self, path, maxdepth=None, withdirs=False, detail=False, **kwargs):
  406. """List all files below path.
  407. Like posix ``find`` command without conditions
  408. Parameters
  409. ----------
  410. path : str
  411. maxdepth: int or None
  412. If not None, the maximum number of levels to descend
  413. withdirs: bool
  414. Whether to include directory paths in the output. This is True
  415. when used by glob, but users usually only want files.
  416. kwargs are passed to ``ls``.
  417. """
  418. # TODO: allow equivalent of -name parameter
  419. path = self._strip_protocol(path)
  420. out = {}
  421. # Add the root directory if withdirs is requested
  422. # This is needed for posix glob compliance
  423. if withdirs and path != "" and self.isdir(path):
  424. out[path] = self.info(path)
  425. for _, dirs, files in self.walk(path, maxdepth, detail=True, **kwargs):
  426. if withdirs:
  427. files.update(dirs)
  428. out.update({info["name"]: info for name, info in files.items()})
  429. if not out and self.isfile(path):
  430. # walk works on directories, but find should also return [path]
  431. # when path happens to be a file
  432. out[path] = {}
  433. names = sorted(out)
  434. if not detail:
  435. return names
  436. else:
  437. return {name: out[name] for name in names}
  438. def du(self, path, total=True, maxdepth=None, withdirs=False, **kwargs):
  439. """Space used by files and optionally directories within a path
  440. Directory size does not include the size of its contents.
  441. Parameters
  442. ----------
  443. path: str
  444. total: bool
  445. Whether to sum all the file sizes
  446. maxdepth: int or None
  447. Maximum number of directory levels to descend, None for unlimited.
  448. withdirs: bool
  449. Whether to include directory paths in the output.
  450. kwargs: passed to ``find``
  451. Returns
  452. -------
  453. Dict of {path: size} if total=False, or int otherwise, where numbers
  454. refer to bytes used.
  455. """
  456. sizes = {}
  457. if withdirs and self.isdir(path):
  458. # Include top-level directory in output
  459. info = self.info(path)
  460. sizes[info["name"]] = info["size"]
  461. for f in self.find(path, maxdepth=maxdepth, withdirs=withdirs, **kwargs):
  462. info = self.info(f)
  463. sizes[info["name"]] = info["size"]
  464. if total:
  465. return sum(sizes.values())
  466. else:
  467. return sizes
    def glob(self, path, maxdepth=None, **kwargs):
        """Find files by glob-matching.

        Pattern matching capabilities for finding files that match the given pattern.

        Parameters
        ----------
        path: str
            The glob pattern to match against
        maxdepth: int or None
            Maximum depth for ``'**'`` patterns. Applied on the first ``'**'`` found.
            Must be at least 1 if provided.
        kwargs:
            Additional arguments passed to ``find`` (e.g., detail=True)

        Returns
        -------
        List of matched paths, or dict of paths and their info if detail=True

        Raises
        ------
        ValueError
            if ``maxdepth`` is given and smaller than 1.

        Notes
        -----
        Supported patterns:
        - '*': Matches any sequence of characters within a single directory level
        - ``'**'``: Matches any number of directory levels (must be an entire path component)
        - '?': Matches exactly one character
        - '[abc]': Matches any character in the set
        - '[a-z]': Matches any character in the range
        - '[!abc]': Matches any character NOT in the set

        Special behaviors:
        - If the path ends with '/', only folders are returned
        - Consecutive '*' characters are compressed into a single '*'
        - Empty brackets '[]' never match anything
        - Negated empty brackets '[!]' match any single character
        - Special characters in character classes are escaped properly

        Limitations:
        - ``'**'`` must be a complete path component (e.g., ``'a/**/b'``, not ``'a**b'``)
        - No brace expansion ('{a,b}.txt')
        - No extended glob patterns ('+(pattern)', '!(pattern)')
        """
        if maxdepth is not None and maxdepth < 1:
            raise ValueError("maxdepth must be at least 1")

        import re

        # Separators are taken from os.path, i.e. they depend on the host OS.
        seps = (os.path.sep, os.path.altsep) if os.path.altsep else (os.path.sep,)
        # Must be recorded before stripping, because the trailing separator
        # is lost afterwards.
        ends_with_sep = path.endswith(seps)  # _strip_protocol strips trailing slash
        path = self._strip_protocol(path)
        # Directories are matched with a trailing "/" appended when the
        # pattern ended in a separator or in "<sep>**".
        append_slash_to_dirname = ends_with_sep or path.endswith(
            tuple(sep + "**" for sep in seps)
        )
        # Position of the first occurrence of each magic character, or
        # len(path) when that character does not occur.
        idx_star = path.find("*") if path.find("*") >= 0 else len(path)
        idx_qmark = path.find("?") if path.find("?") >= 0 else len(path)
        idx_brace = path.find("[") if path.find("[") >= 0 else len(path)

        # Index of the earliest magic character in the pattern.
        min_idx = min(idx_star, idx_qmark, idx_brace)

        detail = kwargs.pop("detail", False)

        if not has_magic(path):
            # Plain path: the result is the path itself if it exists.
            if self.exists(path, **kwargs):
                if not detail:
                    return [path]
                else:
                    return {path: self.info(path, **kwargs)}
            else:
                if not detail:
                    return []  # glob of non-existent returns empty
                else:
                    return {}
        elif "/" in path[:min_idx]:
            # Search root is everything up to and including the last "/"
            # before the first magic character; depth counts the path
            # components that follow it.
            min_idx = path[:min_idx].rindex("/")
            root = path[: min_idx + 1]
            depth = path[min_idx + 1 :].count("/") + 1
        else:
            # No literal directory prefix: search from the empty root.
            root = ""
            depth = path[min_idx + 1 :].count("/") + 1

        if "**" in path:
            if maxdepth is not None:
                # Replace the components from the first "**" onwards by
                # maxdepth levels in the depth budget.
                idx_double_stars = path.find("**")
                depth_double_stars = path[idx_double_stars:].count("/") + 1
                depth = depth - depth_double_stars + maxdepth
            else:
                # Unbounded recursion for "**" without an explicit maxdepth.
                depth = None

        # Collect every candidate (files and directories) with its info,
        # then filter the candidates with the translated pattern.
        allpaths = self.find(root, maxdepth=depth, withdirs=True, detail=True, **kwargs)

        pattern = glob_translate(path + ("/" if ends_with_sep else ""))
        pattern = re.compile(pattern)

        out = {
            p: info
            for p, info in sorted(allpaths.items())
            if pattern.match(
                p + "/"
                if append_slash_to_dirname and info["type"] == "directory"
                else p
            )
        }

        if detail:
            return out
        else:
            return list(out)
  558. def exists(self, path, **kwargs):
  559. """Is there a file at the given path"""
  560. try:
  561. self.info(path, **kwargs)
  562. return True
  563. except: # noqa: E722
  564. # any exception allowed bar FileNotFoundError?
  565. return False
  566. def lexists(self, path, **kwargs):
  567. """If there is a file at the given path (including
  568. broken links)"""
  569. return self.exists(path)
  570. def info(self, path, **kwargs):
  571. """Give details of entry at path
  572. Returns a single dictionary, with exactly the same information as ``ls``
  573. would with ``detail=True``.
  574. The default implementation calls ls and could be overridden by a
  575. shortcut. kwargs are passed on to ```ls()``.
  576. Some file systems might not be able to measure the file's size, in
  577. which case, the returned dict will include ``'size': None``.
  578. Returns
  579. -------
  580. dict with keys: name (full path in the FS), size (in bytes), type (file,
  581. directory, or something else) and other FS-specific keys.
  582. """
  583. path = self._strip_protocol(path)
  584. out = self.ls(self._parent(path), detail=True, **kwargs)
  585. out = [o for o in out if o["name"].rstrip("/") == path]
  586. if out:
  587. return out[0]
  588. out = self.ls(path, detail=True, **kwargs)
  589. path = path.rstrip("/")
  590. out1 = [o for o in out if o["name"].rstrip("/") == path]
  591. if len(out1) == 1:
  592. if "size" not in out1[0]:
  593. out1[0]["size"] = None
  594. return out1[0]
  595. elif len(out1) > 1 or out:
  596. return {"name": path, "size": 0, "type": "directory"}
  597. else:
  598. raise FileNotFoundError(path)
  599. def checksum(self, path):
  600. """Unique value for current version of file
  601. If the checksum is the same from one moment to another, the contents
  602. are guaranteed to be the same. If the checksum changes, the contents
  603. *might* have changed.
  604. This should normally be overridden; default will probably capture
  605. creation/modification timestamp (which would be good) or maybe
  606. access timestamp (which would be bad)
  607. """
  608. return int(tokenize(self.info(path)), 16)
  609. def size(self, path):
  610. """Size in bytes of file"""
  611. return self.info(path).get("size", None)
  612. def sizes(self, paths):
  613. """Size in bytes of each file in a list of paths"""
  614. return [self.size(p) for p in paths]
  615. def isdir(self, path):
  616. """Is this entry directory-like?"""
  617. try:
  618. return self.info(path)["type"] == "directory"
  619. except OSError:
  620. return False
  621. def isfile(self, path):
  622. """Is this entry file-like?"""
  623. try:
  624. return self.info(path)["type"] == "file"
  625. except: # noqa: E722
  626. return False
  627. def read_text(self, path, encoding=None, errors=None, newline=None, **kwargs):
  628. """Get the contents of the file as a string.
  629. Parameters
  630. ----------
  631. path: str
  632. URL of file on this filesystems
  633. encoding, errors, newline: same as `open`.
  634. """
  635. with self.open(
  636. path,
  637. mode="r",
  638. encoding=encoding,
  639. errors=errors,
  640. newline=newline,
  641. **kwargs,
  642. ) as f:
  643. return f.read()
  644. def write_text(
  645. self, path, value, encoding=None, errors=None, newline=None, **kwargs
  646. ):
  647. """Write the text to the given file.
  648. An existing file will be overwritten.
  649. Parameters
  650. ----------
  651. path: str
  652. URL of file on this filesystems
  653. value: str
  654. Text to write.
  655. encoding, errors, newline: same as `open`.
  656. """
  657. with self.open(
  658. path,
  659. mode="w",
  660. encoding=encoding,
  661. errors=errors,
  662. newline=newline,
  663. **kwargs,
  664. ) as f:
  665. return f.write(value)
  666. def cat_file(self, path, start=None, end=None, **kwargs):
  667. """Get the content of a file
  668. Parameters
  669. ----------
  670. path: URL of file on this filesystems
  671. start, end: int
  672. Bytes limits of the read. If negative, backwards from end,
  673. like usual python slices. Either can be None for start or
  674. end of file, respectively
  675. kwargs: passed to ``open()``.
  676. """
  677. # explicitly set buffering off?
  678. with self.open(path, "rb", **kwargs) as f:
  679. if start is not None:
  680. if start >= 0:
  681. f.seek(start)
  682. else:
  683. f.seek(max(0, f.size + start))
  684. if end is not None:
  685. if end < 0:
  686. end = f.size + end
  687. return f.read(end - f.tell())
  688. return f.read()
  689. def pipe_file(self, path, value, mode="overwrite", **kwargs):
  690. """Set the bytes of given file"""
  691. if mode == "create" and self.exists(path):
  692. # non-atomic but simple way; or could use "xb" in open(), which is likely
  693. # not as well supported
  694. raise FileExistsError
  695. with self.open(path, "wb", **kwargs) as f:
  696. f.write(value)
  697. def pipe(self, path, value=None, **kwargs):
  698. """Put value into path
  699. (counterpart to ``cat``)
  700. Parameters
  701. ----------
  702. path: string or dict(str, bytes)
  703. If a string, a single remote location to put ``value`` bytes; if a dict,
  704. a mapping of {path: bytesvalue}.
  705. value: bytes, optional
  706. If using a single path, these are the bytes to put there. Ignored if
  707. ``path`` is a dict
  708. """
  709. if isinstance(path, str):
  710. self.pipe_file(self._strip_protocol(path), value, **kwargs)
  711. elif isinstance(path, dict):
  712. for k, v in path.items():
  713. self.pipe_file(self._strip_protocol(k), v, **kwargs)
  714. else:
  715. raise ValueError("path must be str or dict")
  716. def cat_ranges(
  717. self, paths, starts, ends, max_gap=None, on_error="return", **kwargs
  718. ):
  719. """Get the contents of byte ranges from one or more files
  720. Parameters
  721. ----------
  722. paths: list
  723. A list of of filepaths on this filesystems
  724. starts, ends: int or list
  725. Bytes limits of the read. If using a single int, the same value will be
  726. used to read all the specified files.
  727. """
  728. if max_gap is not None:
  729. raise NotImplementedError
  730. if not isinstance(paths, list):
  731. raise TypeError
  732. if not isinstance(starts, list):
  733. starts = [starts] * len(paths)
  734. if not isinstance(ends, list):
  735. ends = [ends] * len(paths)
  736. if len(starts) != len(paths) or len(ends) != len(paths):
  737. raise ValueError
  738. out = []
  739. for p, s, e in zip(paths, starts, ends):
  740. try:
  741. out.append(self.cat_file(p, s, e))
  742. except Exception as e:
  743. if on_error == "return":
  744. out.append(e)
  745. else:
  746. raise
  747. return out
  748. def cat(self, path, recursive=False, on_error="raise", **kwargs):
  749. """Fetch (potentially multiple) paths' contents
  750. Parameters
  751. ----------
  752. recursive: bool
  753. If True, assume the path(s) are directories, and get all the
  754. contained files
  755. on_error : "raise", "omit", "return"
  756. If raise, an underlying exception will be raised (converted to KeyError
  757. if the type is in self.missing_exceptions); if omit, keys with exception
  758. will simply not be included in the output; if "return", all keys are
  759. included in the output, but the value will be bytes or an exception
  760. instance.
  761. kwargs: passed to cat_file
  762. Returns
  763. -------
  764. dict of {path: contents} if there are multiple paths
  765. or the path has been otherwise expanded
  766. """
  767. paths = self.expand_path(path, recursive=recursive, **kwargs)
  768. if (
  769. len(paths) > 1
  770. or isinstance(path, list)
  771. or paths[0] != self._strip_protocol(path)
  772. ):
  773. out = {}
  774. for path in paths:
  775. try:
  776. out[path] = self.cat_file(path, **kwargs)
  777. except Exception as e:
  778. if on_error == "raise":
  779. raise
  780. if on_error == "return":
  781. out[path] = e
  782. return out
  783. else:
  784. return self.cat_file(paths[0], **kwargs)
  785. def get_file(self, rpath, lpath, callback=DEFAULT_CALLBACK, outfile=None, **kwargs):
  786. """Copy single remote file to local"""
  787. from .implementations.local import LocalFileSystem
  788. if isfilelike(lpath):
  789. outfile = lpath
  790. elif self.isdir(rpath):
  791. os.makedirs(lpath, exist_ok=True)
  792. return None
  793. fs = LocalFileSystem(auto_mkdir=True)
  794. fs.makedirs(fs._parent(lpath), exist_ok=True)
  795. with self.open(rpath, "rb", **kwargs) as f1:
  796. if outfile is None:
  797. outfile = open(lpath, "wb")
  798. try:
  799. callback.set_size(getattr(f1, "size", None))
  800. data = True
  801. while data:
  802. data = f1.read(self.blocksize)
  803. segment_len = outfile.write(data)
  804. if segment_len is None:
  805. segment_len = len(data)
  806. callback.relative_update(segment_len)
  807. finally:
  808. if not isfilelike(lpath):
  809. outfile.close()
    def get(
        self,
        rpath,
        lpath,
        recursive=False,
        callback=DEFAULT_CALLBACK,
        maxdepth=None,
        **kwargs,
    ):
        """Copy file(s) to local.

        Copies a specific file or tree of files (if recursive=True). If lpath
        ends with a "/", it will be assumed to be a directory, and target files
        will go within. Can submit a list of paths, which may be glob-patterns
        and will be expanded.

        Calls get_file for each source.

        Parameters
        ----------
        rpath: str or list
            Remote source path(s).
        lpath: str or list
            Local destination path(s). If both ``rpath`` and ``lpath`` are
            lists they are used as given and paired positionally.
        recursive: bool
            Passed to ``expand_path``.
        callback:
            Progress callback; its ``set_size``, ``wrap`` and ``branched``
            methods are used here.
        maxdepth: int or None
            Passed to ``expand_path``.
        kwargs: passed to ``expand_path`` and to ``get_file``.
        """
        if isinstance(lpath, list) and isinstance(rpath, list):
            # No need to expand paths when both source and destination
            # are provided as lists
            rpaths = rpath
            lpaths = lpath
        else:
            from .implementations.local import (
                LocalFileSystem,
                make_path_posix,
                trailing_sep,
            )

            # remember the input kind before rpath is expanded into a list
            source_is_str = isinstance(rpath, str)
            rpaths = self.expand_path(
                rpath, recursive=recursive, maxdepth=maxdepth, **kwargs
            )
            if source_is_str and (not recursive or maxdepth is not None):
                # Non-recursive glob does not copy directories
                rpaths = [p for p in rpaths if not (trailing_sep(p) or self.isdir(p))]
                if not rpaths:
                    # everything matched was a directory: nothing to copy
                    return

            if isinstance(lpath, str):
                lpath = make_path_posix(lpath)

            source_is_file = len(rpaths) == 1
            dest_is_dir = isinstance(lpath, str) and (
                trailing_sep(lpath) or LocalFileSystem().isdir(lpath)
            )

            # NOTE(review): ``exists`` and ``flatten`` are interpreted by
            # other_paths, which is defined elsewhere -- confirm their exact
            # meaning there before changing this expression.
            exists = source_is_str and (
                (has_magic(rpath) and source_is_file)
                or (not has_magic(rpath) and dest_is_dir and not trailing_sep(rpath))
            )
            lpaths = other_paths(
                rpaths,
                lpath,
                exists=exists,
                flatten=not source_is_str,
            )

        callback.set_size(len(lpaths))
        # one get_file call per (local, remote) pair, each with a child callback
        for lpath, rpath in callback.wrap(zip(lpaths, rpaths)):
            with callback.branched(rpath, lpath) as child:
                self.get_file(rpath, lpath, callback=child, **kwargs)
  866. def put_file(
  867. self, lpath, rpath, callback=DEFAULT_CALLBACK, mode="overwrite", **kwargs
  868. ):
  869. """Copy single file to remote"""
  870. if mode == "create" and self.exists(rpath):
  871. raise FileExistsError
  872. if os.path.isdir(lpath):
  873. self.makedirs(rpath, exist_ok=True)
  874. return None
  875. with open(lpath, "rb") as f1:
  876. size = f1.seek(0, 2)
  877. callback.set_size(size)
  878. f1.seek(0)
  879. self.mkdirs(self._parent(os.fspath(rpath)), exist_ok=True)
  880. with self.open(rpath, "wb", **kwargs) as f2:
  881. while f1.tell() < size:
  882. data = f1.read(self.blocksize)
  883. segment_len = f2.write(data)
  884. if segment_len is None:
  885. segment_len = len(data)
  886. callback.relative_update(segment_len)
    def put(
        self,
        lpath,
        rpath,
        recursive=False,
        callback=DEFAULT_CALLBACK,
        maxdepth=None,
        **kwargs,
    ):
        """Copy file(s) from local.

        Copies a specific file or tree of files (if recursive=True). If rpath
        ends with a "/", it will be assumed to be a directory, and target files
        will go within.

        Calls put_file for each source.

        Parameters
        ----------
        lpath: str or list
            Local source path(s).
        rpath: str or list
            Remote destination path(s). If both ``lpath`` and ``rpath`` are
            lists they are used as given and paired positionally.
        recursive: bool
            Passed to the local filesystem's ``expand_path``.
        callback:
            Progress callback; its ``set_size``, ``wrap`` and ``branched``
            methods are used here.
        maxdepth: int or None
            Passed to the local filesystem's ``expand_path``.
        kwargs: passed to ``expand_path`` and to ``put_file``.
        """
        if isinstance(lpath, list) and isinstance(rpath, list):
            # No need to expand paths when both source and destination
            # are provided as lists
            rpaths = rpath
            lpaths = lpath
        else:
            from .implementations.local import (
                LocalFileSystem,
                make_path_posix,
                trailing_sep,
            )

            # remember the input kind before lpath is expanded into a list
            source_is_str = isinstance(lpath, str)
            if source_is_str:
                lpath = make_path_posix(lpath)
            # sources are local, so they are expanded with a local filesystem
            fs = LocalFileSystem()
            lpaths = fs.expand_path(
                lpath, recursive=recursive, maxdepth=maxdepth, **kwargs
            )
            if source_is_str and (not recursive or maxdepth is not None):
                # Non-recursive glob does not copy directories
                lpaths = [p for p in lpaths if not (trailing_sep(p) or fs.isdir(p))]
                if not lpaths:
                    # everything matched was a directory: nothing to copy
                    return

            source_is_file = len(lpaths) == 1
            # evaluated on the rpath as given, before its protocol is stripped
            dest_is_dir = isinstance(rpath, str) and (
                trailing_sep(rpath) or self.isdir(rpath)
            )

            rpath = (
                self._strip_protocol(rpath)
                if isinstance(rpath, str)
                else [self._strip_protocol(p) for p in rpath]
            )
            # NOTE(review): ``exists`` and ``flatten`` are interpreted by
            # other_paths, which is defined elsewhere -- confirm their exact
            # meaning there before changing this expression.
            exists = source_is_str and (
                (has_magic(lpath) and source_is_file)
                or (not has_magic(lpath) and dest_is_dir and not trailing_sep(lpath))
            )
            rpaths = other_paths(
                lpaths,
                rpath,
                exists=exists,
                flatten=not source_is_str,
            )

        callback.set_size(len(rpaths))
        # one put_file call per (local, remote) pair, each with a child callback
        for lpath, rpath in callback.wrap(zip(lpaths, rpaths)):
            with callback.branched(lpath, rpath) as child:
                self.put_file(lpath, rpath, callback=child, **kwargs)
  948. def head(self, path, size=1024):
  949. """Get the first ``size`` bytes from file"""
  950. with self.open(path, "rb") as f:
  951. return f.read(size)
  952. def tail(self, path, size=1024):
  953. """Get the last ``size`` bytes from file"""
  954. with self.open(path, "rb") as f:
  955. f.seek(max(-size, -f.size), 2)
  956. return f.read()
  957. def cp_file(self, path1, path2, **kwargs):
  958. raise NotImplementedError
    def copy(
        self, path1, path2, recursive=False, maxdepth=None, on_error=None, **kwargs
    ):
        """Copy within two locations in the filesystem

        Calls ``cp_file`` for each (source, destination) pair.

        Parameters
        ----------
        path1: str or list
            Source path(s).
        path2: str or list
            Destination path(s). If both ``path1`` and ``path2`` are lists
            they are used as given and paired positionally.
        recursive: bool
            Passed to ``expand_path``.
        maxdepth: int or None
            Passed to ``expand_path``.
        on_error : "raise", "ignore"
            If raise, any not-found exceptions will be raised; if ignore any
            not-found exceptions will cause the path to be skipped; defaults to
            raise unless recursive is true, where the default is ignore
        kwargs: passed to ``expand_path`` and to ``cp_file``.
        """

        if on_error is None and recursive:
            on_error = "ignore"
        elif on_error is None:
            on_error = "raise"

        if isinstance(path1, list) and isinstance(path2, list):
            # No need to expand paths when both source and destination
            # are provided as lists
            paths1 = path1
            paths2 = path2
        else:
            from .implementations.local import trailing_sep

            # remember the input kind before path1 is expanded into a list
            source_is_str = isinstance(path1, str)
            paths1 = self.expand_path(
                path1, recursive=recursive, maxdepth=maxdepth, **kwargs
            )
            if source_is_str and (not recursive or maxdepth is not None):
                # Non-recursive glob does not copy directories
                paths1 = [p for p in paths1 if not (trailing_sep(p) or self.isdir(p))]
                if not paths1:
                    # everything matched was a directory: nothing to copy
                    return

            source_is_file = len(paths1) == 1
            dest_is_dir = isinstance(path2, str) and (
                trailing_sep(path2) or self.isdir(path2)
            )

            # NOTE(review): ``exists`` and ``flatten`` are interpreted by
            # other_paths, which is defined elsewhere -- confirm their exact
            # meaning there before changing this expression.
            exists = source_is_str and (
                (has_magic(path1) and source_is_file)
                or (not has_magic(path1) and dest_is_dir and not trailing_sep(path1))
            )
            paths2 = other_paths(
                paths1,
                path2,
                exists=exists,
                flatten=not source_is_str,
            )

        for p1, p2 in zip(paths1, paths2):
            try:
                self.cp_file(p1, p2, **kwargs)
            except FileNotFoundError:
                # only not-found errors are subject to on_error; anything
                # else raised by cp_file propagates regardless
                if on_error == "raise":
                    raise
    def expand_path(self, path, recursive=False, maxdepth=None, **kwargs):
        """Turn one or more globs or directories into a list of all matching paths
        to files or directories.

        kwargs are passed to ``glob`` or ``find``, which may in turn call ``ls``

        Parameters
        ----------
        path: str, os.PathLike or iterable of paths
            A single path is wrapped in a list and handled like any other
            collection of paths.
        recursive: bool
            If True, also include what ``find`` (or a further ``expand_path``
            on glob results) returns below each path.
        maxdepth: int or None
            Passed to ``glob`` / ``find``; must be at least 1 if given.

        Returns
        -------
        Sorted list of paths.

        Raises
        ------
        ValueError: if ``maxdepth`` is given and is less than 1.
        FileNotFoundError: if nothing at all was collected.
        """

        if maxdepth is not None and maxdepth < 1:
            raise ValueError("maxdepth must be at least 1")

        if isinstance(path, (str, os.PathLike)):
            # single path: recurse once with it wrapped in a list
            out = self.expand_path([path], recursive, maxdepth, **kwargs)
        else:
            out = set()
            path = [self._strip_protocol(p) for p in path]
            for p in path:
                if has_magic(p):
                    bit = set(self.glob(p, maxdepth=maxdepth, **kwargs))
                    out |= bit
                    if recursive:
                        # glob call above expanded one depth so if maxdepth is defined
                        # then decrement it in expand_path call below. If it is zero
                        # after decrementing then avoid expand_path call.
                        if maxdepth is not None and maxdepth <= 1:
                            continue
                        out |= set(
                            self.expand_path(
                                list(bit),
                                recursive=recursive,
                                maxdepth=maxdepth - 1 if maxdepth is not None else None,
                                **kwargs,
                            )
                        )
                    # a glob pattern itself is never added to the output
                    continue
                elif recursive:
                    rec = set(
                        self.find(
                            p, maxdepth=maxdepth, withdirs=True, detail=False, **kwargs
                        )
                    )
                    out |= rec
                # Non-recursive: the literal path is added without any
                # existence check; recursive: only if it exists.
                if p not in out and (recursive is False or self.exists(p)):
                    # should only check once, for the root
                    out.add(p)
        if not out:
            raise FileNotFoundError(path)
        return sorted(out)
  1052. def mv(self, path1, path2, recursive=False, maxdepth=None, **kwargs):
  1053. """Move file(s) from one location to another"""
  1054. if path1 == path2:
  1055. logger.debug("%s mv: The paths are the same, so no files were moved.", self)
  1056. else:
  1057. # explicitly raise exception to prevent data corruption
  1058. self.copy(
  1059. path1, path2, recursive=recursive, maxdepth=maxdepth, onerror="raise"
  1060. )
  1061. self.rm(path1, recursive=recursive)
  1062. def rm_file(self, path):
  1063. """Delete a file"""
  1064. self._rm(path)
  1065. def _rm(self, path):
  1066. """Delete one file"""
  1067. # this is the old name for the method, prefer rm_file
  1068. raise NotImplementedError
  1069. def rm(self, path, recursive=False, maxdepth=None):
  1070. """Delete files.
  1071. Parameters
  1072. ----------
  1073. path: str or list of str
  1074. File(s) to delete.
  1075. recursive: bool
  1076. If file(s) are directories, recursively delete contents and then
  1077. also remove the directory
  1078. maxdepth: int or None
  1079. Depth to pass to walk for finding files to delete, if recursive.
  1080. If None, there will be no limit and infinite recursion may be
  1081. possible.
  1082. """
  1083. path = self.expand_path(path, recursive=recursive, maxdepth=maxdepth)
  1084. for p in reversed(path):
  1085. self.rm_file(p)
  1086. @classmethod
  1087. def _parent(cls, path):
  1088. path = cls._strip_protocol(path)
  1089. if "/" in path:
  1090. parent = path.rsplit("/", 1)[0].lstrip(cls.root_marker)
  1091. return cls.root_marker + parent
  1092. else:
  1093. return cls.root_marker
  1094. def _open(
  1095. self,
  1096. path,
  1097. mode="rb",
  1098. block_size=None,
  1099. autocommit=True,
  1100. cache_options=None,
  1101. **kwargs,
  1102. ):
  1103. """Return raw bytes-mode file-like from the file-system"""
  1104. return AbstractBufferedFile(
  1105. self,
  1106. path,
  1107. mode,
  1108. block_size,
  1109. autocommit,
  1110. cache_options=cache_options,
  1111. **kwargs,
  1112. )
    def open(
        self,
        path,
        mode="rb",
        block_size=None,
        cache_options=None,
        compression=None,
        **kwargs,
    ):
        """
        Return a file-like object from the filesystem

        The resultant instance must function correctly in a context ``with``
        block.

        Parameters
        ----------
        path: str
            Target file
        mode: str like 'rb', 'w'
            See builtin ``open()``
            Mode "x" (exclusive write) may be implemented by the backend. Even if
            it is, whether it is checked up front or on commit, and whether it is
            atomic is implementation-dependent.
        block_size: int
            Some indication of buffering - this is a value in bytes
        cache_options : dict, optional
            Extra arguments to pass through to the cache.
        compression: string or None
            If given, open file using compression codec. Can either be a compression
            name (a key in ``fsspec.compression.compr``) or "infer" to guess the
            compression from the filename suffix.
        encoding, errors, newline: passed on to TextIOWrapper for text mode
        kwargs: ``autocommit`` is consumed here (binary mode); everything else
            is passed on to ``_open()``.
        """
        import io

        path = self._strip_protocol(path)
        if "b" not in mode:
            # Text mode: open the same path in the equivalent binary mode
            # (via this same method) and wrap the result in a TextIOWrapper.
            mode = mode.replace("t", "") + "b"

            # pull the text-only options out of kwargs so they do not reach
            # the binary open
            text_kwargs = {
                k: kwargs.pop(k)
                for k in ["encoding", "errors", "newline"]
                if k in kwargs
            }
            return io.TextIOWrapper(
                self.open(
                    path,
                    mode,
                    block_size=block_size,
                    cache_options=cache_options,
                    compression=compression,
                    **kwargs,
                ),
                **text_kwargs,
            )
        else:
            # autocommit defaults to the negation of self._intrans
            ac = kwargs.pop("autocommit", not self._intrans)
            f = self._open(
                path,
                mode=mode,
                block_size=block_size,
                autocommit=ac,
                cache_options=cache_options,
                **kwargs,
            )
            if compression is not None:
                from fsspec.compression import compr
                from fsspec.core import get_compression

                compression = get_compression(path, compression)
                compress = compr[compression]
                # the codec wrapper receives only the first mode character
                f = compress(f, mode=mode[0])

            if not ac and "r" not in mode:
                # non-autocommit writes are registered with the transaction
                self.transaction.files.append(f)
            return f
  1184. def touch(self, path, truncate=True, **kwargs):
  1185. """Create empty file, or update timestamp
  1186. Parameters
  1187. ----------
  1188. path: str
  1189. file location
  1190. truncate: bool
  1191. If True, always set file size to 0; if False, update timestamp and
  1192. leave file unchanged, if backend allows this
  1193. """
  1194. if truncate or not self.exists(path):
  1195. with self.open(path, "wb", **kwargs):
  1196. pass
  1197. else:
  1198. raise NotImplementedError # update timestamp, if possible
  1199. def ukey(self, path):
  1200. """Hash of file properties, to tell if it has changed"""
  1201. return sha256(str(self.info(path)).encode()).hexdigest()
  1202. def read_block(self, fn, offset, length, delimiter=None):
  1203. """Read a block of bytes from
  1204. Starting at ``offset`` of the file, read ``length`` bytes. If
  1205. ``delimiter`` is set then we ensure that the read starts and stops at
  1206. delimiter boundaries that follow the locations ``offset`` and ``offset
  1207. + length``. If ``offset`` is zero then we start at zero. The
  1208. bytestring returned WILL include the end delimiter string.
  1209. If offset+length is beyond the eof, reads to eof.
  1210. Parameters
  1211. ----------
  1212. fn: string
  1213. Path to filename
  1214. offset: int
  1215. Byte offset to start read
  1216. length: int
  1217. Number of bytes to read. If None, read to end.
  1218. delimiter: bytes (optional)
  1219. Ensure reading starts and stops at delimiter bytestring
  1220. Examples
  1221. --------
  1222. >>> fs.read_block('data/file.csv', 0, 13) # doctest: +SKIP
  1223. b'Alice, 100\\nBo'
  1224. >>> fs.read_block('data/file.csv', 0, 13, delimiter=b'\\n') # doctest: +SKIP
  1225. b'Alice, 100\\nBob, 200\\n'
  1226. Use ``length=None`` to read to the end of the file.
  1227. >>> fs.read_block('data/file.csv', 0, None, delimiter=b'\\n') # doctest: +SKIP
  1228. b'Alice, 100\\nBob, 200\\nCharlie, 300'
  1229. See Also
  1230. --------
  1231. :func:`fsspec.utils.read_block`
  1232. """
  1233. with self.open(fn, "rb") as f:
  1234. size = f.size
  1235. if length is None:
  1236. length = size
  1237. if size is not None and offset + length > size:
  1238. length = size - offset
  1239. return read_block(f, offset, length, delimiter)
  1240. def to_json(self, *, include_password: bool = True) -> str:
  1241. """
  1242. JSON representation of this filesystem instance.
  1243. Parameters
  1244. ----------
  1245. include_password: bool, default True
  1246. Whether to include the password (if any) in the output.
  1247. Returns
  1248. -------
  1249. JSON string with keys ``cls`` (the python location of this class),
  1250. protocol (text name of this class's protocol, first one in case of
  1251. multiple), ``args`` (positional args, usually empty), and all other
  1252. keyword arguments as their own keys.
  1253. Warnings
  1254. --------
  1255. Serialized filesystems may contain sensitive information which have been
  1256. passed to the constructor, such as passwords and tokens. Make sure you
  1257. store and send them in a secure environment!
  1258. """
  1259. from .json import FilesystemJSONEncoder
  1260. return json.dumps(
  1261. self,
  1262. cls=type(
  1263. "_FilesystemJSONEncoder",
  1264. (FilesystemJSONEncoder,),
  1265. {"include_password": include_password},
  1266. ),
  1267. )
  1268. @staticmethod
  1269. def from_json(blob: str) -> AbstractFileSystem:
  1270. """
  1271. Recreate a filesystem instance from JSON representation.
  1272. See ``.to_json()`` for the expected structure of the input.
  1273. Parameters
  1274. ----------
  1275. blob: str
  1276. Returns
  1277. -------
  1278. file system instance, not necessarily of this particular class.
  1279. Warnings
  1280. --------
  1281. This can import arbitrary modules (as determined by the ``cls`` key).
  1282. Make sure you haven't installed any modules that may execute malicious code
  1283. at import time.
  1284. """
  1285. from .json import FilesystemJSONDecoder
  1286. return json.loads(blob, cls=FilesystemJSONDecoder)
  1287. def to_dict(self, *, include_password: bool = True) -> dict[str, Any]:
  1288. """
  1289. JSON-serializable dictionary representation of this filesystem instance.
  1290. Parameters
  1291. ----------
  1292. include_password: bool, default True
  1293. Whether to include the password (if any) in the output.
  1294. Returns
  1295. -------
  1296. Dictionary with keys ``cls`` (the python location of this class),
  1297. protocol (text name of this class's protocol, first one in case of
  1298. multiple), ``args`` (positional args, usually empty), and all other
  1299. keyword arguments as their own keys.
  1300. Warnings
  1301. --------
  1302. Serialized filesystems may contain sensitive information which have been
  1303. passed to the constructor, such as passwords and tokens. Make sure you
  1304. store and send them in a secure environment!
  1305. """
  1306. from .json import FilesystemJSONEncoder
  1307. json_encoder = FilesystemJSONEncoder()
  1308. cls = type(self)
  1309. proto = self.protocol
  1310. storage_options = dict(self.storage_options)
  1311. if not include_password:
  1312. storage_options.pop("password", None)
  1313. return dict(
  1314. cls=f"{cls.__module__}:{cls.__name__}",
  1315. protocol=proto[0] if isinstance(proto, (tuple, list)) else proto,
  1316. args=json_encoder.make_serializable(self.storage_args),
  1317. **json_encoder.make_serializable(storage_options),
  1318. )
  1319. @staticmethod
  1320. def from_dict(dct: dict[str, Any]) -> AbstractFileSystem:
  1321. """
  1322. Recreate a filesystem instance from dictionary representation.
  1323. See ``.to_dict()`` for the expected structure of the input.
  1324. Parameters
  1325. ----------
  1326. dct: Dict[str, Any]
  1327. Returns
  1328. -------
  1329. file system instance, not necessarily of this particular class.
  1330. Warnings
  1331. --------
  1332. This can import arbitrary modules (as determined by the ``cls`` key).
  1333. Make sure you haven't installed any modules that may execute malicious code
  1334. at import time.
  1335. """
  1336. from .json import FilesystemJSONDecoder
  1337. json_decoder = FilesystemJSONDecoder()
  1338. dct = dict(dct) # Defensive copy
  1339. cls = FilesystemJSONDecoder.try_resolve_fs_cls(dct)
  1340. if cls is None:
  1341. raise ValueError("Not a serialized AbstractFileSystem")
  1342. dct.pop("cls", None)
  1343. dct.pop("protocol", None)
  1344. return cls(
  1345. *json_decoder.unmake_serializable(dct.pop("args", ())),
  1346. **json_decoder.unmake_serializable(dct),
  1347. )
  1348. def _get_pyarrow_filesystem(self):
  1349. """
  1350. Make a version of the FS instance which will be acceptable to pyarrow
  1351. """
  1352. # all instances already also derive from pyarrow
  1353. return self
  1354. def get_mapper(self, root="", check=False, create=False, missing_exceptions=None):
  1355. """Create key/value store based on this file-system
  1356. Makes a MutableMapping interface to the FS at the given root path.
  1357. See ``fsspec.mapping.FSMap`` for further details.
  1358. """
  1359. from .mapping import FSMap
  1360. return FSMap(
  1361. root,
  1362. self,
  1363. check=check,
  1364. create=create,
  1365. missing_exceptions=missing_exceptions,
  1366. )
  1367. @classmethod
  1368. def clear_instance_cache(cls):
  1369. """
  1370. Clear the cache of filesystem instances.
  1371. Notes
  1372. -----
  1373. Unless overridden by setting the ``cachable`` class attribute to False,
  1374. the filesystem class stores a reference to newly created instances. This
  1375. prevents Python's normal rules around garbage collection from working,
  1376. since the instances refcount will not drop to zero until
  1377. ``clear_instance_cache`` is called.
  1378. """
  1379. cls._cache.clear()
  1380. def created(self, path):
  1381. """Return the created timestamp of a file as a datetime.datetime"""
  1382. raise NotImplementedError
  1383. def modified(self, path):
  1384. """Return the modified timestamp of a file as a datetime.datetime"""
  1385. raise NotImplementedError
    def tree(
        self,
        path: str = "/",
        recursion_limit: int = 2,
        max_display: int = 25,
        display_size: bool = False,
        prefix: str = "",
        is_last: bool = True,
        first: bool = True,
        indent_size: int = 4,
    ) -> str:
        """
        Return a tree-like structure of the filesystem starting from the given path as a string.

        Parameters
        ----------
            path: Root path to start traversal from
            recursion_limit: Maximum depth of directory traversal
            max_display: Maximum number of items to display per directory
            display_size: Whether to display file sizes
            prefix: Current line prefix for visual tree structure
            is_last: Whether current item is last in its level (only
                forwarded to recursive calls; not otherwise read here)
            first: Whether this is the first call (displays root path)
            indent_size: Number of spaces by indent

        Returns
        -------
            str: A string representing the tree structure.

        Example
        -------
            >>> from fsspec import filesystem

            >>> fs = filesystem('ftp', host='test.rebex.net', user='demo', password='password')
            >>> tree = fs.tree(display_size=True, recursion_limit=3, indent_size=8, max_display=10)
            >>> print(tree)
        """

        def format_bytes(n: int) -> str:
            """Format bytes as text."""
            # largest unit first; a unit is chosen once n reaches 90% of it
            for prefix, k in (
                ("P", 2**50),
                ("T", 2**40),
                ("G", 2**30),
                ("M", 2**20),
                ("k", 2**10),
            ):
                if n >= 0.9 * k:
                    return f"{n / k:.2f} {prefix}b"
            return f"{n}B"

        result = []

        if first:
            result.append(path)

        # a limit of 0 lists nothing: only the optional root line is returned
        if recursion_limit:
            indent = " " * indent_size
            contents = self.ls(path, detail=True)
            # directories first, then by name
            contents.sort(
                key=lambda x: (x.get("type") != "directory", x.get("name", ""))
            )

            if max_display is not None and len(contents) > max_display:
                displayed_contents = contents[:max_display]
                remaining_count = len(contents) - max_display
            else:
                displayed_contents = contents
                remaining_count = 0

            for i, item in enumerate(displayed_contents):
                # when a "more items" line follows, no entry is the last one
                is_last_item = (i == len(displayed_contents) - 1) and (
                    remaining_count == 0
                )

                branch = (
                    "└" + ("─" * (indent_size - 2))
                    if is_last_item
                    else "├" + ("─" * (indent_size - 2))
                )
                branch += " "
                # prefix used for this item's children
                new_prefix = prefix + (
                    indent if is_last_item else "│" + " " * (indent_size - 1)
                )

                name = os.path.basename(item.get("name", ""))

                if display_size and item.get("type") == "directory":
                    # directories show a count of their immediate children,
                    # which costs one extra ls per directory
                    sub_contents = self.ls(item.get("name", ""), detail=True)
                    num_files = sum(
                        1 for sub_item in sub_contents if sub_item.get("type") == "file"
                    )
                    num_folders = sum(
                        1
                        for sub_item in sub_contents
                        if sub_item.get("type") == "directory"
                    )

                    if num_files == 0 and num_folders == 0:
                        size = " (empty folder)"
                    elif num_files == 0:
                        size = f" ({num_folders} subfolder{'s' if num_folders > 1 else ''})"
                    elif num_folders == 0:
                        size = f" ({num_files} file{'s' if num_files > 1 else ''})"
                    else:
                        size = f" ({num_files} file{'s' if num_files > 1 else ''}, {num_folders} subfolder{'s' if num_folders > 1 else ''})"
                elif display_size and item.get("type") == "file":
                    # NOTE(review): the default 0 only covers a missing key;
                    # an entry with 'size': None would reach format_bytes as
                    # None -- confirm whether that can occur here.
                    size = f" ({format_bytes(item.get('size', 0))})"
                else:
                    size = ""

                result.append(f"{prefix}{branch}{name}{size}")

                if item.get("type") == "directory" and recursion_limit > 0:
                    # the sub-tree comes back as one string; an empty one is
                    # dropped by the filter in the final join
                    result.append(
                        self.tree(
                            path=item.get("name", ""),
                            recursion_limit=recursion_limit - 1,
                            max_display=max_display,
                            display_size=display_size,
                            prefix=new_prefix,
                            is_last=is_last_item,
                            first=False,
                            indent_size=indent_size,
                        )
                    )

            if remaining_count > 0:
                more_message = f"{remaining_count} more item(s) not displayed."
                result.append(
                    f"{prefix}{'└' + ('─' * (indent_size - 2))} {more_message}"
                )

        return "\n".join(_ for _ in result if _)
  1502. # ------------------------------------------------------------------------
  1503. # Aliases
  1504. def read_bytes(self, path, start=None, end=None, **kwargs):
  1505. """Alias of `AbstractFileSystem.cat_file`."""
  1506. return self.cat_file(path, start=start, end=end, **kwargs)
  1507. def write_bytes(self, path, value, **kwargs):
  1508. """Alias of `AbstractFileSystem.pipe_file`."""
  1509. self.pipe_file(path, value, **kwargs)
  1510. def makedir(self, path, create_parents=True, **kwargs):
  1511. """Alias of `AbstractFileSystem.mkdir`."""
  1512. return self.mkdir(path, create_parents=create_parents, **kwargs)
  1513. def mkdirs(self, path, exist_ok=False):
  1514. """Alias of `AbstractFileSystem.makedirs`."""
  1515. return self.makedirs(path, exist_ok=exist_ok)
  1516. def listdir(self, path, detail=True, **kwargs):
  1517. """Alias of `AbstractFileSystem.ls`."""
  1518. return self.ls(path, detail=detail, **kwargs)
  1519. def cp(self, path1, path2, **kwargs):
  1520. """Alias of `AbstractFileSystem.copy`."""
  1521. return self.copy(path1, path2, **kwargs)
  1522. def move(self, path1, path2, **kwargs):
  1523. """Alias of `AbstractFileSystem.mv`."""
  1524. return self.mv(path1, path2, **kwargs)
  1525. def stat(self, path, **kwargs):
  1526. """Alias of `AbstractFileSystem.info`."""
  1527. return self.info(path, **kwargs)
  1528. def disk_usage(self, path, total=True, maxdepth=None, **kwargs):
  1529. """Alias of `AbstractFileSystem.du`."""
  1530. return self.du(path, total=total, maxdepth=maxdepth, **kwargs)
  1531. def rename(self, path1, path2, **kwargs):
  1532. """Alias of `AbstractFileSystem.mv`."""
  1533. return self.mv(path1, path2, **kwargs)
  1534. def delete(self, path, recursive=False, maxdepth=None):
  1535. """Alias of `AbstractFileSystem.rm`."""
  1536. return self.rm(path, recursive=recursive, maxdepth=maxdepth)
  1537. def upload(self, lpath, rpath, recursive=False, **kwargs):
  1538. """Alias of `AbstractFileSystem.put`."""
  1539. return self.put(lpath, rpath, recursive=recursive, **kwargs)
  1540. def download(self, rpath, lpath, recursive=False, **kwargs):
  1541. """Alias of `AbstractFileSystem.get`."""
  1542. return self.get(rpath, lpath, recursive=recursive, **kwargs)
  1543. def sign(self, path, expiration=100, **kwargs):
  1544. """Create a signed URL representing the given path
  1545. Some implementations allow temporary URLs to be generated, as a
  1546. way of delegating credentials.
  1547. Parameters
  1548. ----------
  1549. path : str
  1550. The path on the filesystem
  1551. expiration : int
  1552. Number of seconds to enable the URL for (if supported)
  1553. Returns
  1554. -------
  1555. URL : str
  1556. The signed URL
  1557. Raises
  1558. ------
  1559. NotImplementedError : if method is not implemented for a filesystem
  1560. """
  1561. raise NotImplementedError("Sign is not implemented for this filesystem")
  1562. def _isfilestore(self):
  1563. # Originally inherited from pyarrow DaskFileSystem. Keeping this
  1564. # here for backwards compatibility as long as pyarrow uses its
  1565. # legacy fsspec-compatible filesystems and thus accepts fsspec
  1566. # filesystems as well
  1567. return False
  1568. class AbstractBufferedFile(io.IOBase):
  1569. """Convenient class to derive from to provide buffering
  1570. In the case that the backend does not provide a pythonic file-like object
  1571. already, this class contains much of the logic to build one. The only
  1572. methods that need to be overridden are ``_upload_chunk``,
  1573. ``_initiate_upload`` and ``_fetch_range``.
  1574. """
  1575. DEFAULT_BLOCK_SIZE = 5 * 2**20
  1576. _details = None
  1577. def __init__(
  1578. self,
  1579. fs,
  1580. path,
  1581. mode="rb",
  1582. block_size="default",
  1583. autocommit=True,
  1584. cache_type="readahead",
  1585. cache_options=None,
  1586. size=None,
  1587. **kwargs,
  1588. ):
  1589. """
  1590. Template for files with buffered reading and writing
  1591. Parameters
  1592. ----------
  1593. fs: instance of FileSystem
  1594. path: str
  1595. location in file-system
  1596. mode: str
  1597. Normal file modes. Currently only 'wb', 'ab' or 'rb'. Some file
  1598. systems may be read-only, and some may not support append.
  1599. block_size: int
  1600. Buffer size for reading or writing, 'default' for class default
  1601. autocommit: bool
  1602. Whether to write to final destination; may only impact what
  1603. happens when file is being closed.
  1604. cache_type: {"readahead", "none", "mmap", "bytes"}, default "readahead"
  1605. Caching policy in read mode. See the definitions in ``core``.
  1606. cache_options : dict
  1607. Additional options passed to the constructor for the cache specified
  1608. by `cache_type`.
  1609. size: int
  1610. If given and in read mode, suppressed having to look up the file size
  1611. kwargs:
  1612. Gets stored as self.kwargs
  1613. """
  1614. from .core import caches
  1615. self.path = path
  1616. self.fs = fs
  1617. self.mode = mode
  1618. self.blocksize = (
  1619. self.DEFAULT_BLOCK_SIZE if block_size in ["default", None] else block_size
  1620. )
  1621. self.loc = 0
  1622. self.autocommit = autocommit
  1623. self.end = None
  1624. self.start = None
  1625. self.closed = False
  1626. if cache_options is None:
  1627. cache_options = {}
  1628. if "trim" in kwargs:
  1629. warnings.warn(
  1630. "Passing 'trim' to control the cache behavior has been deprecated. "
  1631. "Specify it within the 'cache_options' argument instead.",
  1632. FutureWarning,
  1633. )
  1634. cache_options["trim"] = kwargs.pop("trim")
  1635. self.kwargs = kwargs
  1636. if mode not in {"ab", "rb", "wb", "xb"}:
  1637. raise NotImplementedError("File mode not supported")
  1638. if mode == "rb":
  1639. if size is not None:
  1640. self.size = size
  1641. else:
  1642. self.size = self.details["size"]
  1643. self.cache = caches[cache_type](
  1644. self.blocksize, self._fetch_range, self.size, **cache_options
  1645. )
  1646. else:
  1647. self.buffer = io.BytesIO()
  1648. self.offset = None
  1649. self.forced = False
  1650. self.location = None
  1651. @property
  1652. def details(self):
  1653. if self._details is None:
  1654. self._details = self.fs.info(self.path)
  1655. return self._details
  1656. @details.setter
  1657. def details(self, value):
  1658. self._details = value
  1659. self.size = value["size"]
  1660. @property
  1661. def full_name(self):
  1662. return _unstrip_protocol(self.path, self.fs)
  1663. @property
  1664. def closed(self):
  1665. # get around this attr being read-only in IOBase
  1666. # use getattr here, since this can be called during del
  1667. return getattr(self, "_closed", True)
  1668. @closed.setter
  1669. def closed(self, c):
  1670. self._closed = c
  1671. def __hash__(self):
  1672. if "w" in self.mode:
  1673. return id(self)
  1674. else:
  1675. return int(tokenize(self.details), 16)
  1676. def __eq__(self, other):
  1677. """Files are equal if they have the same checksum, only in read mode"""
  1678. if self is other:
  1679. return True
  1680. return (
  1681. isinstance(other, type(self))
  1682. and self.mode == "rb"
  1683. and other.mode == "rb"
  1684. and hash(self) == hash(other)
  1685. )
  1686. def commit(self):
  1687. """Move from temp to final destination"""
  1688. def discard(self):
  1689. """Throw away temporary file"""
  1690. def info(self):
  1691. """File information about this path"""
  1692. if self.readable():
  1693. return self.details
  1694. else:
  1695. raise ValueError("Info not available while writing")
  1696. def tell(self):
  1697. """Current file location"""
  1698. return self.loc
  1699. def seek(self, loc, whence=0):
  1700. """Set current file location
  1701. Parameters
  1702. ----------
  1703. loc: int
  1704. byte location
  1705. whence: {0, 1, 2}
  1706. from start of file, current location or end of file, resp.
  1707. """
  1708. loc = int(loc)
  1709. if not self.mode == "rb":
  1710. raise OSError(ESPIPE, "Seek only available in read mode")
  1711. if whence == 0:
  1712. nloc = loc
  1713. elif whence == 1:
  1714. nloc = self.loc + loc
  1715. elif whence == 2:
  1716. nloc = self.size + loc
  1717. else:
  1718. raise ValueError(f"invalid whence ({whence}, should be 0, 1 or 2)")
  1719. if nloc < 0:
  1720. raise ValueError("Seek before start of file")
  1721. self.loc = nloc
  1722. return self.loc
  1723. def write(self, data):
  1724. """
  1725. Write data to buffer.
  1726. Buffer only sent on flush() or if buffer is greater than
  1727. or equal to blocksize.
  1728. Parameters
  1729. ----------
  1730. data: bytes
  1731. Set of bytes to be written.
  1732. """
  1733. if not self.writable():
  1734. raise ValueError("File not in write mode")
  1735. if self.closed:
  1736. raise ValueError("I/O operation on closed file.")
  1737. if self.forced:
  1738. raise ValueError("This file has been force-flushed, can only close")
  1739. out = self.buffer.write(data)
  1740. self.loc += out
  1741. if self.buffer.tell() >= self.blocksize:
  1742. self.flush()
  1743. return out
  1744. def flush(self, force=False):
  1745. """
  1746. Write buffered data to backend store.
  1747. Writes the current buffer, if it is larger than the block-size, or if
  1748. the file is being closed.
  1749. Parameters
  1750. ----------
  1751. force: bool
  1752. When closing, write the last block even if it is smaller than
  1753. blocks are allowed to be. Disallows further writing to this file.
  1754. """
  1755. if self.closed:
  1756. raise ValueError("Flush on closed file")
  1757. if force and self.forced:
  1758. raise ValueError("Force flush cannot be called more than once")
  1759. if force:
  1760. self.forced = True
  1761. if self.readable():
  1762. # no-op to flush on read-mode
  1763. return
  1764. if not force and self.buffer.tell() < self.blocksize:
  1765. # Defer write on small block
  1766. return
  1767. if self.offset is None:
  1768. # Initialize a multipart upload
  1769. self.offset = 0
  1770. try:
  1771. self._initiate_upload()
  1772. except:
  1773. self.closed = True
  1774. raise
  1775. if self._upload_chunk(final=force) is not False:
  1776. self.offset += self.buffer.seek(0, 2)
  1777. self.buffer = io.BytesIO()
  1778. def _upload_chunk(self, final=False):
  1779. """Write one part of a multi-block file upload
  1780. Parameters
  1781. ==========
  1782. final: bool
  1783. This is the last block, so should complete file, if
  1784. self.autocommit is True.
  1785. """
  1786. # may not yet have been initialized, may need to call _initialize_upload
  1787. def _initiate_upload(self):
  1788. """Create remote file/upload"""
  1789. pass
  1790. def _fetch_range(self, start, end):
  1791. """Get the specified set of bytes from remote"""
  1792. return self.fs.cat_file(self.path, start=start, end=end)
  1793. def read(self, length=-1):
  1794. """
  1795. Return data from cache, or fetch pieces as necessary
  1796. Parameters
  1797. ----------
  1798. length: int (-1)
  1799. Number of bytes to read; if <0, all remaining bytes.
  1800. """
  1801. length = -1 if length is None else int(length)
  1802. if self.mode != "rb":
  1803. raise ValueError("File not in read mode")
  1804. if length < 0:
  1805. length = self.size - self.loc
  1806. if self.closed:
  1807. raise ValueError("I/O operation on closed file.")
  1808. if length == 0:
  1809. # don't even bother calling fetch
  1810. return b""
  1811. out = self.cache._fetch(self.loc, self.loc + length)
  1812. logger.debug(
  1813. "%s read: %i - %i %s",
  1814. self,
  1815. self.loc,
  1816. self.loc + length,
  1817. self.cache._log_stats(),
  1818. )
  1819. self.loc += len(out)
  1820. return out
  1821. def readinto(self, b):
  1822. """mirrors builtin file's readinto method
  1823. https://docs.python.org/3/library/io.html#io.RawIOBase.readinto
  1824. """
  1825. out = memoryview(b).cast("B")
  1826. data = self.read(out.nbytes)
  1827. out[: len(data)] = data
  1828. return len(data)
  1829. def readuntil(self, char=b"\n", blocks=None):
  1830. """Return data between current position and first occurrence of char
  1831. char is included in the output, except if the end of the tile is
  1832. encountered first.
  1833. Parameters
  1834. ----------
  1835. char: bytes
  1836. Thing to find
  1837. blocks: None or int
  1838. How much to read in each go. Defaults to file blocksize - which may
  1839. mean a new read on every call.
  1840. """
  1841. out = []
  1842. while True:
  1843. start = self.tell()
  1844. part = self.read(blocks or self.blocksize)
  1845. if len(part) == 0:
  1846. break
  1847. found = part.find(char)
  1848. if found > -1:
  1849. out.append(part[: found + len(char)])
  1850. self.seek(start + found + len(char))
  1851. break
  1852. out.append(part)
  1853. return b"".join(out)
  1854. def readline(self):
  1855. """Read until and including the first occurrence of newline character
  1856. Note that, because of character encoding, this is not necessarily a
  1857. true line ending.
  1858. """
  1859. return self.readuntil(b"\n")
  1860. def __next__(self):
  1861. out = self.readline()
  1862. if out:
  1863. return out
  1864. raise StopIteration
  1865. def __iter__(self):
  1866. return self
  1867. def readlines(self):
  1868. """Return all data, split by the newline character, including the newline character"""
  1869. data = self.read()
  1870. lines = data.split(b"\n")
  1871. out = [l + b"\n" for l in lines[:-1]]
  1872. if data.endswith(b"\n"):
  1873. return out
  1874. else:
  1875. return out + [lines[-1]]
  1876. # return list(self) ???
  1877. def readinto1(self, b):
  1878. return self.readinto(b)
  1879. def close(self):
  1880. """Close file
  1881. Finalizes writes, discards cache
  1882. """
  1883. if getattr(self, "_unclosable", False):
  1884. return
  1885. if self.closed:
  1886. return
  1887. try:
  1888. if self.mode == "rb":
  1889. self.cache = None
  1890. else:
  1891. if not self.forced:
  1892. self.flush(force=True)
  1893. if self.fs is not None:
  1894. self.fs.invalidate_cache(self.path)
  1895. self.fs.invalidate_cache(self.fs._parent(self.path))
  1896. finally:
  1897. self.closed = True
  1898. def readable(self):
  1899. """Whether opened for reading"""
  1900. return "r" in self.mode and not self.closed
  1901. def seekable(self):
  1902. """Whether is seekable (only in read mode)"""
  1903. return self.readable()
  1904. def writable(self):
  1905. """Whether opened for writing"""
  1906. return self.mode in {"wb", "ab", "xb"} and not self.closed
  1907. def __reduce__(self):
  1908. if self.mode != "rb":
  1909. raise RuntimeError("Pickling a writeable file is not supported")
  1910. return reopen, (
  1911. self.fs,
  1912. self.path,
  1913. self.mode,
  1914. self.blocksize,
  1915. self.loc,
  1916. self.size,
  1917. self.autocommit,
  1918. self.cache.name if self.cache else "none",
  1919. self.kwargs,
  1920. )
  1921. def __del__(self):
  1922. if not self.closed:
  1923. self.close()
  1924. def __str__(self):
  1925. return f"<File-like object {type(self.fs).__name__}, {self.path}>"
  1926. __repr__ = __str__
  1927. def __enter__(self):
  1928. return self
  1929. def __exit__(self, *args):
  1930. self.close()
  1931. def reopen(fs, path, mode, blocksize, loc, size, autocommit, cache_type, kwargs):
  1932. file = fs.open(
  1933. path,
  1934. mode=mode,
  1935. block_size=blocksize,
  1936. autocommit=autocommit,
  1937. cache_type=cache_type,
  1938. size=size,
  1939. **kwargs,
  1940. )
  1941. if loc > 0:
  1942. file.seek(loc)
  1943. return file