spec.py 76 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
72278227922802281228222832284
  1. from __future__ import annotations
  2. import io
  3. import json
  4. import logging
  5. import os
  6. import threading
  7. import warnings
  8. import weakref
  9. from errno import ESPIPE
  10. from glob import has_magic
  11. from hashlib import sha256
  12. from typing import Any, ClassVar
  13. from .callbacks import DEFAULT_CALLBACK
  14. from .config import apply_config, conf
  15. from .dircache import DirCache
  16. from .transaction import Transaction
  17. from .utils import (
  18. _unstrip_protocol,
  19. glob_translate,
  20. isfilelike,
  21. other_paths,
  22. read_block,
  23. stringify_path,
  24. tokenize,
  25. )
# Module-level logger; all messages from this module go to the "fsspec" logger.
logger = logging.getLogger("fsspec")
  27. def make_instance(cls, args, kwargs):
  28. return cls(*args, **kwargs)
  29. class _Cached(type):
  30. """
  31. Metaclass for caching file system instances.
  32. Notes
  33. -----
  34. Instances are cached according to
  35. * The values of the class attributes listed in `_extra_tokenize_attributes`
  36. * The arguments passed to ``__init__``.
  37. This creates an additional reference to the filesystem, which prevents the
  38. filesystem from being garbage collected when all *user* references go away.
  39. A call to the :meth:`AbstractFileSystem.clear_instance_cache` must *also*
  40. be made for a filesystem instance to be garbage collected.
  41. """
  42. def __init__(cls, *args, **kwargs):
  43. super().__init__(*args, **kwargs)
  44. # Note: we intentionally create a reference here, to avoid garbage
  45. # collecting instances when all other references are gone. To really
  46. # delete a FileSystem, the cache must be cleared.
  47. if conf.get("weakref_instance_cache"): # pragma: no cover
  48. # debug option for analysing fork/spawn conditions
  49. cls._cache = weakref.WeakValueDictionary()
  50. else:
  51. cls._cache = {}
  52. cls._pid = os.getpid()
  53. def __call__(cls, *args, **kwargs):
  54. kwargs = apply_config(cls, kwargs)
  55. extra_tokens = tuple(
  56. getattr(cls, attr, None) for attr in cls._extra_tokenize_attributes
  57. )
  58. strip_tokenize_options = {
  59. k: kwargs.pop(k) for k in cls._strip_tokenize_options if k in kwargs
  60. }
  61. token = tokenize(
  62. cls, cls._pid, threading.get_ident(), *args, *extra_tokens, **kwargs
  63. )
  64. skip = kwargs.pop("skip_instance_cache", False)
  65. if os.getpid() != cls._pid:
  66. cls._cache.clear()
  67. cls._pid = os.getpid()
  68. if not skip and cls.cachable and token in cls._cache:
  69. cls._latest = token
  70. return cls._cache[token]
  71. else:
  72. obj = super().__call__(*args, **kwargs, **strip_tokenize_options)
  73. # Setting _fs_token here causes some static linters to complain.
  74. obj._fs_token_ = token
  75. obj.storage_args = args
  76. obj.storage_options = kwargs
  77. if obj.async_impl and obj.mirror_sync_methods:
  78. from .asyn import mirror_sync_methods
  79. mirror_sync_methods(obj)
  80. if cls.cachable and not skip:
  81. cls._latest = token
  82. cls._cache[token] = obj
  83. return obj
class AbstractFileSystem(metaclass=_Cached):
    """
    An abstract super-class for pythonic file-systems

    Implementations are expected to be compatible with or, better, subclass
    from here.

    Instances are created through the ``_Cached`` metaclass, which may hand
    back a previously created instance instead of building a new one.
    """

    cachable = True  # this class can be cached, instances reused
    # Flipped to True by ``__init__`` so that a reused instance is not
    # initialised a second time.
    _cached = False
    blocksize = 2**22  # 4 * 2**20; presumably a size in bytes - confirm against users
    sep = "/"
    protocol: ClassVar[str | tuple[str, ...]] = "abstract"
    # Token of the instance most recently created or returned by the metaclass;
    # read by ``current()``.
    _latest = None
    # When both flags below are truthy, the metaclass applies
    # ``asyn.mirror_sync_methods`` to every newly created instance.
    async_impl = False
    mirror_sync_methods = False
    root_marker = ""  # For some FSs, may require leading '/' or other character
    # Class instantiated (with the filesystem as its only argument) by the
    # ``transaction`` property and by ``start_transaction``.
    transaction_type = Transaction

    #: Extra *class attributes* that should be considered when hashing.
    _extra_tokenize_attributes = ()
    #: *storage options* that should not be considered when hashing.
    _strip_tokenize_options = ()

    # Set by _Cached metaclass
    storage_args: tuple[Any, ...]
    storage_options: dict[str, Any]
  107. def __init__(self, *args, **storage_options):
  108. """Create and configure file-system instance
  109. Instances may be cachable, so if similar enough arguments are seen
  110. a new instance is not required. The token attribute exists to allow
  111. implementations to cache instances if they wish.
  112. A reasonable default should be provided if there are no arguments.
  113. Subclasses should call this method.
  114. Parameters
  115. ----------
  116. use_listings_cache, listings_expiry_time, max_paths:
  117. passed to ``DirCache``, if the implementation supports
  118. directory listing caching. Pass use_listings_cache=False
  119. to disable such caching.
  120. skip_instance_cache: bool
  121. If this is a cachable implementation, pass True here to force
  122. creating a new instance even if a matching instance exists, and prevent
  123. storing this instance.
  124. asynchronous: bool
  125. loop: asyncio-compatible IOLoop or None
  126. """
  127. if self._cached:
  128. # reusing instance, don't change
  129. return
  130. self._cached = True
  131. self._intrans = False
  132. self._transaction = None
  133. self._invalidated_caches_in_transaction = []
  134. self.dircache = DirCache(**storage_options)
  135. if storage_options.pop("add_docs", None):
  136. warnings.warn("add_docs is no longer supported.", FutureWarning)
  137. if storage_options.pop("add_aliases", None):
  138. warnings.warn("add_aliases has been removed.", FutureWarning)
  139. # This is set in _Cached
  140. self._fs_token_ = None
  141. @property
  142. def fsid(self):
  143. """Persistent filesystem id that can be used to compare filesystems
  144. across sessions.
  145. """
  146. raise NotImplementedError
  147. @property
  148. def _fs_token(self):
  149. return self._fs_token_
  150. def __dask_tokenize__(self):
  151. return self._fs_token
  152. def __hash__(self):
  153. return int(self._fs_token, 16)
  154. def __eq__(self, other):
  155. return isinstance(other, type(self)) and self._fs_token == other._fs_token
  156. def __reduce__(self):
  157. return make_instance, (type(self), self.storage_args, self.storage_options)
  158. @classmethod
  159. def _strip_protocol(cls, path):
  160. """Turn path from fully-qualified to file-system-specific
  161. May require FS-specific handling, e.g., for relative paths or links.
  162. """
  163. if isinstance(path, list):
  164. return [cls._strip_protocol(p) for p in path]
  165. path = stringify_path(path)
  166. protos = (cls.protocol,) if isinstance(cls.protocol, str) else cls.protocol
  167. for protocol in protos:
  168. if path.startswith(protocol + "://"):
  169. path = path[len(protocol) + 3 :]
  170. elif path.startswith(protocol + "::"):
  171. path = path[len(protocol) + 2 :]
  172. path = path.rstrip("/")
  173. # use of root_marker to make minimum required path, e.g., "/"
  174. return path or cls.root_marker
  175. def unstrip_protocol(self, name: str) -> str:
  176. """Format FS-specific path to generic, including protocol"""
  177. protos = (self.protocol,) if isinstance(self.protocol, str) else self.protocol
  178. for protocol in protos:
  179. if name.startswith(f"{protocol}://"):
  180. return name
  181. return f"{protos[0]}://{name}"
  182. @staticmethod
  183. def _get_kwargs_from_urls(path):
  184. """If kwargs can be encoded in the paths, extract them here
  185. This should happen before instantiation of the class; incoming paths
  186. then should be amended to strip the options in methods.
  187. Examples may look like an sftp path "sftp://user@host:/my/path", where
  188. the user and host should become kwargs and later get stripped.
  189. """
  190. # by default, nothing happens
  191. return {}
  192. @classmethod
  193. def current(cls):
  194. """Return the most recently instantiated FileSystem
  195. If no instance has been created, then create one with defaults
  196. """
  197. if cls._latest in cls._cache:
  198. return cls._cache[cls._latest]
  199. return cls()
  200. @property
  201. def transaction(self):
  202. """A context within which files are committed together upon exit
  203. Requires the file class to implement `.commit()` and `.discard()`
  204. for the normal and exception cases.
  205. """
  206. if self._transaction is None:
  207. self._transaction = self.transaction_type(self)
  208. return self._transaction
  209. def start_transaction(self):
  210. """Begin write transaction for deferring files, non-context version"""
  211. self._intrans = True
  212. self._transaction = self.transaction_type(self)
  213. return self.transaction
  214. def end_transaction(self):
  215. """Finish write transaction, non-context version"""
  216. self.transaction.complete()
  217. self._transaction = None
  218. # The invalid cache must be cleared after the transaction is completed.
  219. for path in self._invalidated_caches_in_transaction:
  220. self.invalidate_cache(path)
  221. self._invalidated_caches_in_transaction.clear()
  222. def invalidate_cache(self, path=None):
  223. """
  224. Discard any cached directory information
  225. Parameters
  226. ----------
  227. path: string or None
  228. If None, clear all listings cached else listings at or under given
  229. path.
  230. """
  231. # Not necessary to implement invalidation mechanism, may have no cache.
  232. # But if have, you should call this method of parent class from your
  233. # subclass to ensure expiring caches after transacations correctly.
  234. # See the implementation of FTPFileSystem in ftp.py
  235. if self._intrans:
  236. self._invalidated_caches_in_transaction.append(path)
  237. def mkdir(self, path, create_parents=True, **kwargs):
  238. """
  239. Create directory entry at path
  240. For systems that don't have true directories, may create an for
  241. this instance only and not touch the real filesystem
  242. Parameters
  243. ----------
  244. path: str
  245. location
  246. create_parents: bool
  247. if True, this is equivalent to ``makedirs``
  248. kwargs:
  249. may be permissions, etc.
  250. """
  251. pass # not necessary to implement, may not have directories
  252. def makedirs(self, path, exist_ok=False):
  253. """Recursively make directories
  254. Creates directory at path and any intervening required directories.
  255. Raises exception if, for instance, the path already exists but is a
  256. file.
  257. Parameters
  258. ----------
  259. path: str
  260. leaf directory name
  261. exist_ok: bool (False)
  262. If False, will error if the target already exists
  263. """
  264. pass # not necessary to implement, may not have directories
  265. def rmdir(self, path):
  266. """Remove a directory, if empty"""
  267. pass # not necessary to implement, may not have directories
  268. def ls(self, path, detail=True, **kwargs):
  269. """List objects at path.
  270. This should include subdirectories and files at that location. The
  271. difference between a file and a directory must be clear when details
  272. are requested.
  273. The specific keys, or perhaps a FileInfo class, or similar, is TBD,
  274. but must be consistent across implementations.
  275. Must include:
  276. - full path to the entry (without protocol)
  277. - size of the entry, in bytes. If the value cannot be determined, will
  278. be ``None``.
  279. - type of entry, "file", "directory" or other
  280. Additional information
  281. may be present, appropriate to the file-system, e.g., generation,
  282. checksum, etc.
  283. May use refresh=True|False to allow use of self._ls_from_cache to
  284. check for a saved listing and avoid calling the backend. This would be
  285. common where listing may be expensive.
  286. Parameters
  287. ----------
  288. path: str
  289. detail: bool
  290. if True, gives a list of dictionaries, where each is the same as
  291. the result of ``info(path)``. If False, gives a list of paths
  292. (str).
  293. kwargs: may have additional backend-specific options, such as version
  294. information
  295. Returns
  296. -------
  297. List of strings if detail is False, or list of directory information
  298. dicts if detail is True.
  299. """
  300. raise NotImplementedError
  301. def _ls_from_cache(self, path):
  302. """Check cache for listing
  303. Returns listing, if found (may be empty list for a directly that exists
  304. but contains nothing), None if not in cache.
  305. """
  306. parent = self._parent(path)
  307. try:
  308. return self.dircache[path.rstrip("/")]
  309. except KeyError:
  310. pass
  311. try:
  312. files = [
  313. f
  314. for f in self.dircache[parent]
  315. if f["name"] == path
  316. or (f["name"] == path.rstrip("/") and f["type"] == "directory")
  317. ]
  318. if len(files) == 0:
  319. # parent dir was listed but did not contain this file
  320. raise FileNotFoundError(path)
  321. return files
  322. except KeyError:
  323. pass
  324. def walk(self, path, maxdepth=None, topdown=True, on_error="omit", **kwargs):
  325. """Return all files under the given path.
  326. List all files, recursing into subdirectories; output is iterator-style,
  327. like ``os.walk()``. For a simple list of files, ``find()`` is available.
  328. When topdown is True, the caller can modify the dirnames list in-place (perhaps
  329. using del or slice assignment), and walk() will
  330. only recurse into the subdirectories whose names remain in dirnames;
  331. this can be used to prune the search, impose a specific order of visiting,
  332. or even to inform walk() about directories the caller creates or renames before
  333. it resumes walk() again.
  334. Modifying dirnames when topdown is False has no effect. (see os.walk)
  335. Note that the "files" outputted will include anything that is not
  336. a directory, such as links.
  337. Parameters
  338. ----------
  339. path: str
  340. Root to recurse into
  341. maxdepth: int
  342. Maximum recursion depth. None means limitless, but not recommended
  343. on link-based file-systems.
  344. topdown: bool (True)
  345. Whether to walk the directory tree from the top downwards or from
  346. the bottom upwards.
  347. on_error: "omit", "raise", a callable
  348. if omit (default), path with exception will simply be empty;
  349. If raise, an underlying exception will be raised;
  350. if callable, it will be called with a single OSError instance as argument
  351. kwargs: passed to ``ls``
  352. """
  353. if maxdepth is not None and maxdepth < 1:
  354. raise ValueError("maxdepth must be at least 1")
  355. path = self._strip_protocol(path)
  356. full_dirs = {}
  357. dirs = {}
  358. files = {}
  359. detail = kwargs.pop("detail", False)
  360. try:
  361. listing = self.ls(path, detail=True, **kwargs)
  362. except (FileNotFoundError, OSError) as e:
  363. if on_error == "raise":
  364. raise
  365. if callable(on_error):
  366. on_error(e)
  367. return
  368. for info in listing:
  369. # each info name must be at least [path]/part , but here
  370. # we check also for names like [path]/part/
  371. pathname = info["name"].rstrip("/")
  372. name = pathname.rsplit("/", 1)[-1]
  373. if info["type"] == "directory" and pathname != path:
  374. # do not include "self" path
  375. full_dirs[name] = pathname
  376. dirs[name] = info
  377. elif pathname == path:
  378. # file-like with same name as give path
  379. files[""] = info
  380. else:
  381. files[name] = info
  382. if not detail:
  383. dirs = list(dirs)
  384. files = list(files)
  385. if topdown:
  386. # Yield before recursion if walking top down
  387. yield path, dirs, files
  388. if maxdepth is not None:
  389. maxdepth -= 1
  390. if maxdepth < 1:
  391. if not topdown:
  392. yield path, dirs, files
  393. return
  394. for d in dirs:
  395. yield from self.walk(
  396. full_dirs[d],
  397. maxdepth=maxdepth,
  398. detail=detail,
  399. topdown=topdown,
  400. **kwargs,
  401. )
  402. if not topdown:
  403. # Yield after recursion if walking bottom up
  404. yield path, dirs, files
  405. def find(self, path, maxdepth=None, withdirs=False, detail=False, **kwargs):
  406. """List all files below path.
  407. Like posix ``find`` command without conditions
  408. Parameters
  409. ----------
  410. path : str
  411. maxdepth: int or None
  412. If not None, the maximum number of levels to descend
  413. withdirs: bool
  414. Whether to include directory paths in the output. This is True
  415. when used by glob, but users usually only want files.
  416. kwargs are passed to ``ls``.
  417. """
  418. # TODO: allow equivalent of -name parameter
  419. path = self._strip_protocol(path)
  420. out = {}
  421. # Add the root directory if withdirs is requested
  422. # This is needed for posix glob compliance
  423. if withdirs and path != "" and self.isdir(path):
  424. out[path] = self.info(path)
  425. for _, dirs, files in self.walk(path, maxdepth, detail=True, **kwargs):
  426. if withdirs:
  427. files.update(dirs)
  428. out.update({info["name"]: info for name, info in files.items()})
  429. if not out and self.isfile(path):
  430. # walk works on directories, but find should also return [path]
  431. # when path happens to be a file
  432. out[path] = {}
  433. names = sorted(out)
  434. if not detail:
  435. return names
  436. else:
  437. return {name: out[name] for name in names}
  438. def du(self, path, total=True, maxdepth=None, withdirs=False, **kwargs):
  439. """Space used by files and optionally directories within a path
  440. Directory size does not include the size of its contents.
  441. Parameters
  442. ----------
  443. path: str
  444. total: bool
  445. Whether to sum all the file sizes
  446. maxdepth: int or None
  447. Maximum number of directory levels to descend, None for unlimited.
  448. withdirs: bool
  449. Whether to include directory paths in the output.
  450. kwargs: passed to ``find``
  451. Returns
  452. -------
  453. Dict of {path: size} if total=False, or int otherwise, where numbers
  454. refer to bytes used.
  455. """
  456. sizes = {}
  457. if withdirs and self.isdir(path):
  458. # Include top-level directory in output
  459. info = self.info(path)
  460. sizes[info["name"]] = info["size"]
  461. for f in self.find(path, maxdepth=maxdepth, withdirs=withdirs, **kwargs):
  462. info = self.info(f)
  463. sizes[info["name"]] = info["size"]
  464. if total:
  465. return sum(sizes.values())
  466. else:
  467. return sizes
  468. def glob(self, path, maxdepth=None, **kwargs):
  469. """Find files by glob-matching.
  470. Pattern matching capabilities for finding files that match the given pattern.
  471. Parameters
  472. ----------
  473. path: str
  474. The glob pattern to match against
  475. maxdepth: int or None
  476. Maximum depth for ``'**'`` patterns. Applied on the first ``'**'`` found.
  477. Must be at least 1 if provided.
  478. kwargs:
  479. Additional arguments passed to ``find`` (e.g., detail=True)
  480. Returns
  481. -------
  482. List of matched paths, or dict of paths and their info if detail=True
  483. Notes
  484. -----
  485. Supported patterns:
  486. - '*': Matches any sequence of characters within a single directory level
  487. - ``'**'``: Matches any number of directory levels (must be an entire path component)
  488. - '?': Matches exactly one character
  489. - '[abc]': Matches any character in the set
  490. - '[a-z]': Matches any character in the range
  491. - '[!abc]': Matches any character NOT in the set
  492. Special behaviors:
  493. - If the path ends with '/', only folders are returned
  494. - Consecutive '*' characters are compressed into a single '*'
  495. - Empty brackets '[]' never match anything
  496. - Negated empty brackets '[!]' match any single character
  497. - Special characters in character classes are escaped properly
  498. Limitations:
  499. - ``'**'`` must be a complete path component (e.g., ``'a/**/b'``, not ``'a**b'``)
  500. - No brace expansion ('{a,b}.txt')
  501. - No extended glob patterns ('+(pattern)', '!(pattern)')
  502. """
  503. if maxdepth is not None and maxdepth < 1:
  504. raise ValueError("maxdepth must be at least 1")
  505. import re
  506. seps = (os.path.sep, os.path.altsep) if os.path.altsep else (os.path.sep,)
  507. ends_with_sep = path.endswith(seps) # _strip_protocol strips trailing slash
  508. path = self._strip_protocol(path)
  509. append_slash_to_dirname = ends_with_sep or path.endswith(
  510. tuple(sep + "**" for sep in seps)
  511. )
  512. idx_star = path.find("*") if path.find("*") >= 0 else len(path)
  513. idx_qmark = path.find("?") if path.find("?") >= 0 else len(path)
  514. idx_brace = path.find("[") if path.find("[") >= 0 else len(path)
  515. min_idx = min(idx_star, idx_qmark, idx_brace)
  516. detail = kwargs.pop("detail", False)
  517. withdirs = kwargs.pop("withdirs", True)
  518. if not has_magic(path):
  519. if self.exists(path, **kwargs):
  520. if not detail:
  521. return [path]
  522. else:
  523. return {path: self.info(path, **kwargs)}
  524. else:
  525. if not detail:
  526. return [] # glob of non-existent returns empty
  527. else:
  528. return {}
  529. elif "/" in path[:min_idx]:
  530. min_idx = path[:min_idx].rindex("/")
  531. root = path[: min_idx + 1]
  532. depth = path[min_idx + 1 :].count("/") + 1
  533. else:
  534. root = ""
  535. depth = path[min_idx + 1 :].count("/") + 1
  536. if "**" in path:
  537. if maxdepth is not None:
  538. idx_double_stars = path.find("**")
  539. depth_double_stars = path[idx_double_stars:].count("/") + 1
  540. depth = depth - depth_double_stars + maxdepth
  541. else:
  542. depth = None
  543. allpaths = self.find(
  544. root, maxdepth=depth, withdirs=withdirs, detail=True, **kwargs
  545. )
  546. pattern = glob_translate(path + ("/" if ends_with_sep else ""))
  547. pattern = re.compile(pattern)
  548. out = {
  549. p: info
  550. for p, info in sorted(allpaths.items())
  551. if pattern.match(
  552. p + "/"
  553. if append_slash_to_dirname and info["type"] == "directory"
  554. else p
  555. )
  556. }
  557. if detail:
  558. return out
  559. else:
  560. return list(out)
  561. def exists(self, path, **kwargs):
  562. """Is there a file at the given path"""
  563. try:
  564. self.info(path, **kwargs)
  565. return True
  566. except: # noqa: E722
  567. # any exception allowed bar FileNotFoundError?
  568. return False
  569. def lexists(self, path, **kwargs):
  570. """If there is a file at the given path (including
  571. broken links)"""
  572. return self.exists(path)
  573. def info(self, path, **kwargs):
  574. """Give details of entry at path
  575. Returns a single dictionary, with exactly the same information as ``ls``
  576. would with ``detail=True``.
  577. The default implementation calls ls and could be overridden by a
  578. shortcut. kwargs are passed on to ```ls()``.
  579. Some file systems might not be able to measure the file's size, in
  580. which case, the returned dict will include ``'size': None``.
  581. Returns
  582. -------
  583. dict with keys: name (full path in the FS), size (in bytes), type (file,
  584. directory, or something else) and other FS-specific keys.
  585. """
  586. path = self._strip_protocol(path)
  587. out = self.ls(self._parent(path), detail=True, **kwargs)
  588. out = [o for o in out if o["name"].rstrip("/") == path]
  589. if out:
  590. return out[0]
  591. out = self.ls(path, detail=True, **kwargs)
  592. path = path.rstrip("/")
  593. out1 = [o for o in out if o["name"].rstrip("/") == path]
  594. if len(out1) == 1:
  595. if "size" not in out1[0]:
  596. out1[0]["size"] = None
  597. return out1[0]
  598. elif len(out1) > 1 or out:
  599. return {"name": path, "size": 0, "type": "directory"}
  600. else:
  601. raise FileNotFoundError(path)
  def checksum(self, path):
      """Integer fingerprint of the file's current ``info()`` record.

      An unchanged checksum between two calls means the info record was
      unchanged; a different checksum means the content *may* have changed.

      Backends should usually override this: the default hashes whatever
      ``info`` returns, which is helpful if that includes creation or
      modification times but harmful if it includes access times.
      """
      details = self.info(path)
      hex_digest = tokenize(details)
      return int(hex_digest, 16)
  def size(self, path):
      """Return the ``"size"`` field of ``info(path)`` (None if absent)"""
      details = self.info(path)
      return details.get("size")
  def sizes(self, paths):
      """Return ``size(p)`` for every path in ``paths``, in order, as a list"""
      return list(map(self.size, paths))
  def isdir(self, path):
      """True if ``info`` reports type "directory"; False on OSError"""
      try:
          details = self.info(path)
      except OSError:
          # Covers FileNotFoundError and other OS-level failures.
          return False
      return details["type"] == "directory"
  def isfile(self, path):
      """Is this entry file-like?

      Returns True if ``info`` reports type "file". Any ordinary exception
      raised while looking the entry up (missing path, malformed info record)
      is reported as False.
      """
      try:
          return self.info(path)["type"] == "file"
      except Exception:
          # Broad on purpose, but BaseException (KeyboardInterrupt, SystemExit,
          # task cancellation) must still reach the caller.
          return False
  def read_text(self, path, encoding=None, errors=None, newline=None, **kwargs):
      """Read a whole file and return it as ``str``.

      Parameters
      ----------
      path: str
          URL of file on this filesystems
      encoding, errors, newline: same as `open`.
      kwargs: passed to ``open()``.
      """
      handle = self.open(
          path,
          mode="r",
          encoding=encoding,
          errors=errors,
          newline=newline,
          **kwargs,
      )
      with handle as stream:
          return stream.read()
  def write_text(
      self, path, value, encoding=None, errors=None, newline=None, **kwargs
  ):
      """Write ``value`` as text to ``path``, replacing any existing file.

      Parameters
      ----------
      path: str
          URL of file on this filesystems
      value: str
          Text to write.
      encoding, errors, newline: same as `open`.
      kwargs: passed to ``open()``.

      Returns
      -------
      Whatever the file object's ``write`` returns.
      """
      handle = self.open(
          path,
          mode="w",
          encoding=encoding,
          errors=errors,
          newline=newline,
          **kwargs,
      )
      with handle as stream:
          return stream.write(value)
  def cat_file(self, path, start=None, end=None, **kwargs):
      """Get the content of a file

      Parameters
      ----------
      path: URL of file on this filesystems
      start, end: int
          Bytes limits of the read. If negative, backwards from end,
          like usual python slices. Either can be None for start or
          end of file, respectively
      kwargs: passed to ``open()``.

      Returns
      -------
      bytes: the requested range; empty if ``end`` lies at or before the
      effective start, as with an empty slice.
      """
      # explicitly set buffering off?
      with self.open(path, "rb", **kwargs) as f:
          if start is not None:
              if start >= 0:
                  f.seek(start)
              else:
                  # negative start counts from the end, clamped at byte 0
                  f.seek(max(0, f.size + start))
          if end is not None:
              if end < 0:
                  end = f.size + end
              # A negative count means "read to EOF" for file objects, so an
              # ``end`` before the current position must be clamped to zero
              # bytes rather than passed through.
              return f.read(max(0, end - f.tell()))
          return f.read()
  def pipe_file(self, path, value, mode="overwrite", **kwargs):
      """Write ``value`` (bytes) as the full content of ``path``.

      With ``mode="create"`` a FileExistsError is raised if the path already
      exists. ``kwargs`` are passed to ``open()``.
      """
      if mode == "create":
          # Check-then-write is not atomic, but it is simple; opening with
          # "xb" would be the alternative and is less widely supported.
          if self.exists(path):
              raise FileExistsError
      with self.open(path, "wb", **kwargs) as sink:
          sink.write(value)
  def pipe(self, path, value=None, **kwargs):
      """Store bytes at one or several paths (the inverse of ``cat``).

      Parameters
      ----------
      path: string or dict(str, bytes)
          If a string, a single remote location to put ``value`` bytes; if a dict,
          a mapping of {path: bytesvalue}.
      value: bytes, optional
          If using a single path, these are the bytes to put there. Ignored if
          ``path`` is a dict
      kwargs: passed to ``pipe_file``.

      Raises
      ------
      ValueError
          If ``path`` is neither a str nor a dict.
      """
      if isinstance(path, str):
          jobs = [(path, value)]
      elif isinstance(path, dict):
          jobs = path.items()
      else:
          raise ValueError("path must be str or dict")
      for target, payload in jobs:
          self.pipe_file(self._strip_protocol(target), payload, **kwargs)
  def cat_ranges(
      self, paths, starts, ends, max_gap=None, on_error="return", **kwargs
  ):
      """Read one byte range from each of several files.

      Parameters
      ----------
      paths: list
          A list of of filepaths on this filesystems
      starts, ends: int or list
          Bytes limits of the read. If using a single int, the same value will be
          used to read all the specified files.
      max_gap: must be None; any other value raises NotImplementedError.
      on_error: "return" puts the exception instance in the output in place
          of the bytes; any other value re-raises.
      kwargs: accepted but not used by this implementation.

      Returns
      -------
      list with one item (bytes or exception) per path, in order.

      Raises
      ------
      TypeError if ``paths`` is not a list; ValueError if the lengths of
      ``starts``/``ends`` do not match ``paths``.
      """
      if max_gap is not None:
          raise NotImplementedError
      if not isinstance(paths, list):
          raise TypeError
      count = len(paths)
      # Broadcast scalar limits over every path.
      if not isinstance(starts, list):
          starts = [starts] * count
      if not isinstance(ends, list):
          ends = [ends] * count
      if not (len(starts) == count and len(ends) == count):
          raise ValueError
      results = []
      for target, begin, stop in zip(paths, starts, ends):
          try:
              chunk = self.cat_file(target, begin, stop)
          except Exception as err:
              if on_error != "return":
                  raise
              results.append(err)
          else:
              results.append(chunk)
      return results
  def cat(self, path, recursive=False, on_error="raise", **kwargs):
      """Read the bytes of one or many paths.

      Parameters
      ----------
      recursive: bool
          If True, assume the path(s) are directories, and get all the
          contained files
      on_error : "raise", "omit", "return"
          If raise, an underlying exception will be raised (converted to KeyError
          if the type is in self.missing_exceptions); if omit, keys with exception
          will simply not be included in the output; if "return", all keys are
          included in the output, but the value will be bytes or an exception
          instance.
      kwargs: passed to ``expand_path`` and to ``cat_file``

      Returns
      -------
      dict of {path: contents} if there are multiple paths
      or the path has been otherwise expanded; otherwise the bytes of the
      single file.
      """
      expanded = self.expand_path(path, recursive=recursive, **kwargs)
      # A plain bytes result is only given when a single, non-list path
      # expanded to exactly itself.
      is_single = (
          len(expanded) <= 1
          and not isinstance(path, list)
          and expanded[0] == self._strip_protocol(path)
      )
      if is_single:
          return self.cat_file(expanded[0], **kwargs)

      contents = {}
      for item in expanded:
          try:
              contents[item] = self.cat_file(item, **kwargs)
          except Exception as err:
              if on_error == "raise":
                  raise
              if on_error == "return":
                  contents[item] = err
              # any other value ("omit"): leave the key out
      return contents
  def get_file(self, rpath, lpath, callback=DEFAULT_CALLBACK, outfile=None, **kwargs):
      """Download one remote file to a local path or file-like object.

      If ``lpath`` is file-like it is written to directly. If ``rpath`` is a
      directory, the local directory is created and nothing is copied.
      Data is streamed in chunks of ``self.blocksize`` and ``callback`` is
      updated after each chunk. ``kwargs`` are passed to ``open()``.
      """
      from .implementations.local import LocalFileSystem

      if isfilelike(lpath):
          outfile = lpath
      elif self.isdir(rpath):
          os.makedirs(lpath, exist_ok=True)
          return None

      local_fs = LocalFileSystem(auto_mkdir=True)
      local_fs.makedirs(local_fs._parent(lpath), exist_ok=True)

      with self.open(rpath, "rb", **kwargs) as source:
          if outfile is None:
              outfile = open(lpath, "wb")

          try:
              callback.set_size(getattr(source, "size", None))
              while True:
                  chunk = source.read(self.blocksize)
                  written = outfile.write(chunk)
                  if written is None:
                      # some writers do not report a count
                      written = len(chunk)
                  callback.relative_update(written)
                  if not chunk:
                      # the final, empty chunk has been written and reported
                      break
          finally:
              if not isfilelike(lpath):
                  outfile.close()
  def get(
      self,
      rpath,
      lpath,
      recursive=False,
      callback=DEFAULT_CALLBACK,
      maxdepth=None,
      **kwargs,
  ):
      """Download file(s) from this filesystem to local storage.

      Copies a specific file or tree of files (if recursive=True). If lpath
      ends with a "/", it will be assumed to be a directory, and target files
      will go within. Can submit a list of paths, which may be glob-patterns
      and will be expanded.

      When both ``rpath`` and ``lpath`` are lists they are paired up as given,
      with no expansion.

      Calls get_file for each source.
      """
      if isinstance(lpath, list) and isinstance(rpath, list):
          # Explicit source/destination pairs: nothing to expand.
          rpaths, lpaths = rpath, lpath
      else:
          from .implementations.local import (
              LocalFileSystem,
              make_path_posix,
              trailing_sep,
          )

          source_is_str = isinstance(rpath, str)
          rpaths = self.expand_path(
              rpath, recursive=recursive, maxdepth=maxdepth, **kwargs
          )
          if source_is_str and (not recursive or maxdepth is not None):
              # Non-recursive glob does not copy directories
              rpaths = [
                  p for p in rpaths if not trailing_sep(p) and not self.isdir(p)
              ]
              if not rpaths:
                  return

          if isinstance(lpath, str):
              lpath = make_path_posix(lpath)

          source_is_file = len(rpaths) == 1
          dest_is_dir = isinstance(lpath, str) and (
              trailing_sep(lpath) or LocalFileSystem().isdir(lpath)
          )

          if not source_is_str:
              exists = False
          elif has_magic(rpath):
              exists = source_is_file
          else:
              exists = dest_is_dir and not trailing_sep(rpath)

          lpaths = other_paths(
              rpaths,
              lpath,
              exists=exists,
              flatten=not source_is_str,
          )

      callback.set_size(len(lpaths))
      for local_target, remote_source in callback.wrap(zip(lpaths, rpaths)):
          with callback.branched(remote_source, local_target) as child:
              self.get_file(remote_source, local_target, callback=child, **kwargs)
  def put_file(
      self, lpath, rpath, callback=DEFAULT_CALLBACK, mode="overwrite", **kwargs
  ):
      """Upload one local file to ``rpath``.

      With ``mode="create"`` a FileExistsError is raised if ``rpath`` exists.
      If ``lpath`` is a local directory, the remote directory is created and
      nothing is copied. Data is streamed in chunks of ``self.blocksize``;
      ``kwargs`` are passed to ``open()``.
      """
      if mode == "create" and self.exists(rpath):
          raise FileExistsError
      if os.path.isdir(lpath):
          self.makedirs(rpath, exist_ok=True)
          return None

      with open(lpath, "rb") as source:
          # seek to the end to learn the total size, then rewind
          total = source.seek(0, 2)
          callback.set_size(total)
          source.seek(0)

          remote_parent = self._parent(os.fspath(rpath))
          self.mkdirs(remote_parent, exist_ok=True)
          with self.open(rpath, "wb", **kwargs) as sink:
              while source.tell() < total:
                  chunk = source.read(self.blocksize)
                  written = sink.write(chunk)
                  if written is None:
                      # some writers do not report a count
                      written = len(chunk)
                  callback.relative_update(written)
  def put(
      self,
      lpath,
      rpath,
      recursive=False,
      callback=DEFAULT_CALLBACK,
      maxdepth=None,
      **kwargs,
  ):
      """Upload file(s) from local storage to this filesystem.

      Copies a specific file or tree of files (if recursive=True). If rpath
      ends with a "/", it will be assumed to be a directory, and target files
      will go within.

      When both ``lpath`` and ``rpath`` are lists they are paired up as given,
      with no expansion.

      Calls put_file for each source.
      """
      if isinstance(lpath, list) and isinstance(rpath, list):
          # Explicit source/destination pairs: nothing to expand.
          rpaths, lpaths = rpath, lpath
      else:
          from .implementations.local import (
              LocalFileSystem,
              make_path_posix,
              trailing_sep,
          )

          source_is_str = isinstance(lpath, str)
          if source_is_str:
              lpath = make_path_posix(lpath)
          local_fs = LocalFileSystem()
          lpaths = local_fs.expand_path(
              lpath, recursive=recursive, maxdepth=maxdepth, **kwargs
          )
          if source_is_str and (not recursive or maxdepth is not None):
              # Non-recursive glob does not copy directories
              lpaths = [
                  p
                  for p in lpaths
                  if not trailing_sep(p) and not local_fs.isdir(p)
              ]
              if not lpaths:
                  return

          source_is_file = len(lpaths) == 1
          # evaluated on the caller's rpath, before the protocol is stripped
          dest_is_dir = isinstance(rpath, str) and (
              trailing_sep(rpath) or self.isdir(rpath)
          )

          if isinstance(rpath, str):
              rpath = self._strip_protocol(rpath)
          else:
              rpath = [self._strip_protocol(p) for p in rpath]

          if not source_is_str:
              exists = False
          elif has_magic(lpath):
              exists = source_is_file
          else:
              exists = dest_is_dir and not trailing_sep(lpath)

          rpaths = other_paths(
              lpaths,
              rpath,
              exists=exists,
              flatten=not source_is_str,
          )

      callback.set_size(len(rpaths))
      for local_source, remote_target in callback.wrap(zip(lpaths, rpaths)):
          with callback.branched(local_source, remote_target) as child:
              self.put_file(local_source, remote_target, callback=child, **kwargs)
  def head(self, path, size=1024):
      """Return up to ``size`` bytes from the beginning of the file"""
      with self.open(path, "rb") as stream:
          leading = stream.read(size)
      return leading
  def tail(self, path, size=1024):
      """Return up to ``size`` bytes from the end of the file"""
      with self.open(path, "rb") as stream:
          # never seek back further than the file is long
          back = min(size, stream.size)
          stream.seek(-back, 2)
          return stream.read()
  def cp_file(self, path1, path2, **kwargs):
      """Copy a single file within the filesystem.

      Not implemented here; ``copy`` calls this once per source/target pair.
      """
      raise NotImplementedError
  def copy(
      self, path1, path2, recursive=False, maxdepth=None, on_error=None, **kwargs
  ):
      """Copy file(s) from one location to another on this filesystem.

      When both ``path1`` and ``path2`` are lists they are paired up as
      given, with no expansion. ``kwargs`` are passed to ``expand_path`` and
      ``cp_file``.

      on_error : "raise", "ignore"
          If raise, any not-found exceptions will be raised; if ignore any
          not-found exceptions will cause the path to be skipped; defaults to
          raise unless recursive is true, where the default is ignore
      """
      if on_error is None:
          on_error = "ignore" if recursive else "raise"

      if isinstance(path1, list) and isinstance(path2, list):
          # Explicit source/destination pairs: nothing to expand.
          paths1, paths2 = path1, path2
      else:
          from .implementations.local import trailing_sep

          source_is_str = isinstance(path1, str)
          paths1 = self.expand_path(
              path1, recursive=recursive, maxdepth=maxdepth, **kwargs
          )
          if source_is_str and (not recursive or maxdepth is not None):
              # Non-recursive glob does not copy directories
              paths1 = [
                  p for p in paths1 if not trailing_sep(p) and not self.isdir(p)
              ]
              if not paths1:
                  return

          source_is_file = len(paths1) == 1
          dest_is_dir = isinstance(path2, str) and (
              trailing_sep(path2) or self.isdir(path2)
          )

          if not source_is_str:
              exists = False
          elif has_magic(path1):
              exists = source_is_file
          else:
              exists = dest_is_dir and not trailing_sep(path1)

          paths2 = other_paths(
              paths1,
              path2,
              exists=exists,
              flatten=not source_is_str,
          )

      for source, target in zip(paths1, paths2):
          try:
              self.cp_file(source, target, **kwargs)
          except FileNotFoundError:
              if on_error == "raise":
                  raise
  def expand_path(self, path, recursive=False, maxdepth=None, **kwargs):
      """Resolve globs and (optionally) directory trees into concrete paths.

      Accepts one path or an iterable of paths and returns the sorted list of
      every file or directory they match.

      kwargs are passed to ``glob`` or ``find``, which may in turn call ``ls``

      Raises
      ------
      ValueError if ``maxdepth`` is given and smaller than 1;
      FileNotFoundError if nothing matches.
      """
      if maxdepth is not None and maxdepth < 1:
          raise ValueError("maxdepth must be at least 1")

      if isinstance(path, (str, os.PathLike)):
          found = self.expand_path([path], recursive, maxdepth, **kwargs)
      else:
          found = set()
          path = [self._strip_protocol(item) for item in path]
          # depth budget left after a glob has consumed one level
          remaining_depth = None if maxdepth is None else maxdepth - 1
          for item in path:
              if has_magic(item):
                  globbed = set(self.glob(item, maxdepth=maxdepth, **kwargs))
                  found.update(globbed)
                  # Descend below the glob hits only while depth remains.
                  if recursive and (maxdepth is None or maxdepth > 1):
                      found.update(
                          self.expand_path(
                              list(globbed),
                              recursive=recursive,
                              maxdepth=remaining_depth,
                              **kwargs,
                          )
                      )
                  continue
              if recursive:
                  found.update(
                      self.find(
                          item, maxdepth=maxdepth, withdirs=True, detail=False, **kwargs
                      )
                  )
              if item not in found and (recursive is False or self.exists(item)):
                  # should only check once, for the root
                  found.add(item)
      if not found:
          raise FileNotFoundError(path)
      return sorted(found)
  def mv(self, path1, path2, recursive=False, maxdepth=None, **kwargs):
      """Move file(s) from one location to another

      Implemented as ``copy`` followed by ``rm`` of the source. Does nothing
      (beyond a debug log) when both paths are equal. ``kwargs`` are accepted
      for interface compatibility and not used here.
      """
      if path1 == path2:
          logger.debug("%s mv: The paths are the same, so no files were moved.", self)
      else:
          # explicitly raise exception to prevent data corruption: the source
          # is deleted afterwards, so a failed copy must never be ignored.
          # (``copy`` names this argument ``on_error``.)
          self.copy(
              path1, path2, recursive=recursive, maxdepth=maxdepth, on_error="raise"
          )
          self.rm(path1, recursive=recursive)
  def rm_file(self, path):
      """Remove a single file; the default delegates to the legacy ``_rm``"""
      self._rm(path)
  1068. def _rm(self, path):
  1069. """Delete one file"""
  1070. # this is the old name for the method, prefer rm_file
  1071. raise NotImplementedError
  def rm(self, path, recursive=False, maxdepth=None):
      """Remove one or more files, optionally whole directory trees.

      Parameters
      ----------
      path: str or list of str
          File(s) to delete.
      recursive: bool
          If file(s) are directories, recursively delete contents and then
          also remove the directory
      maxdepth: int or None
          Depth to pass to walk for finding files to delete, if recursive.
          If None, there will be no limit and infinite recursion may be
          possible.
      """
      targets = self.expand_path(path, recursive=recursive, maxdepth=maxdepth)
      # Walk the expanded list backwards so the last entries go first.
      for target in targets[::-1]:
          self.rm_file(target)
  1089. @classmethod
  1090. def _parent(cls, path):
  1091. path = cls._strip_protocol(path)
  1092. if "/" in path:
  1093. parent = path.rsplit("/", 1)[0].lstrip(cls.root_marker)
  1094. return cls.root_marker + parent
  1095. else:
  1096. return cls.root_marker
  def _open(
      self,
      path,
      mode="rb",
      block_size=None,
      autocommit=True,
      cache_options=None,
      **kwargs,
  ):
      """Build the low-level binary file object for ``path``.

      The default wraps the path in an ``AbstractBufferedFile``; extra
      ``kwargs`` are passed to its constructor.
      """
      raw_file = AbstractBufferedFile(
          self,
          path,
          mode,
          block_size,
          autocommit,
          cache_options=cache_options,
          **kwargs,
      )
      return raw_file
  def open(
      self,
      path,
      mode="rb",
      block_size=None,
      cache_options=None,
      compression=None,
      **kwargs,
  ):
      """
      Open ``path`` and return a file-like object.

      The returned object must be usable as a context manager (``with``
      block).

      Parameters
      ----------
      path: str
          Target file
      mode: str like 'rb', 'w'
          See builtin ``open()``
          Mode "x" (exclusive write) may be implemented by the backend. Even if
          it is, whether it is checked up front or on commit, and whether it is
          atomic is implementation-dependent.
      block_size: int
          Some indication of buffering - this is a value in bytes
      cache_options : dict, optional
          Extra arguments to pass through to the cache.
      compression: string or None
          If given, open file using compression codec. Can either be a compression
          name (a key in ``fsspec.compression.compr``) or "infer" to guess the
          compression from the filename suffix.
      encoding, errors, newline: passed on to TextIOWrapper for text mode
      """
      import io

      path = self._strip_protocol(path)

      if "b" not in mode:
          # Text mode: open the binary equivalent and wrap it for decoding.
          binary_mode = mode.replace("t", "") + "b"
          text_options = {}
          for key in ["encoding", "errors", "newline"]:
              if key in kwargs:
                  text_options[key] = kwargs.pop(key)
          binary_file = self.open(
              path,
              binary_mode,
              block_size=block_size,
              cache_options=cache_options,
              compression=compression,
              **kwargs,
          )
          return io.TextIOWrapper(binary_file, **text_options)

      # Binary mode. Files commit immediately unless a transaction is active
      # or the caller overrides ``autocommit``.
      autocommit = kwargs.pop("autocommit", not self._intrans)
      handle = self._open(
          path,
          mode=mode,
          block_size=block_size,
          autocommit=autocommit,
          cache_options=cache_options,
          **kwargs,
      )
      if compression is not None:
          from fsspec.compression import compr
          from fsspec.core import get_compression

          codec_name = get_compression(path, compression)
          codec = compr[codec_name]
          handle = codec(handle, mode=mode[0])

      if not autocommit and "r" not in mode:
          # deferred writes are committed or discarded with the transaction
          self.transaction.files.append(handle)
      return handle
  def touch(self, path, truncate=True, **kwargs):
      """Create an empty file at ``path``, or refresh its timestamp.

      Parameters
      ----------
      path: str
          file location
      truncate: bool
          If True, always set file size to 0; if False, update timestamp and
          leave file unchanged, if backend allows this
      kwargs: passed to ``open()``.

      Raises
      ------
      NotImplementedError
          If ``truncate`` is False and the file already exists, because the
          default implementation cannot update timestamps.
      """
      if not truncate and self.exists(path):
          raise NotImplementedError  # update timestamp, if possible
      # opening for write and closing immediately leaves an empty file
      with self.open(path, "wb", **kwargs):
          pass
  def ukey(self, path):
      """SHA-256 hex digest of ``str(info(path))``, to detect changes"""
      described = str(self.info(path))
      return sha256(described.encode()).hexdigest()
  def read_block(self, fn, offset, length, delimiter=None):
      """Read a block of bytes from a file

      Starting at ``offset`` of the file, read ``length`` bytes. If
      ``delimiter`` is set then we ensure that the read starts and stops at
      delimiter boundaries that follow the locations ``offset`` and ``offset
      + length``. If ``offset`` is zero then we start at zero. The
      bytestring returned WILL include the end delimiter string.

      If offset+length is beyond the eof, reads to eof.

      Parameters
      ----------
      fn: string
          Path to filename
      offset: int
          Byte offset to start read
      length: int
          Number of bytes to read. If None, read to end.
      delimiter: bytes (optional)
          Ensure reading starts and stops at delimiter bytestring

      Examples
      --------
      >>> fs.read_block('data/file.csv', 0, 13) # doctest: +SKIP
      b'Alice, 100\\nBo'
      >>> fs.read_block('data/file.csv', 0, 13, delimiter=b'\\n') # doctest: +SKIP
      b'Alice, 100\\nBob, 200\\n'

      Use ``length=None`` to read to the end of the file.
      >>> fs.read_block('data/file.csv', 0, None, delimiter=b'\\n') # doctest: +SKIP
      b'Alice, 100\\nBob, 200\\nCharlie, 300'

      See Also
      --------
      :func:`fsspec.utils.read_block`
      """
      with self.open(fn, "rb") as stream:
          total = stream.size
          if length is None:
              length = total
          # Trim a request that would run past the end of a file of known size.
          if total is not None and offset + length > total:
              length = total - offset
          return read_block(stream, offset, length, delimiter)
  def to_json(self, *, include_password: bool = True) -> str:
      """
      Serialize this filesystem instance to a JSON string.

      Parameters
      ----------
      include_password: bool, default True
          Whether to include the password (if any) in the output.

      Returns
      -------
      JSON string with keys ``cls`` (the python location of this class),
      protocol (text name of this class's protocol, first one in case of
      multiple), ``args`` (positional args, usually empty), and all other
      keyword arguments as their own keys.

      Warnings
      --------
      Serialized filesystems may contain sensitive information which have been
      passed to the constructor, such as passwords and tokens. Make sure you
      store and send them in a secure environment!
      """
      from .json import FilesystemJSONEncoder

      # json.dumps takes an encoder *class*, so the flag is carried on a
      # one-off subclass.
      encoder_cls = type(
          "_FilesystemJSONEncoder",
          (FilesystemJSONEncoder,),
          {"include_password": include_password},
      )
      return json.dumps(self, cls=encoder_cls)
  @staticmethod
  def from_json(blob: str) -> AbstractFileSystem:
      """
      Rebuild a filesystem instance from its JSON string.

      See ``.to_json()`` for the expected structure of the input.

      Parameters
      ----------
      blob: str

      Returns
      -------
      file system instance, not necessarily of this particular class.

      Warnings
      --------
      This can import arbitrary modules (as determined by the ``cls`` key).
      Make sure you haven't installed any modules that may execute malicious code
      at import time.
      """
      from .json import FilesystemJSONDecoder

      instance = json.loads(blob, cls=FilesystemJSONDecoder)
      return instance
  def to_dict(self, *, include_password: bool = True) -> dict[str, Any]:
      """
      Describe this filesystem instance as a JSON-serializable dictionary.

      Parameters
      ----------
      include_password: bool, default True
          Whether to include the password (if any) in the output.

      Returns
      -------
      Dictionary with keys ``cls`` (the python location of this class),
      protocol (text name of this class's protocol, first one in case of
      multiple), ``args`` (positional args, usually empty), and all other
      keyword arguments as their own keys.

      Warnings
      --------
      Serialized filesystems may contain sensitive information which have been
      passed to the constructor, such as passwords and tokens. Make sure you
      store and send them in a secure environment!
      """
      from .json import FilesystemJSONEncoder

      encoder = FilesystemJSONEncoder()
      fs_type = type(self)

      # work on a copy so the instance's own options are left untouched
      options = dict(self.storage_options)
      if not include_password:
          options.pop("password", None)

      protocol = self.protocol
      if isinstance(protocol, (tuple, list)):
          protocol = protocol[0]

      return dict(
          cls=f"{fs_type.__module__}:{fs_type.__name__}",
          protocol=protocol,
          args=encoder.make_serializable(self.storage_args),
          **encoder.make_serializable(options),
      )
  @staticmethod
  def from_dict(dct: dict[str, Any]) -> AbstractFileSystem:
      """
      Rebuild a filesystem instance from its dictionary form.

      See ``.to_dict()`` for the expected structure of the input.

      Parameters
      ----------
      dct: Dict[str, Any]

      Returns
      -------
      file system instance, not necessarily of this particular class.

      Raises
      ------
      ValueError
          If no filesystem class can be resolved from ``dct``.

      Warnings
      --------
      This can import arbitrary modules (as determined by the ``cls`` key).
      Make sure you haven't installed any modules that may execute malicious code
      at import time.
      """
      from .json import FilesystemJSONDecoder

      decoder = FilesystemJSONDecoder()

      payload = dict(dct)  # Defensive copy
      fs_cls = FilesystemJSONDecoder.try_resolve_fs_cls(payload)
      if fs_cls is None:
          raise ValueError("Not a serialized AbstractFileSystem")

      # bookkeeping keys are not constructor arguments
      for key in ("cls", "protocol"):
          payload.pop(key, None)

      positional = decoder.unmake_serializable(payload.pop("args", ()))
      keyword = decoder.unmake_serializable(payload)
      return fs_cls(*positional, **keyword)
  1351. def _get_pyarrow_filesystem(self):
  1352. """
  1353. Make a version of the FS instance which will be acceptable to pyarrow
  1354. """
  1355. # all instances already also derive from pyarrow
  1356. return self
  def get_mapper(self, root="", check=False, create=False, missing_exceptions=None):
      """Return a key/value view of this filesystem rooted at ``root``.

      All arguments are passed straight to ``fsspec.mapping.FSMap``, which
      provides the MutableMapping interface; see that class for details.
      """
      from .mapping import FSMap

      mapper = FSMap(
          root,
          self,
          check=check,
          create=create,
          missing_exceptions=missing_exceptions,
      )
      return mapper
  @classmethod
  def clear_instance_cache(cls):
      """
      Empty this class's cache of filesystem instances.

      Notes
      -----
      Unless overridden by setting the ``cachable`` class attribute to False,
      the filesystem class stores a reference to newly created instances. This
      prevents Python's normal rules around garbage collection from working,
      since the instances refcount will not drop to zero until
      ``clear_instance_cache`` is called.
      """
      instance_cache = cls._cache
      instance_cache.clear()
  def created(self, path):
      """Creation time of the file at ``path`` as a datetime.datetime

      Not implemented here.
      """
      raise NotImplementedError
  def modified(self, path):
      """Last-modified time of the file at ``path`` as a datetime.datetime

      Not implemented here.
      """
      raise NotImplementedError
  def tree(
      self,
      path: str = "/",
      recursion_limit: int = 2,
      max_display: int = 25,
      display_size: bool = False,
      prefix: str = "",
      is_last: bool = True,
      first: bool = True,
      indent_size: int = 4,
  ) -> str:
      """
      Return a tree-like structure of the filesystem starting from the given path as a string.

      Parameters
      ----------
          path: Root path to start traversal from
          recursion_limit: Maximum depth of directory traversal
          max_display: Maximum number of items to display per directory
          display_size: Whether to display file sizes (files whose size is
              reported as None are shown as "unknown size")
          prefix: Current line prefix for visual tree structure
          is_last: Whether current item is last in its level (not used by
              the rendering; kept for interface compatibility)
          first: Whether this is the first call (displays root path)
          indent_size: Number of spaces by indent

      Returns
      -------
          str: A string representing the tree structure.

      Example
      -------
          >>> from fsspec import filesystem

          >>> fs = filesystem('ftp', host='test.rebex.net', user='demo', password='password')
          >>> tree = fs.tree(display_size=True, recursion_limit=3, indent_size=8, max_display=10)
          >>> print(tree)
      """

      def format_bytes(n: int) -> str:
          """Format bytes as text."""
          for unit, k in (
              ("P", 2**50),
              ("T", 2**40),
              ("G", 2**30),
              ("M", 2**20),
              ("k", 2**10),
          ):
              if n >= 0.9 * k:
                  return f"{n / k:.2f} {unit}b"
          return f"{n}B"

      result = []

      if first:
          result.append(path)

      if recursion_limit:
          indent = " " * indent_size
          contents = self.ls(path, detail=True)
          # directories first, then alphabetical by name
          contents.sort(
              key=lambda x: (x.get("type") != "directory", x.get("name", ""))
          )

          if max_display is not None and len(contents) > max_display:
              displayed_contents = contents[:max_display]
              remaining_count = len(contents) - max_display
          else:
              displayed_contents = contents
              remaining_count = 0

          for i, item in enumerate(displayed_contents):
              # the "N more items" line, if any, takes the last-branch glyph
              is_last_item = (i == len(displayed_contents) - 1) and (
                  remaining_count == 0
              )

              branch = (
                  "└" + ("─" * (indent_size - 2))
                  if is_last_item
                  else "├" + ("─" * (indent_size - 2))
              )
              branch += " "
              new_prefix = prefix + (
                  indent if is_last_item else "│" + " " * (indent_size - 1)
              )

              name = os.path.basename(item.get("name", ""))

              if display_size and item.get("type") == "directory":
                  sub_contents = self.ls(item.get("name", ""), detail=True)
                  num_files = sum(
                      1 for sub_item in sub_contents if sub_item.get("type") == "file"
                  )
                  num_folders = sum(
                      1
                      for sub_item in sub_contents
                      if sub_item.get("type") == "directory"
                  )

                  if num_files == 0 and num_folders == 0:
                      size = " (empty folder)"
                  elif num_files == 0:
                      size = f" ({num_folders} subfolder{'s' if num_folders > 1 else ''})"
                  elif num_folders == 0:
                      size = f" ({num_files} file{'s' if num_files > 1 else ''})"
                  else:
                      size = f" ({num_files} file{'s' if num_files > 1 else ''}, {num_folders} subfolder{'s' if num_folders > 1 else ''})"
              elif display_size and item.get("type") == "file":
                  file_size = item.get("size", 0)
                  if file_size is None:
                      # some backends cannot measure size and report None
                      size = " (unknown size)"
                  else:
                      size = f" ({format_bytes(file_size)})"
              else:
                  size = ""

              result.append(f"{prefix}{branch}{name}{size}")

              if item.get("type") == "directory" and recursion_limit > 0:
                  result.append(
                      self.tree(
                          path=item.get("name", ""),
                          recursion_limit=recursion_limit - 1,
                          max_display=max_display,
                          display_size=display_size,
                          prefix=new_prefix,
                          is_last=is_last_item,
                          first=False,
                          indent_size=indent_size,
                      )
                  )

          if remaining_count > 0:
              more_message = f"{remaining_count} more item(s) not displayed."
              result.append(
                  f"{prefix}{'└' + ('─' * (indent_size - 2))} {more_message}"
              )

      # empty strings come from sub-trees that produced no lines
      return "\n".join(_ for _ in result if _)
  1505. # ------------------------------------------------------------------------
  1506. # Aliases
  1507. def read_bytes(self, path, start=None, end=None, **kwargs):
  1508. """Alias of `AbstractFileSystem.cat_file`."""
  1509. return self.cat_file(path, start=start, end=end, **kwargs)
  1510. def write_bytes(self, path, value, **kwargs):
  1511. """Alias of `AbstractFileSystem.pipe_file`."""
  1512. self.pipe_file(path, value, **kwargs)
  1513. def makedir(self, path, create_parents=True, **kwargs):
  1514. """Alias of `AbstractFileSystem.mkdir`."""
  1515. return self.mkdir(path, create_parents=create_parents, **kwargs)
  1516. def mkdirs(self, path, exist_ok=False):
  1517. """Alias of `AbstractFileSystem.makedirs`."""
  1518. return self.makedirs(path, exist_ok=exist_ok)
  1519. def listdir(self, path, detail=True, **kwargs):
  1520. """Alias of `AbstractFileSystem.ls`."""
  1521. return self.ls(path, detail=detail, **kwargs)
  1522. def cp(self, path1, path2, **kwargs):
  1523. """Alias of `AbstractFileSystem.copy`."""
  1524. return self.copy(path1, path2, **kwargs)
  1525. def move(self, path1, path2, **kwargs):
  1526. """Alias of `AbstractFileSystem.mv`."""
  1527. return self.mv(path1, path2, **kwargs)
  1528. def stat(self, path, **kwargs):
  1529. """Alias of `AbstractFileSystem.info`."""
  1530. return self.info(path, **kwargs)
  1531. def disk_usage(self, path, total=True, maxdepth=None, **kwargs):
  1532. """Alias of `AbstractFileSystem.du`."""
  1533. return self.du(path, total=total, maxdepth=maxdepth, **kwargs)
  1534. def rename(self, path1, path2, **kwargs):
  1535. """Alias of `AbstractFileSystem.mv`."""
  1536. return self.mv(path1, path2, **kwargs)
  1537. def delete(self, path, recursive=False, maxdepth=None):
  1538. """Alias of `AbstractFileSystem.rm`."""
  1539. return self.rm(path, recursive=recursive, maxdepth=maxdepth)
  1540. def upload(self, lpath, rpath, recursive=False, **kwargs):
  1541. """Alias of `AbstractFileSystem.put`."""
  1542. return self.put(lpath, rpath, recursive=recursive, **kwargs)
  1543. def download(self, rpath, lpath, recursive=False, **kwargs):
  1544. """Alias of `AbstractFileSystem.get`."""
  1545. return self.get(rpath, lpath, recursive=recursive, **kwargs)
  1546. def sign(self, path, expiration=100, **kwargs):
  1547. """Create a signed URL representing the given path
  1548. Some implementations allow temporary URLs to be generated, as a
  1549. way of delegating credentials.
  1550. Parameters
  1551. ----------
  1552. path : str
  1553. The path on the filesystem
  1554. expiration : int
  1555. Number of seconds to enable the URL for (if supported)
  1556. Returns
  1557. -------
  1558. URL : str
  1559. The signed URL
  1560. Raises
  1561. ------
  1562. NotImplementedError : if method is not implemented for a filesystem
  1563. """
  1564. raise NotImplementedError("Sign is not implemented for this filesystem")
  1565. def _isfilestore(self):
  1566. # Originally inherited from pyarrow DaskFileSystem. Keeping this
  1567. # here for backwards compatibility as long as pyarrow uses its
  1568. # legacy fsspec-compatible filesystems and thus accepts fsspec
  1569. # filesystems as well
  1570. return False
  1571. class AbstractBufferedFile(io.IOBase):
  1572. """Convenient class to derive from to provide buffering
  1573. In the case that the backend does not provide a pythonic file-like object
  1574. already, this class contains much of the logic to build one. The only
  1575. methods that need to be overridden are ``_upload_chunk``,
  1576. ``_initiate_upload`` and ``_fetch_range``.
  1577. """
  1578. DEFAULT_BLOCK_SIZE = 5 * 2**20
  1579. _details = None
  1580. def __init__(
  1581. self,
  1582. fs,
  1583. path,
  1584. mode="rb",
  1585. block_size="default",
  1586. autocommit=True,
  1587. cache_type="readahead",
  1588. cache_options=None,
  1589. size=None,
  1590. **kwargs,
  1591. ):
  1592. """
  1593. Template for files with buffered reading and writing
  1594. Parameters
  1595. ----------
  1596. fs: instance of FileSystem
  1597. path: str
  1598. location in file-system
  1599. mode: str
  1600. Normal file modes. Currently only 'wb', 'ab' or 'rb'. Some file
  1601. systems may be read-only, and some may not support append.
  1602. block_size: int
  1603. Buffer size for reading or writing, 'default' for class default
  1604. autocommit: bool
  1605. Whether to write to final destination; may only impact what
  1606. happens when file is being closed.
  1607. cache_type: {"readahead", "none", "mmap", "bytes"}, default "readahead"
  1608. Caching policy in read mode. See the definitions in ``core``.
  1609. cache_options : dict
  1610. Additional options passed to the constructor for the cache specified
  1611. by `cache_type`.
  1612. size: int
  1613. If given and in read mode, suppressed having to look up the file size
  1614. kwargs:
  1615. Gets stored as self.kwargs
  1616. """
  1617. from .core import caches
  1618. self.path = path
  1619. self.fs = fs
  1620. self.mode = mode
  1621. self.blocksize = (
  1622. self.DEFAULT_BLOCK_SIZE if block_size in ["default", None] else block_size
  1623. )
  1624. self.loc = 0
  1625. self.autocommit = autocommit
  1626. self.end = None
  1627. self.start = None
  1628. self.closed = False
  1629. if cache_options is None:
  1630. cache_options = {}
  1631. if "trim" in kwargs:
  1632. warnings.warn(
  1633. "Passing 'trim' to control the cache behavior has been deprecated. "
  1634. "Specify it within the 'cache_options' argument instead.",
  1635. FutureWarning,
  1636. )
  1637. cache_options["trim"] = kwargs.pop("trim")
  1638. self.kwargs = kwargs
  1639. if mode not in {"ab", "rb", "wb", "xb"}:
  1640. raise NotImplementedError("File mode not supported")
  1641. if mode == "rb":
  1642. if size is not None:
  1643. self.size = size
  1644. else:
  1645. self.size = self.details["size"]
  1646. self.cache = caches[cache_type](
  1647. self.blocksize, self._fetch_range, self.size, **cache_options
  1648. )
  1649. else:
  1650. self.buffer = io.BytesIO()
  1651. self.offset = None
  1652. self.forced = False
  1653. self.location = None
  1654. @property
  1655. def details(self):
  1656. if self._details is None:
  1657. self._details = self.fs.info(self.path)
  1658. return self._details
  1659. @details.setter
  1660. def details(self, value):
  1661. self._details = value
  1662. self.size = value["size"]
  1663. @property
  1664. def full_name(self):
  1665. return _unstrip_protocol(self.path, self.fs)
  1666. @property
  1667. def closed(self):
  1668. # get around this attr being read-only in IOBase
  1669. # use getattr here, since this can be called during del
  1670. return getattr(self, "_closed", True)
  1671. @closed.setter
  1672. def closed(self, c):
  1673. self._closed = c
  1674. def __hash__(self):
  1675. if "w" in self.mode:
  1676. return id(self)
  1677. else:
  1678. return int(tokenize(self.details), 16)
  1679. def __eq__(self, other):
  1680. """Files are equal if they have the same checksum, only in read mode"""
  1681. if self is other:
  1682. return True
  1683. return (
  1684. isinstance(other, type(self))
  1685. and self.mode == "rb"
  1686. and other.mode == "rb"
  1687. and hash(self) == hash(other)
  1688. )
  1689. def commit(self):
  1690. """Move from temp to final destination"""
  1691. def discard(self):
  1692. """Throw away temporary file"""
  1693. def info(self):
  1694. """File information about this path"""
  1695. if self.readable():
  1696. return self.details
  1697. else:
  1698. raise ValueError("Info not available while writing")
  1699. def tell(self):
  1700. """Current file location"""
  1701. return self.loc
  1702. def seek(self, loc, whence=0):
  1703. """Set current file location
  1704. Parameters
  1705. ----------
  1706. loc: int
  1707. byte location
  1708. whence: {0, 1, 2}
  1709. from start of file, current location or end of file, resp.
  1710. """
  1711. loc = int(loc)
  1712. if not self.mode == "rb":
  1713. raise OSError(ESPIPE, "Seek only available in read mode")
  1714. if whence == 0:
  1715. nloc = loc
  1716. elif whence == 1:
  1717. nloc = self.loc + loc
  1718. elif whence == 2:
  1719. nloc = self.size + loc
  1720. else:
  1721. raise ValueError(f"invalid whence ({whence}, should be 0, 1 or 2)")
  1722. if nloc < 0:
  1723. raise ValueError("Seek before start of file")
  1724. self.loc = nloc
  1725. return self.loc
  1726. def write(self, data):
  1727. """
  1728. Write data to buffer.
  1729. Buffer only sent on flush() or if buffer is greater than
  1730. or equal to blocksize.
  1731. Parameters
  1732. ----------
  1733. data: bytes
  1734. Set of bytes to be written.
  1735. """
  1736. if not self.writable():
  1737. raise ValueError("File not in write mode")
  1738. if self.closed:
  1739. raise ValueError("I/O operation on closed file.")
  1740. if self.forced:
  1741. raise ValueError("This file has been force-flushed, can only close")
  1742. out = self.buffer.write(data)
  1743. self.loc += out
  1744. if self.buffer.tell() >= self.blocksize:
  1745. self.flush()
  1746. return out
  1747. def flush(self, force=False):
  1748. """
  1749. Write buffered data to backend store.
  1750. Writes the current buffer, if it is larger than the block-size, or if
  1751. the file is being closed.
  1752. Parameters
  1753. ----------
  1754. force: bool
  1755. When closing, write the last block even if it is smaller than
  1756. blocks are allowed to be. Disallows further writing to this file.
  1757. """
  1758. if self.closed:
  1759. raise ValueError("Flush on closed file")
  1760. if force and self.forced:
  1761. raise ValueError("Force flush cannot be called more than once")
  1762. if force:
  1763. self.forced = True
  1764. if self.readable():
  1765. # no-op to flush on read-mode
  1766. return
  1767. if not force and self.buffer.tell() < self.blocksize:
  1768. # Defer write on small block
  1769. return
  1770. if self.offset is None:
  1771. # Initialize a multipart upload
  1772. self.offset = 0
  1773. try:
  1774. self._initiate_upload()
  1775. except:
  1776. self.closed = True
  1777. raise
  1778. if self._upload_chunk(final=force) is not False:
  1779. self.offset += self.buffer.seek(0, 2)
  1780. self.buffer = io.BytesIO()
  1781. def _upload_chunk(self, final=False):
  1782. """Write one part of a multi-block file upload
  1783. Parameters
  1784. ==========
  1785. final: bool
  1786. This is the last block, so should complete file, if
  1787. self.autocommit is True.
  1788. """
  1789. # may not yet have been initialized, may need to call _initialize_upload
  1790. def _initiate_upload(self):
  1791. """Create remote file/upload"""
  1792. pass
  1793. def _fetch_range(self, start, end):
  1794. """Get the specified set of bytes from remote"""
  1795. return self.fs.cat_file(self.path, start=start, end=end)
  1796. def read(self, length=-1):
  1797. """
  1798. Return data from cache, or fetch pieces as necessary
  1799. Parameters
  1800. ----------
  1801. length: int (-1)
  1802. Number of bytes to read; if <0, all remaining bytes.
  1803. """
  1804. length = -1 if length is None else int(length)
  1805. if self.mode != "rb":
  1806. raise ValueError("File not in read mode")
  1807. if length < 0:
  1808. length = self.size - self.loc
  1809. if self.closed:
  1810. raise ValueError("I/O operation on closed file.")
  1811. if length == 0:
  1812. # don't even bother calling fetch
  1813. return b""
  1814. out = self.cache._fetch(self.loc, self.loc + length)
  1815. logger.debug(
  1816. "%s read: %i - %i %s",
  1817. self,
  1818. self.loc,
  1819. self.loc + length,
  1820. self.cache._log_stats(),
  1821. )
  1822. self.loc += len(out)
  1823. return out
  1824. def readinto(self, b):
  1825. """mirrors builtin file's readinto method
  1826. https://docs.python.org/3/library/io.html#io.RawIOBase.readinto
  1827. """
  1828. out = memoryview(b).cast("B")
  1829. data = self.read(out.nbytes)
  1830. out[: len(data)] = data
  1831. return len(data)
  1832. def readuntil(self, char=b"\n", blocks=None):
  1833. """Return data between current position and first occurrence of char
  1834. char is included in the output, except if the end of the tile is
  1835. encountered first.
  1836. Parameters
  1837. ----------
  1838. char: bytes
  1839. Thing to find
  1840. blocks: None or int
  1841. How much to read in each go. Defaults to file blocksize - which may
  1842. mean a new read on every call.
  1843. """
  1844. out = []
  1845. while True:
  1846. start = self.tell()
  1847. part = self.read(blocks or self.blocksize)
  1848. if len(part) == 0:
  1849. break
  1850. found = part.find(char)
  1851. if found > -1:
  1852. out.append(part[: found + len(char)])
  1853. self.seek(start + found + len(char))
  1854. break
  1855. out.append(part)
  1856. return b"".join(out)
  1857. def readline(self):
  1858. """Read until and including the first occurrence of newline character
  1859. Note that, because of character encoding, this is not necessarily a
  1860. true line ending.
  1861. """
  1862. return self.readuntil(b"\n")
  1863. def __next__(self):
  1864. out = self.readline()
  1865. if out:
  1866. return out
  1867. raise StopIteration
  1868. def __iter__(self):
  1869. return self
  1870. def readlines(self):
  1871. """Return all data, split by the newline character, including the newline character"""
  1872. data = self.read()
  1873. lines = data.split(b"\n")
  1874. out = [l + b"\n" for l in lines[:-1]]
  1875. if data.endswith(b"\n"):
  1876. return out
  1877. else:
  1878. return out + [lines[-1]]
  1879. # return list(self) ???
  1880. def readinto1(self, b):
  1881. return self.readinto(b)
  1882. def close(self):
  1883. """Close file
  1884. Finalizes writes, discards cache
  1885. """
  1886. if getattr(self, "_unclosable", False):
  1887. return
  1888. if self.closed:
  1889. return
  1890. try:
  1891. if self.mode == "rb":
  1892. self.cache = None
  1893. else:
  1894. if not self.forced:
  1895. self.flush(force=True)
  1896. if self.fs is not None:
  1897. self.fs.invalidate_cache(self.path)
  1898. self.fs.invalidate_cache(self.fs._parent(self.path))
  1899. finally:
  1900. self.closed = True
  1901. def readable(self):
  1902. """Whether opened for reading"""
  1903. return "r" in self.mode and not self.closed
  1904. def seekable(self):
  1905. """Whether is seekable (only in read mode)"""
  1906. return self.readable()
  1907. def writable(self):
  1908. """Whether opened for writing"""
  1909. return self.mode in {"wb", "ab", "xb"} and not self.closed
  1910. def __reduce__(self):
  1911. if self.mode != "rb":
  1912. raise RuntimeError("Pickling a writeable file is not supported")
  1913. return reopen, (
  1914. self.fs,
  1915. self.path,
  1916. self.mode,
  1917. self.blocksize,
  1918. self.loc,
  1919. self.size,
  1920. self.autocommit,
  1921. self.cache.name if self.cache else "none",
  1922. self.kwargs,
  1923. )
  1924. def __del__(self):
  1925. if not self.closed:
  1926. self.close()
  1927. def __str__(self):
  1928. return f"<File-like object {type(self.fs).__name__}, {self.path}>"
  1929. __repr__ = __str__
  1930. def __enter__(self):
  1931. return self
  1932. def __exit__(self, *args):
  1933. self.close()
  1934. def reopen(fs, path, mode, blocksize, loc, size, autocommit, cache_type, kwargs):
  1935. file = fs.open(
  1936. path,
  1937. mode=mode,
  1938. block_size=blocksize,
  1939. autocommit=autocommit,
  1940. cache_type=cache_type,
  1941. size=size,
  1942. **kwargs,
  1943. )
  1944. if loc > 0:
  1945. file.seek(loc)
  1946. return file