# zip.py
import os
import zipfile

import fsspec
from fsspec.archive import AbstractArchiveFileSystem


  5. class ZipFileSystem(AbstractArchiveFileSystem):
  6. """Read/Write contents of ZIP archive as a file-system
  7. Keeps file object open while instance lives.
  8. This class is pickleable, but not necessarily thread-safe
  9. """
  10. root_marker = ""
  11. protocol = "zip"
  12. cachable = False
  13. def __init__(
  14. self,
  15. fo="",
  16. mode="r",
  17. target_protocol=None,
  18. target_options=None,
  19. compression=zipfile.ZIP_STORED,
  20. allowZip64=True,
  21. compresslevel=None,
  22. **kwargs,
  23. ):
  24. """
  25. Parameters
  26. ----------
  27. fo: str or file-like
  28. Contains ZIP, and must exist. If a str, will fetch file using
  29. :meth:`~fsspec.open_files`, which must return one file exactly.
  30. mode: str
  31. Accept: "r", "w", "a"
  32. target_protocol: str (optional)
  33. If ``fo`` is a string, this value can be used to override the
  34. FS protocol inferred from a URL
  35. target_options: dict (optional)
  36. Kwargs passed when instantiating the target FS, if ``fo`` is
  37. a string.
  38. compression, allowZip64, compresslevel: passed to ZipFile
  39. Only relevant when creating a ZIP
  40. """
  41. super().__init__(self, **kwargs)
  42. if mode not in set("rwa"):
  43. raise ValueError(f"mode '{mode}' no understood")
  44. self.mode = mode
  45. if isinstance(fo, (str, os.PathLike)):
  46. if mode == "a":
  47. m = "r+b"
  48. else:
  49. m = mode + "b"
  50. fo = fsspec.open(
  51. fo, mode=m, protocol=target_protocol, **(target_options or {})
  52. )
  53. self.force_zip_64 = allowZip64
  54. self.of = fo
  55. self.fo = fo.__enter__() # the whole instance is a context
  56. self.zip = zipfile.ZipFile(
  57. self.fo,
  58. mode=mode,
  59. compression=compression,
  60. allowZip64=allowZip64,
  61. compresslevel=compresslevel,
  62. )
  63. self.dir_cache = None
  64. @classmethod
  65. def _strip_protocol(cls, path):
  66. # zip file paths are always relative to the archive root
  67. return super()._strip_protocol(path).lstrip("/")
  68. def __del__(self):
  69. if hasattr(self, "zip"):
  70. self.close()
  71. del self.zip
  72. def close(self):
  73. """Commits any write changes to the file. Done on ``del`` too."""
  74. self.zip.close()
  75. def _get_dirs(self):
  76. if self.dir_cache is None or self.mode in set("wa"):
  77. # when writing, dir_cache is always in the ZipFile's attributes,
  78. # not read from the file.
  79. files = self.zip.infolist()
  80. self.dir_cache = {
  81. dirname.rstrip("/"): {
  82. "name": dirname.rstrip("/"),
  83. "size": 0,
  84. "type": "directory",
  85. }
  86. for dirname in self._all_dirnames(self.zip.namelist())
  87. }
  88. for z in files:
  89. f = {s: getattr(z, s, None) for s in zipfile.ZipInfo.__slots__}
  90. f.update(
  91. {
  92. "name": z.filename.rstrip("/"),
  93. "size": z.file_size,
  94. "type": ("directory" if z.is_dir() else "file"),
  95. }
  96. )
  97. self.dir_cache[f["name"]] = f
  98. def pipe_file(self, path, value, **kwargs):
  99. # override upstream, because we know the exact file size in this case
  100. self.zip.writestr(path, value, **kwargs)
  101. def _open(
  102. self,
  103. path,
  104. mode="rb",
  105. block_size=None,
  106. autocommit=True,
  107. cache_options=None,
  108. **kwargs,
  109. ):
  110. path = self._strip_protocol(path)
  111. if "r" in mode and self.mode in set("wa"):
  112. if self.exists(path):
  113. raise OSError("ZipFS can only be open for reading or writing, not both")
  114. raise FileNotFoundError(path)
  115. if "r" in self.mode and "w" in mode:
  116. raise OSError("ZipFS can only be open for reading or writing, not both")
  117. out = self.zip.open(path, mode.strip("b"), force_zip64=self.force_zip_64)
  118. if "r" in mode:
  119. info = self.info(path)
  120. out.size = info["size"]
  121. out.name = info["name"]
  122. return out
  123. def find(self, path, maxdepth=None, withdirs=False, detail=False, **kwargs):
  124. if maxdepth is not None and maxdepth < 1:
  125. raise ValueError("maxdepth must be at least 1")
  126. def to_parts(_path: str):
  127. return list(filter(None, _path.replace("\\", "/").split("/")))
  128. if not isinstance(path, str):
  129. path = str(path)
  130. # Remove the leading slash, as the zip file paths are always
  131. # given without a leading slash
  132. path = path.lstrip("/")
  133. path_parts = to_parts(path)
  134. path_depth = len(path_parts)
  135. self._get_dirs()
  136. result = {}
  137. # To match posix find, if an exact file name is given, we should
  138. # return only that file
  139. if path in self.dir_cache and self.dir_cache[path]["type"] == "file":
  140. result[path] = self.dir_cache[path]
  141. return result if detail else [path]
  142. for file_path, file_info in self.dir_cache.items():
  143. if len(file_parts := to_parts(file_path)) < path_depth or any(
  144. a != b for a, b in zip(path_parts, file_parts)
  145. ):
  146. # skip parent folders and mismatching paths
  147. continue
  148. if file_info["type"] == "directory":
  149. if withdirs and file_path not in result:
  150. result[file_path.strip("/")] = file_info
  151. continue
  152. if file_path not in result:
  153. result[file_path] = file_info if detail else None
  154. if maxdepth:
  155. result = {
  156. k: v for k, v in result.items() if k.count("/") < maxdepth + path_depth
  157. }
  158. return result if detail else sorted(result)