_local_folder.py 18 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473
  1. # Copyright 2024-present, the HuggingFace Inc. team.
  2. #
  3. # Licensed under the Apache License, Version 2.0 (the "License");
  4. # you may not use this file except in compliance with the License.
  5. # You may obtain a copy of the License at
  6. #
  7. # http://www.apache.org/licenses/LICENSE-2.0
  8. #
  9. # Unless required by applicable law or agreed to in writing, software
  10. # distributed under the License is distributed on an "AS IS" BASIS,
  11. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. # See the License for the specific language governing permissions and
  13. # limitations under the License.
"""Contains utilities to handle the `./.cache/huggingface` folder in local directories.

First discussed in https://github.com/huggingface/huggingface_hub/issues/1738 to store
download metadata when downloading files from the hub to a local directory (without
using the cache).

./.cache/huggingface folder structure:
[4.0K]  data
├── [4.0K]  .cache
│   └── [4.0K]  huggingface
│       └── [4.0K]  download
│           ├── [  16]  file.parquet.metadata
│           ├── [  16]  file.txt.metadata
│           └── [4.0K]  folder
│               └── [  16]  file.parquet.metadata
├── [6.5G]  file.parquet
├── [1.5K]  file.txt
└── [4.0K]  folder
    └── [  16]  file.parquet

Download metadata file structure (one value per line: commit hash, etag, timestamp):
    ```
    # file.txt.metadata
    11c5a3d5811f50298f278a704980280950aedb10
    a16a55fda99d2f2e7b69cce5cf93ff4ad3049930
    1712656091.123

    # file.parquet.metadata
    11c5a3d5811f50298f278a704980280950aedb10
    7c5d3f4b8b76583b422fcb9189ad6c89d5d97a094541ce8932dce3ecabde1421
    1712656091.123
    ```
"""
  44. import base64
  45. import hashlib
  46. import logging
  47. import os
  48. import time
  49. from dataclasses import dataclass
  50. from pathlib import Path
  51. from .utils import WeakFileLock
  52. logger = logging.getLogger(__name__)
  53. CACHEDIR_TAG_CONTENT = (
  54. "Signature: 8a477f597d28d172789f06886806bc55\n"
  55. "# This file is a cache directory tag created by huggingface_hub.\n"
  56. "# For information about cache directory tags, see:\n"
  57. "#\thttps://bford.info/cachedir/\n"
  58. )
  59. @dataclass
  60. class LocalDownloadFilePaths:
  61. """
  62. Paths to the files related to a download process in a local dir.
  63. Returned by [`get_local_download_paths`].
  64. Attributes:
  65. file_path (`Path`):
  66. Path where the file will be saved.
  67. lock_path (`Path`):
  68. Path to the lock file used to ensure atomicity when reading/writing metadata.
  69. metadata_path (`Path`):
  70. Path to the metadata file.
  71. """
  72. file_path: Path
  73. lock_path: Path
  74. metadata_path: Path
  75. def incomplete_path(self, etag: str) -> Path:
  76. """Return the path where a file will be temporarily downloaded before being moved to `file_path`."""
  77. path = self.metadata_path.parent / f"{_short_hash(self.metadata_path.name)}.{etag}.incomplete"
  78. resolved_path = str(path.resolve())
  79. # Some Windows versions do not allow for paths longer than 255 characters.
  80. # In this case, we must specify it as an extended path by using the "\\?\" prefix.
  81. if os.name == "nt" and len(resolved_path) > 255 and not resolved_path.startswith("\\\\?\\"):
  82. path = Path("\\\\?\\" + resolved_path)
  83. return path
  84. @dataclass(frozen=True)
  85. class LocalUploadFilePaths:
  86. """
  87. Paths to the files related to an upload process in a local dir.
  88. Returned by [`get_local_upload_paths`].
  89. Attributes:
  90. path_in_repo (`str`):
  91. Path of the file in the repo.
  92. file_path (`Path`):
  93. Path where the file will be saved.
  94. lock_path (`Path`):
  95. Path to the lock file used to ensure atomicity when reading/writing metadata.
  96. metadata_path (`Path`):
  97. Path to the metadata file.
  98. """
  99. path_in_repo: str
  100. file_path: Path
  101. lock_path: Path
  102. metadata_path: Path
  103. @dataclass
  104. class LocalDownloadFileMetadata:
  105. """
  106. Metadata about a file in the local directory related to a download process.
  107. Attributes:
  108. filename (`str`):
  109. Path of the file in the repo.
  110. commit_hash (`str`):
  111. Commit hash of the file in the repo.
  112. etag (`str`):
  113. ETag of the file in the repo. Used to check if the file has changed.
  114. For LFS files, this is the sha256 of the file. For regular files, it corresponds to the git hash.
  115. timestamp (`int`):
  116. Unix timestamp of when the metadata was saved i.e. when the metadata was accurate.
  117. """
  118. filename: str
  119. commit_hash: str
  120. etag: str
  121. timestamp: float
  122. @dataclass
  123. class LocalUploadFileMetadata:
  124. """
  125. Metadata about a file in the local directory related to an upload process.
  126. """
  127. size: int
  128. # Default values correspond to "we don't know yet"
  129. timestamp: float | None = None
  130. should_ignore: bool | None = None
  131. sha256: str | None = None
  132. upload_mode: str | None = None
  133. remote_oid: str | None = None
  134. is_uploaded: bool = False
  135. is_committed: bool = False
  136. def save(self, paths: LocalUploadFilePaths) -> None:
  137. """Save the metadata to disk."""
  138. with WeakFileLock(paths.lock_path):
  139. with paths.metadata_path.open("w") as f:
  140. new_timestamp = time.time()
  141. f.write(str(new_timestamp) + "\n")
  142. f.write(str(self.size)) # never None
  143. f.write("\n")
  144. if self.should_ignore is not None:
  145. f.write(str(int(self.should_ignore)))
  146. f.write("\n")
  147. if self.sha256 is not None:
  148. f.write(self.sha256)
  149. f.write("\n")
  150. if self.upload_mode is not None:
  151. f.write(self.upload_mode)
  152. f.write("\n")
  153. if self.remote_oid is not None:
  154. f.write(self.remote_oid)
  155. f.write("\n")
  156. f.write(str(int(self.is_uploaded)) + "\n")
  157. f.write(str(int(self.is_committed)) + "\n")
  158. self.timestamp = new_timestamp
  159. def get_local_download_paths(local_dir: Path, filename: str) -> LocalDownloadFilePaths:
  160. """Compute paths to the files related to a download process.
  161. Folders containing the paths are all guaranteed to exist.
  162. Args:
  163. local_dir (`Path`):
  164. Path to the local directory in which files are downloaded.
  165. filename (`str`):
  166. Path of the file in the repo.
  167. Return:
  168. [`LocalDownloadFilePaths`]: the paths to the files (file_path, lock_path, metadata_path, incomplete_path).
  169. """
  170. # filename is the path in the Hub repository (separated by '/')
  171. # make sure to have a cross-platform transcription
  172. sanitized_filename = os.path.join(*filename.split("/"))
  173. if os.name == "nt":
  174. if sanitized_filename.startswith("..\\") or "\\..\\" in sanitized_filename:
  175. raise ValueError(
  176. f"Invalid filename: cannot handle filename '{sanitized_filename}' on Windows. Please ask the repository"
  177. " owner to rename this file."
  178. )
  179. file_path = local_dir / sanitized_filename
  180. metadata_path = _huggingface_dir(local_dir) / "download" / f"{sanitized_filename}.metadata"
  181. lock_path = metadata_path.with_suffix(".lock")
  182. # Some Windows versions do not allow for paths longer than 255 characters.
  183. # In this case, we must specify it as an extended path by using the "\\?\" prefix
  184. if os.name == "nt":
  185. if not str(local_dir).startswith("\\\\?\\") and len(os.path.abspath(lock_path)) > 255:
  186. file_path = Path("\\\\?\\" + os.path.abspath(file_path))
  187. lock_path = Path("\\\\?\\" + os.path.abspath(lock_path))
  188. metadata_path = Path("\\\\?\\" + os.path.abspath(metadata_path))
  189. file_path.parent.mkdir(parents=True, exist_ok=True)
  190. metadata_path.parent.mkdir(parents=True, exist_ok=True)
  191. return LocalDownloadFilePaths(file_path=file_path, lock_path=lock_path, metadata_path=metadata_path)
  192. def get_local_upload_paths(local_dir: Path, filename: str) -> LocalUploadFilePaths:
  193. """Compute paths to the files related to an upload process.
  194. Folders containing the paths are all guaranteed to exist.
  195. Args:
  196. local_dir (`Path`):
  197. Path to the local directory that is uploaded.
  198. filename (`str`):
  199. Path of the file in the repo.
  200. Return:
  201. [`LocalUploadFilePaths`]: the paths to the files (file_path, lock_path, metadata_path).
  202. """
  203. # filename is the path in the Hub repository (separated by '/')
  204. # make sure to have a cross-platform transcription
  205. sanitized_filename = os.path.join(*filename.split("/"))
  206. if os.name == "nt":
  207. if sanitized_filename.startswith("..\\") or "\\..\\" in sanitized_filename:
  208. raise ValueError(
  209. f"Invalid filename: cannot handle filename '{sanitized_filename}' on Windows. Please ask the repository"
  210. " owner to rename this file."
  211. )
  212. file_path = local_dir / sanitized_filename
  213. metadata_path = _huggingface_dir(local_dir) / "upload" / f"{sanitized_filename}.metadata"
  214. lock_path = metadata_path.with_suffix(".lock")
  215. # Some Windows versions do not allow for paths longer than 255 characters.
  216. # In this case, we must specify it as an extended path by using the "\\?\" prefix
  217. if os.name == "nt":
  218. if not str(local_dir).startswith("\\\\?\\") and len(os.path.abspath(lock_path)) > 255:
  219. file_path = Path("\\\\?\\" + os.path.abspath(file_path))
  220. lock_path = Path("\\\\?\\" + os.path.abspath(lock_path))
  221. metadata_path = Path("\\\\?\\" + os.path.abspath(metadata_path))
  222. file_path.parent.mkdir(parents=True, exist_ok=True)
  223. metadata_path.parent.mkdir(parents=True, exist_ok=True)
  224. return LocalUploadFilePaths(
  225. path_in_repo=filename, file_path=file_path, lock_path=lock_path, metadata_path=metadata_path
  226. )
  227. def read_download_metadata(local_dir: Path, filename: str) -> LocalDownloadFileMetadata | None:
  228. """Read metadata about a file in the local directory related to a download process.
  229. Args:
  230. local_dir (`Path`):
  231. Path to the local directory in which files are downloaded.
  232. filename (`str`):
  233. Path of the file in the repo.
  234. Return:
  235. `[LocalDownloadFileMetadata]` or `None`: the metadata if it exists, `None` otherwise.
  236. """
  237. paths = get_local_download_paths(local_dir, filename)
  238. with WeakFileLock(paths.lock_path):
  239. if paths.metadata_path.exists():
  240. try:
  241. with paths.metadata_path.open() as f:
  242. commit_hash = f.readline().strip()
  243. etag = f.readline().strip()
  244. timestamp = float(f.readline().strip())
  245. metadata = LocalDownloadFileMetadata(
  246. filename=filename,
  247. commit_hash=commit_hash,
  248. etag=etag,
  249. timestamp=timestamp,
  250. )
  251. except Exception as e:
  252. # remove the metadata file if it is corrupted / not the right format
  253. logger.warning(
  254. f"Invalid metadata file {paths.metadata_path}: {e}. Removing it from disk and continue."
  255. )
  256. try:
  257. paths.metadata_path.unlink()
  258. except Exception as e:
  259. logger.warning(f"Could not remove corrupted metadata file {paths.metadata_path}: {e}")
  260. return None
  261. try:
  262. # check if the file exists and hasn't been modified since the metadata was saved
  263. stat = paths.file_path.stat()
  264. if (
  265. stat.st_mtime - 1 <= metadata.timestamp
  266. ): # allow 1s difference as stat.st_mtime might not be precise
  267. return metadata
  268. logger.info(f"Ignored metadata for '{filename}' (outdated). Will re-compute hash.")
  269. except FileNotFoundError:
  270. # file does not exist => metadata is outdated
  271. return None
  272. return None
  273. def read_upload_metadata(local_dir: Path, filename: str) -> LocalUploadFileMetadata:
  274. """Read metadata about a file in the local directory related to an upload process.
  275. TODO: factorize logic with `read_download_metadata`.
  276. Args:
  277. local_dir (`Path`):
  278. Path to the local directory in which files are downloaded.
  279. filename (`str`):
  280. Path of the file in the repo.
  281. Return:
  282. `[LocalUploadFileMetadata]` or `None`: the metadata if it exists, `None` otherwise.
  283. """
  284. paths = get_local_upload_paths(local_dir, filename)
  285. with WeakFileLock(paths.lock_path):
  286. if paths.metadata_path.exists():
  287. try:
  288. with paths.metadata_path.open() as f:
  289. timestamp = float(f.readline().strip())
  290. size = int(f.readline().strip()) # never None
  291. _should_ignore = f.readline().strip()
  292. should_ignore = None if _should_ignore == "" else bool(int(_should_ignore))
  293. _sha256 = f.readline().strip()
  294. sha256 = None if _sha256 == "" else _sha256
  295. _upload_mode = f.readline().strip()
  296. upload_mode = None if _upload_mode == "" else _upload_mode
  297. if upload_mode not in (None, "regular", "lfs"):
  298. raise ValueError(f"Invalid upload mode in metadata {paths.path_in_repo}: {upload_mode}")
  299. _remote_oid = f.readline().strip()
  300. remote_oid = None if _remote_oid == "" else _remote_oid
  301. is_uploaded = bool(int(f.readline().strip()))
  302. is_committed = bool(int(f.readline().strip()))
  303. metadata = LocalUploadFileMetadata(
  304. timestamp=timestamp,
  305. size=size,
  306. should_ignore=should_ignore,
  307. sha256=sha256,
  308. upload_mode=upload_mode,
  309. remote_oid=remote_oid,
  310. is_uploaded=is_uploaded,
  311. is_committed=is_committed,
  312. )
  313. except Exception as e:
  314. # remove the metadata file if it is corrupted / not the right format
  315. logger.warning(
  316. f"Invalid metadata file {paths.metadata_path}: {e}. Removing it from disk and continue."
  317. )
  318. try:
  319. paths.metadata_path.unlink()
  320. except Exception as e:
  321. logger.warning(f"Could not remove corrupted metadata file {paths.metadata_path}: {e}")
  322. # corrupted metadata => we don't know anything expect its size
  323. return LocalUploadFileMetadata(size=paths.file_path.stat().st_size)
  324. # TODO: can we do better?
  325. if (
  326. metadata.timestamp is not None
  327. and metadata.is_uploaded # file was uploaded
  328. and not metadata.is_committed # but not committed
  329. and time.time() - metadata.timestamp > 20 * 3600 # and it's been more than 20 hours
  330. ): # => we consider it as garbage-collected by S3
  331. metadata.is_uploaded = False
  332. # check if the file exists and hasn't been modified since the metadata was saved
  333. try:
  334. if metadata.timestamp is not None and paths.file_path.stat().st_mtime <= metadata.timestamp:
  335. return metadata
  336. logger.info(f"Ignored metadata for '{filename}' (outdated). Will re-compute hash.")
  337. except FileNotFoundError:
  338. # file does not exist => metadata is outdated
  339. pass
  340. # empty metadata => we don't know anything expect its size
  341. return LocalUploadFileMetadata(size=paths.file_path.stat().st_size)
  342. def write_download_metadata(local_dir: Path, filename: str, commit_hash: str, etag: str) -> None:
  343. """Write metadata about a file in the local directory related to a download process.
  344. Args:
  345. local_dir (`Path`):
  346. Path to the local directory in which files are downloaded.
  347. """
  348. paths = get_local_download_paths(local_dir, filename)
  349. with WeakFileLock(paths.lock_path):
  350. with paths.metadata_path.open("w") as f:
  351. f.write(f"{commit_hash}\n{etag}\n{time.time()}\n")
  352. def _huggingface_dir(local_dir: Path) -> Path:
  353. """Return the path to the `.cache/huggingface` directory in a local directory."""
  354. # Wrap in lru_cache to avoid overwriting the .gitignore file if called multiple times
  355. path = local_dir / ".cache" / "huggingface"
  356. path.mkdir(exist_ok=True, parents=True)
  357. # Create a CACHEDIR.TAG so backup tools can skip this directory.
  358. _create_cachedir_tag(path)
  359. # Create a .gitignore file in the .cache/huggingface directory if it doesn't exist
  360. # Should be thread-safe enough like this.
  361. gitignore = path / ".gitignore"
  362. gitignore_lock = path / ".gitignore.lock"
  363. if not gitignore.exists():
  364. try:
  365. with WeakFileLock(gitignore_lock, timeout=0.1):
  366. gitignore.write_text("*")
  367. except IndexError:
  368. pass
  369. except OSError: # TimeoutError, FileNotFoundError, PermissionError, etc.
  370. pass
  371. try:
  372. gitignore_lock.unlink()
  373. except OSError:
  374. pass
  375. return path
  376. def _create_cachedir_tag(cache_dir: Path) -> None:
  377. """Create a CACHEDIR.TAG file in ``cache_dir`` if one does not already exist.
  378. The tag follows the `Cache Directory Tagging Standard <http://www.brynosaurus.com/cachedir/>`_
  379. so that backup tools can recognize and skip cache directories.
  380. """
  381. tag_path = cache_dir / "CACHEDIR.TAG"
  382. if not tag_path.exists():
  383. try:
  384. tag_path.write_text(CACHEDIR_TAG_CONTENT)
  385. except OSError:
  386. pass
  387. def _short_hash(filename: str) -> str:
  388. return base64.urlsafe_b64encode(hashlib.sha1(filename.encode()).digest()).decode()