| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702
703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842 |
- # Copyright 2022-present, the HuggingFace Inc. team.
- #
- # Licensed under the Apache License, Version 2.0 (the "License");
- # you may not use this file except in compliance with the License.
- # You may obtain a copy of the License at
- #
- # http://www.apache.org/licenses/LICENSE-2.0
- #
- # Unless required by applicable law or agreed to in writing, software
- # distributed under the License is distributed on an "AS IS" BASIS,
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- # See the License for the specific language governing permissions and
- # limitations under the License.
- """Contains utilities to manage the HF cache directory."""
- import os
- import shutil
- from collections import defaultdict
- from dataclasses import dataclass
- from pathlib import Path
- from typing import Literal
- from huggingface_hub.errors import CacheNotFound, CorruptedCacheException
- from ..constants import HF_HUB_CACHE
- from . import logging
- from ._parsing import format_timesince
- from ._terminal import tabulate
# Module-level logger, obtained from the package-local `logging` helper imported above.
logger = logging.get_logger(__name__)

# Literal type listing the repo type names used in annotations of this module.
REPO_TYPE_T = Literal["model", "dataset", "space"]

# List of OS-created helper files that need to be ignored
FILES_TO_IGNORE = [".DS_Store"]
@dataclass(frozen=True)
class CachedFileInfo:
    """Immutable record describing one file stored in the cache.

    Args:
        file_name (`str`):
            Base name of the file, e.g. `config.json`.
        file_path (`Path`):
            Location of the file inside the `snapshots` directory. This path is a
            symlink that points to a blob stored in the `blobs` folder.
        blob_path (`Path`):
            Location of the blob itself. Equivalent to `file_path.resolve()`.
        size_on_disk (`int`):
            Size of the blob, in bytes.
        blob_last_accessed (`float`):
            Timestamp of the most recent access to the blob (from any revision).
        blob_last_modified (`float`):
            Timestamp of the most recent modification/creation of the blob.

    > [!WARNING]
    > How reliable `blob_last_accessed` and `blob_last_modified` are depends on the OS
    > in use. See [python documentation](https://docs.python.org/3/library/os.html#os.stat_result)
    > for more details.
    """

    file_name: str
    file_path: Path
    blob_path: Path

    size_on_disk: int

    blob_last_accessed: float
    blob_last_modified: float

    @property
    def blob_last_accessed_str(self) -> str:
        """
        (property) Human-readable version of `blob_last_accessed` (last access to the
        blob, from any revision).

        Example: "2 weeks ago".
        """
        accessed_at = self.blob_last_accessed
        return format_timesince(accessed_at)

    @property
    def blob_last_modified_str(self) -> str:
        """
        (property) Human-readable version of `blob_last_modified`.

        Example: "2 weeks ago".
        """
        modified_at = self.blob_last_modified
        return format_timesince(modified_at)

    @property
    def size_on_disk_str(self) -> str:
        """
        (property) Human-readable version of the blob size.

        Example: "42.2K".
        """
        nb_bytes = self.size_on_disk
        return _format_size(nb_bytes)
@dataclass(frozen=True)
class CachedRevisionInfo:
    """Immutable record describing one cached revision of a repo.

    A revision corresponds to one folder of the `snapshots` directory. That folder
    mirrors the tree structure of the repo on the Hub but only contains symlinks. A
    revision is either pointed to by one or more `refs`, or "detached" (no refs).

    Args:
        commit_hash (`str`):
            Hash of the revision (unique).
            Example: `"9338f7b671827df886678df2bdd7cc7b4f36dffd"`.
        snapshot_path (`Path`):
            Path to the revision directory in the `snapshots` folder. It contains the
            exact tree structure as the repo on the Hub.
        files: (`frozenset[CachedFileInfo]`):
            Set of [`~CachedFileInfo`] describing every file of the snapshot.
        refs (`frozenset[str]`):
            Set of `refs` pointing to this revision. A revision without any `refs` is
            considered detached.
            Example: `{"main", "2.4.0"}` or `{"refs/pr/1"}`.
        size_on_disk (`int`):
            Sum of the sizes of the blob files symlink-ed by the revision.
        last_modified (`float`):
            Timestamp of the last time the revision has been created/modified.

    > [!WARNING]
    > `last_accessed` cannot be determined correctly on a single revision as blob files
    > are shared across revisions.

    > [!WARNING]
    > `size_on_disk` is not necessarily the sum of all file sizes because of possible
    > duplicated files. Besides, only blobs are taken into account, not the (negligible)
    > size of folders and symlinks.
    """

    commit_hash: str
    snapshot_path: Path
    size_on_disk: int
    files: frozenset[CachedFileInfo]
    refs: frozenset[str]

    last_modified: float

    @property
    def last_modified_str(self) -> str:
        """
        (property) Human-readable version of `last_modified`.

        Example: "2 weeks ago".
        """
        modified_at = self.last_modified
        return format_timesince(modified_at)

    @property
    def size_on_disk_str(self) -> str:
        """
        (property) Human-readable version of the summed blob sizes.

        Example: "42.2K".
        """
        nb_bytes = self.size_on_disk
        return _format_size(nb_bytes)

    @property
    def nb_files(self) -> int:
        """
        (property) Number of files contained in the revision.
        """
        revision_files = self.files
        return len(revision_files)
@dataclass(frozen=True)
class CachedRepoInfo:
    """Immutable record describing one repository stored in the cache.

    Args:
        repo_id (`str`):
            Id of the repo on the Hub. Example: `"google/fleurs"`.
        repo_type (`Literal["dataset", "model", "space"]`):
            Type of the cached repo.
        repo_path (`Path`):
            Local path to the cached repo.
        size_on_disk (`int`):
            Sum of the blob file sizes in the cached repo.
        nb_files (`int`):
            Total number of blob files in the cached repo.
        revisions (`frozenset[CachedRevisionInfo]`):
            Set of [`~CachedRevisionInfo`] describing every revision cached in the repo.
        last_accessed (`float`):
            Timestamp of the last time a blob file of the repo has been accessed.
        last_modified (`float`):
            Timestamp of the last time a blob file of the repo has been modified/created.

    > [!WARNING]
    > `size_on_disk` is not necessarily the sum of all revisions sizes because of
    > duplicated files. Besides, only blobs are taken into account, not the (negligible)
    > size of folders and symlinks.

    > [!WARNING]
    > How reliable `last_accessed` and `last_modified` are depends on the OS in use.
    > See [python documentation](https://docs.python.org/3/library/os.html#os.stat_result)
    > for more details.
    """

    repo_id: str
    repo_type: REPO_TYPE_T
    repo_path: Path
    size_on_disk: int
    nb_files: int
    revisions: frozenset[CachedRevisionInfo]

    last_accessed: float
    last_modified: float

    @property
    def last_accessed_str(self) -> str:
        """
        (property) Human-readable version of `last_accessed`.

        Example: "2 weeks ago".
        """
        accessed_at = self.last_accessed
        return format_timesince(accessed_at)

    @property
    def last_modified_str(self) -> str:
        """
        (property) Human-readable version of `last_modified`.

        Example: "2 weeks ago".
        """
        modified_at = self.last_modified
        return format_timesince(modified_at)

    @property
    def size_on_disk_str(self) -> str:
        """
        (property) Human-readable version of the summed blob sizes.

        Example: "42.2K".
        """
        nb_bytes = self.size_on_disk
        return _format_size(nb_bytes)

    @property
    def cache_id(self) -> str:
        """Canonical `type/id` identifier used across cache tooling."""
        kind, identifier = self.repo_type, self.repo_id
        return f"{kind}/{identifier}"

    @property
    def refs(self) -> dict[str, CachedRevisionInfo]:
        """
        (property) Mapping from each `ref` name to the revision it points to.
        """
        revision_by_ref: dict[str, CachedRevisionInfo] = {}
        for revision in self.revisions:
            for ref in revision.refs:
                revision_by_ref[ref] = revision
        return revision_by_ref
@dataclass(frozen=True)
class DeleteCacheStrategy:
    """Immutable description of what must be removed to delete cached revisions.

    Instances are not meant to be built by hand: they are returned by
    [`~utils.HFCacheInfo.delete_revisions`]. See documentation for usage example.

    Args:
        expected_freed_size (`int`):
            Size expected to be freed once the strategy is executed.
        blobs (`frozenset[Path]`):
            Set of blob file paths to be deleted.
        refs (`frozenset[Path]`):
            Set of reference file paths to be deleted.
        repos (`frozenset[Path]`):
            Set of entire repo paths to be deleted.
        snapshots (`frozenset[Path]`):
            Set of snapshots to be deleted (directory of symlinks).
    """

    expected_freed_size: int
    blobs: frozenset[Path]
    refs: frozenset[Path]
    repos: frozenset[Path]
    snapshots: frozenset[Path]

    @property
    def expected_freed_size_str(self) -> str:
        """
        (property) Human-readable version of the size expected to be freed.

        Example: "42.2K".
        """
        nb_bytes = self.expected_freed_size
        return _format_size(nb_bytes)

    def execute(self) -> None:
        """Run the deletion described by this strategy.

        > [!WARNING]
        > If this method is interrupted, the cache might get corrupted. Deletion order is
        > implemented so that references and symlinks are deleted before the actual blob
        > files.

        > [!WARNING]
        > This method is irreversible. If executed, cached files are erased and must be
        > downloaded again.
        """
        # Order matters: whole repos first, then snapshot directories, then ref files,
        # and blobs last. That way an interruption cannot leave a `ref` pointing to a
        # missing snapshot, or a snapshot symlink pointing to a deleted blob.
        deletion_plan = (
            ("repo", self.repos),
            ("snapshot", self.snapshots),
            ("ref", self.refs),
            ("blob", self.blobs),
        )
        for kind, targets in deletion_plan:
            for target in targets:
                _try_delete_path(target, path_type=kind)

        logger.info(f"Cache deletion done. Saved {self.expected_freed_size_str}.")
@dataclass(frozen=True)
class HFCacheInfo:
    """Frozen data structure holding information about the entire cache-system.

    This data structure is returned by [`scan_cache_dir`] and is immutable.

    Args:
        size_on_disk (`int`):
            Sum of all valid repo sizes in the cache-system.
        repos (`frozenset[CachedRepoInfo]`):
            Set of [`~CachedRepoInfo`] describing all valid cached repos found on the
            cache-system while scanning.
        warnings (`list[CorruptedCacheException]`):
            List of [`~CorruptedCacheException`] that occurred while scanning the cache.
            Those exceptions are captured so that the scan can continue. Corrupted repos
            are skipped from the scan.

    > [!WARNING]
    > Here `size_on_disk` is equal to the sum of all repo sizes (only blobs). However if
    > some cached repos are corrupted, their sizes are not taken into account.
    """

    size_on_disk: int
    repos: frozenset[CachedRepoInfo]
    warnings: list[CorruptedCacheException]

    @property
    def size_on_disk_str(self) -> str:
        """
        (property) Sum of all valid repo sizes in the cache-system as a human-readable
        string.

        Example: "42.2K".
        """
        return _format_size(self.size_on_disk)

    def delete_revisions(self, *revisions: str) -> DeleteCacheStrategy:
        """Prepare the strategy to delete one or more revisions cached locally.

        Input revisions can be any revision hash. If a revision hash is not found in the
        local cache, a warning is logged but no error is raised. Revisions can be from
        different cached repos since hashes are unique across repos.

        Examples:
        ```py
        >>> from huggingface_hub import scan_cache_dir
        >>> cache_info = scan_cache_dir()
        >>> delete_strategy = cache_info.delete_revisions(
        ...     "81fd1d6e7847c99f5862c9fb81387956d99ec7aa"
        ... )
        >>> print(f"Will free {delete_strategy.expected_freed_size_str}.")
        Will free 7.9K.
        >>> delete_strategy.execute()
        Cache deletion done. Saved 7.9K.
        ```

        ```py
        >>> from huggingface_hub import scan_cache_dir
        >>> scan_cache_dir().delete_revisions(
        ...     "81fd1d6e7847c99f5862c9fb81387956d99ec7aa",
        ...     "e2983b237dccf3ab4937c97fa717319a9ca1a96d",
        ...     "6c0e6080953db56375760c0471a8c5f2929baf11",
        ... ).execute()
        Cache deletion done. Saved 8.6G.
        ```

        > [!WARNING]
        > `delete_revisions` returns a [`~utils.DeleteCacheStrategy`] object that needs to
        > be executed. The [`~utils.DeleteCacheStrategy`] is not meant to be modified but
        > allows having a dry run before actually executing the deletion.

        Args:
            *revisions (`str`):
                Commit hashes of the revisions to delete.

        Returns:
            [`~utils.DeleteCacheStrategy`]: the paths to delete and the expected freed
            size. Nothing is deleted until its `execute()` method is called.
        """
        hashes_to_delete: set[str] = set(revisions)

        # Group the requested revisions by the repo that contains them.
        repos_with_revisions: dict[CachedRepoInfo, set[CachedRevisionInfo]] = defaultdict(set)
        for repo in self.repos:
            for revision in repo.revisions:
                if revision.commit_hash in hashes_to_delete:
                    repos_with_revisions[repo].add(revision)
                    hashes_to_delete.remove(revision.commit_hash)

        if hashes_to_delete:
            # Sorted so that the message is deterministic (set order is arbitrary).
            logger.warning(f"Revision(s) not found - cannot delete them: {', '.join(sorted(hashes_to_delete))}")

        delete_strategy_blobs: set[Path] = set()
        delete_strategy_refs: set[Path] = set()
        delete_strategy_repos: set[Path] = set()
        delete_strategy_snapshots: set[Path] = set()
        delete_strategy_expected_freed_size = 0

        for affected_repo, revisions_to_delete in repos_with_revisions.items():
            other_revisions = affected_repo.revisions - revisions_to_delete

            # If no other revisions, it means all revisions are deleted
            # -> delete the entire cached repo
            if not other_revisions:
                delete_strategy_repos.add(affected_repo.repo_path)
                delete_strategy_expected_freed_size += affected_repo.size_on_disk
                continue

            # Some revisions of the repo will be deleted but not all. Blobs that are
            # still referenced by a revision we keep must survive: collect them once
            # so that each candidate blob is checked with a single set lookup.
            blobs_still_in_use = {file.blob_path for revision in other_revisions for file in revision.files}

            for revision_to_delete in revisions_to_delete:
                # Snapshot dir
                delete_strategy_snapshots.add(revision_to_delete.snapshot_path)

                # Refs dir
                for ref in revision_to_delete.refs:
                    delete_strategy_refs.add(affected_repo.repo_path / "refs" / ref)

                # Blobs dir
                for file in revision_to_delete.files:
                    blob_path = file.blob_path
                    # Skip blobs already scheduled (their size must be counted only
                    # once) and blobs still needed by a remaining revision.
                    if blob_path in delete_strategy_blobs or blob_path in blobs_still_in_use:
                        continue
                    delete_strategy_blobs.add(blob_path)
                    delete_strategy_expected_freed_size += file.size_on_disk

        # Return the strategy instead of executing it.
        return DeleteCacheStrategy(
            blobs=frozenset(delete_strategy_blobs),
            refs=frozenset(delete_strategy_refs),
            repos=frozenset(delete_strategy_repos),
            snapshots=frozenset(delete_strategy_snapshots),
            expected_freed_size=delete_strategy_expected_freed_size,
        )

    def export_as_table(self, *, verbosity: int = 0) -> str:
        """Generate a table from the [`HFCacheInfo`] object.

        Pass `verbosity=0` to get a table with a single row per repo, with columns
        "repo_id", "repo_type", "size_on_disk", "nb_files", "last_accessed", "last_modified", "refs", "local_path".

        Pass any other `verbosity` (e.g. `verbosity=1`) to get a table with a row per repo and revision (thus multiple
        rows can appear for a single repo), with columns
        "repo_id", "repo_type", "revision", "size_on_disk", "nb_files", "last_modified", "refs", "local_path".

        Example:
        ```py
        >>> from huggingface_hub.utils import scan_cache_dir

        >>> hf_cache_info = scan_cache_dir()
        HFCacheInfo(...)

        >>> print(hf_cache_info.export_as_table())
        REPO ID                                             REPO TYPE SIZE ON DISK NB FILES LAST_ACCESSED LAST_MODIFIED REFS LOCAL PATH
        --------------------------------------------------- --------- ------------ -------- ------------- ------------- ---- --------------------------------------------------------------------------------------------------
        roberta-base                                        model             2.7M        5 1 day ago     1 week ago    main ~/.cache/huggingface/hub/models--roberta-base
        suno/bark                                           model             8.8K        1 1 week ago    1 week ago    main ~/.cache/huggingface/hub/models--suno--bark
        t5-base                                             model           893.8M        4 4 days ago    7 months ago  main ~/.cache/huggingface/hub/models--t5-base
        t5-large                                            model             3.0G        4 5 weeks ago   5 months ago  main ~/.cache/huggingface/hub/models--t5-large

        >>> print(hf_cache_info.export_as_table(verbosity=1))
        REPO ID                                             REPO TYPE REVISION                                 SIZE ON DISK NB FILES LAST_MODIFIED REFS LOCAL PATH
        --------------------------------------------------- --------- ---------------------------------------- ------------ -------- ------------- ---- -----------------------------------------------------------------------------------------------------------------------------------------------------
        roberta-base                                        model     e2da8e2f811d1448a5b465c236feacd80ffbac7b         2.7M        5 1 week ago    main ~/.cache/huggingface/hub/models--roberta-base/snapshots/e2da8e2f811d1448a5b465c236feacd80ffbac7b
        suno/bark                                           model     70a8a7d34168586dc5d028fa9666aceade177992         8.8K        1 1 week ago    main ~/.cache/huggingface/hub/models--suno--bark/snapshots/70a8a7d34168586dc5d028fa9666aceade177992
        t5-base                                             model     a9723ea7f1b39c1eae772870f3b547bf6ef7e6c1       893.8M        4 7 months ago  main ~/.cache/huggingface/hub/models--t5-base/snapshots/a9723ea7f1b39c1eae772870f3b547bf6ef7e6c1
        t5-large                                            model     150ebc2c4b72291e770f58e6057481c8d2ed331a         3.0G        4 5 months ago  main ~/.cache/huggingface/hub/models--t5-large/snapshots/150ebc2c4b72291e770f58e6057481c8d2ed331a
        ```

        Args:
            verbosity (`int`, *optional*):
                The verbosity level. Defaults to 0.

        Returns:
            `str`: The table as a string.
        """
        if verbosity == 0:
            # One row per repo, ordered by local path.
            return tabulate(
                rows=[
                    [
                        repo.repo_id,
                        repo.repo_type,
                        f"{repo.size_on_disk_str:>12}",
                        repo.nb_files,
                        repo.last_accessed_str,
                        repo.last_modified_str,
                        ", ".join(sorted(repo.refs)),
                        str(repo.repo_path),
                    ]
                    for repo in sorted(self.repos, key=lambda repo: repo.repo_path)
                ],
                headers=[
                    "REPO ID",
                    "REPO TYPE",
                    "SIZE ON DISK",
                    "NB FILES",
                    "LAST_ACCESSED",
                    "LAST_MODIFIED",
                    "REFS",
                    "LOCAL PATH",
                ],
            )
        else:
            # One row per (repo, revision), ordered by local path then commit hash.
            return tabulate(
                rows=[
                    [
                        repo.repo_id,
                        repo.repo_type,
                        revision.commit_hash,
                        f"{revision.size_on_disk_str:>12}",
                        revision.nb_files,
                        revision.last_modified_str,
                        ", ".join(sorted(revision.refs)),
                        str(revision.snapshot_path),
                    ]
                    for repo in sorted(self.repos, key=lambda repo: repo.repo_path)
                    for revision in sorted(repo.revisions, key=lambda revision: revision.commit_hash)
                ],
                headers=[
                    "REPO ID",
                    "REPO TYPE",
                    "REVISION",
                    "SIZE ON DISK",
                    "NB FILES",
                    "LAST_MODIFIED",
                    "REFS",
                    "LOCAL PATH",
                ],
            )
def scan_cache_dir(cache_dir: str | Path | None = None) -> HFCacheInfo:
    """Scan the whole HF cache-system and describe it as a [`~HFCacheInfo`] structure.

    Use `scan_cache_dir` to programmatically inspect your cache-system. The cache is
    scanned one repo at a time. When a repo is corrupted, the
    [`~CorruptedCacheException`] raised internally is captured and returned in the
    [`~HFCacheInfo`] structure. Only valid repos get a proper report.

    ```py
    >>> from huggingface_hub import scan_cache_dir

    >>> hf_cache_info = scan_cache_dir()
    HFCacheInfo(
        size_on_disk=3398085269,
        repos=frozenset({
            CachedRepoInfo(
                repo_id='t5-small',
                repo_type='model',
                repo_path=PosixPath(...),
                size_on_disk=970726914,
                nb_files=11,
                revisions=frozenset({
                    CachedRevisionInfo(
                        commit_hash='d78aea13fa7ecd06c29e3e46195d6341255065d5',
                        size_on_disk=970726339,
                        snapshot_path=PosixPath(...),
                        files=frozenset({
                            CachedFileInfo(
                                file_name='config.json',
                                size_on_disk=1197,
                                file_path=PosixPath(...),
                                blob_path=PosixPath(...),
                            ),
                            CachedFileInfo(...),
                            ...
                        }),
                    ),
                    CachedRevisionInfo(...),
                    ...
                }),
            ),
            CachedRepoInfo(...),
            ...
        }),
        warnings=[
            CorruptedCacheException("Snapshots dir doesn't exist in cached repo: ..."),
            CorruptedCacheException(...),
            ...
        ],
    )
    ```

    You can also print a detailed report directly from the `hf` command line using:

    ```text
    > hf cache ls
    ID                          SIZE     LAST_ACCESSED LAST_MODIFIED REFS
    --------------------------- -------- ------------- ------------- -----------
    dataset/nyu-mll/glue          157.4M 2 days ago    2 days ago    main script
    model/LiquidAI/LFM2-VL-1.6B     3.2G 4 days ago    4 days ago    main
    model/microsoft/UserLM-8b      32.1G 4 days ago    4 days ago    main

    Done in 0.0s. Scanned 6 repo(s) for a total of 3.4G.
    Got 1 warning(s) while scanning. Use -vvv to print details.
    ```

    Args:
        cache_dir (`str` or `Path`, `optional`):
            Cache directory to scan. Defaults to the default HF cache directory.

    > [!WARNING]
    > Raises:
    >
    >     `CacheNotFound`
    >       If the cache directory does not exist.
    >
    >     [`ValueError`](https://docs.python.org/3/library/exceptions.html#ValueError)
    >       If the cache directory is a file, instead of a directory.

    Returns: a [`~HFCacheInfo`] object.
    """
    # Fall back to the default location, then normalise to an absolute path.
    cache_dir = Path(HF_HUB_CACHE if cache_dir is None else cache_dir).expanduser().resolve()

    if not cache_dir.exists():
        raise CacheNotFound(
            f"Cache directory not found: {cache_dir}. Please use `cache_dir` argument or set `HF_HUB_CACHE` environment variable.",
            cache_dir=cache_dir,
        )
    if cache_dir.is_file():
        raise ValueError(
            f"Scan cache expects a directory but found a file: {cache_dir}. Please use `cache_dir` argument or set `HF_HUB_CACHE` environment variable."
        )

    # Entries of the cache root that are not cached repos: the './.locks/' folder
    # and the CACHEDIR.TAG file.
    entries_to_skip = (".locks", "CACHEDIR.TAG")

    valid_repos: set[CachedRepoInfo] = set()
    scan_warnings: list[CorruptedCacheException] = []
    for entry in cache_dir.iterdir():
        if entry.name in entries_to_skip:
            continue
        try:
            valid_repos.add(_scan_cached_repo(entry))
        except CorruptedCacheException as error:
            # Keep scanning: corrupted repos are reported, not fatal.
            scan_warnings.append(error)

    total_size = sum(repo.size_on_disk for repo in valid_repos)
    return HFCacheInfo(
        repos=frozenset(valid_repos),
        size_on_disk=total_size,
        warnings=scan_warnings,
    )
def _scan_cached_repo(repo_path: Path) -> CachedRepoInfo:
    """Scan a single cache repo and return information about it.

    Args:
        repo_path (`Path`):
            Path to the cached repo folder. Its name is expected to look like
            `"<type>s--<namespace>--<name>"`, e.g. `"datasets--google--fleurs"`.

    Returns:
        [`~CachedRepoInfo`]: description of the repo, its revisions and its files.

    Raises:
        [`~CorruptedCacheException`]:
            On any unexpected layout: path is not a directory, folder name is not a
            valid cache name, unknown repo type, missing `snapshots` folder, `refs`
            is a file, a file is found directly in `snapshots`, a snapshot symlink is
            broken, or a ref points to a commit hash that has no snapshot.
    """
    if not repo_path.is_dir():
        raise CorruptedCacheException(f"Repo path is not a directory: {repo_path}")

    if "--" not in repo_path.name:
        raise CorruptedCacheException(f"Repo path is not a valid HuggingFace cache directory: {repo_path}")

    repo_type, repo_id = repo_path.name.split("--", maxsplit=1)
    repo_type = repo_type[:-1]  # "models" -> "model"
    repo_id = repo_id.replace("--", "/")  # "google--fleurs" -> "google/fleurs"

    if repo_type not in {"dataset", "model", "space"}:
        raise CorruptedCacheException(
            f"Repo type must be `dataset`, `model` or `space`, found `{repo_type}` ({repo_path})."
        )

    blob_stats: dict[Path, os.stat_result] = {}  # Key is blob_path, value is blob stats

    snapshots_path = repo_path / "snapshots"
    refs_path = repo_path / "refs"

    # `is_dir()` is already False when the path does not exist.
    if not snapshots_path.is_dir():
        raise CorruptedCacheException(f"Snapshots dir doesn't exist in cached repo: {snapshots_path}")

    # Scan over `refs` directory

    # key is revision hash, value is set of refs
    refs_by_hash: dict[str, set[str]] = defaultdict(set)
    if refs_path.exists():
        # Example of `refs` directory
        # ── refs
        #     ├── main
        #     └── refs
        #         └── pr
        #             └── 1
        if refs_path.is_file():
            raise CorruptedCacheException(f"Refs directory cannot be a file: {refs_path}")

        for ref_path in refs_path.glob("**/*"):
            # glob("**/*") iterates over all files and directories -> skip directories
            if ref_path.is_dir() or ref_path.name in FILES_TO_IGNORE:
                continue

            ref_name = str(ref_path.relative_to(refs_path))
            # Strip surrounding whitespace: a trailing newline in the ref file must not
            # make the hash mismatch its snapshot folder name (which would flag the
            # whole repo as corrupted).
            commit_hash = ref_path.read_text().strip()

            refs_by_hash[commit_hash].add(ref_name)

    # Scan snapshots directory
    cached_revisions: set[CachedRevisionInfo] = set()
    for revision_path in snapshots_path.iterdir():
        # Ignore OS-created helper files
        if revision_path.name in FILES_TO_IGNORE:
            continue
        if revision_path.is_file():
            raise CorruptedCacheException(f"Snapshots folder corrupted. Found a file: {revision_path}")

        cached_files = set()
        for file_path in revision_path.glob("**/*"):
            # glob("**/*") iterates over all files and directories -> skip directories
            if file_path.is_dir():
                continue

            # Follow the snapshot symlink to the blob it points to.
            blob_path = file_path.resolve()
            if not blob_path.exists():
                raise CorruptedCacheException(f"Blob missing (broken symlink): {blob_path}")

            # A blob can be shared by several revisions: stat it only once.
            if blob_path not in blob_stats:
                blob_stats[blob_path] = blob_path.stat()
            stats = blob_stats[blob_path]

            cached_files.add(
                CachedFileInfo(
                    file_name=file_path.name,
                    file_path=file_path,
                    size_on_disk=stats.st_size,
                    blob_path=blob_path,
                    blob_last_accessed=stats.st_atime,
                    blob_last_modified=stats.st_mtime,
                )
            )

        # Last modified is either the last modified blob file or the revision folder
        # itself if it is empty
        if cached_files:
            revision_last_modified = max(blob_stats[file.blob_path].st_mtime for file in cached_files)
        else:
            revision_last_modified = revision_path.stat().st_mtime

        cached_revisions.add(
            CachedRevisionInfo(
                commit_hash=revision_path.name,
                files=frozenset(cached_files),
                # `pop` so that only refs without a matching snapshot remain afterwards.
                refs=frozenset(refs_by_hash.pop(revision_path.name, set())),
                # Deduplicate blobs: several files of a revision may share one blob.
                size_on_disk=sum(
                    blob_stats[blob_path].st_size for blob_path in {file.blob_path for file in cached_files}
                ),
                snapshot_path=revision_path,
                last_modified=revision_last_modified,
            )
        )

    # Check that all refs referred to an existing revision
    if refs_by_hash:
        raise CorruptedCacheException(
            f"Reference(s) refer to missing commit hashes: {dict(refs_by_hash)} ({repo_path})."
        )

    # Last modified is either the last modified blob file or the repo folder itself if
    # no blob files has been found. Same for last accessed.
    if blob_stats:
        repo_last_accessed = max(stat.st_atime for stat in blob_stats.values())
        repo_last_modified = max(stat.st_mtime for stat in blob_stats.values())
    else:
        repo_stats = repo_path.stat()
        repo_last_accessed = repo_stats.st_atime
        repo_last_modified = repo_stats.st_mtime

    # Build and return frozen structure
    return CachedRepoInfo(
        nb_files=len(blob_stats),
        repo_id=repo_id,
        repo_path=repo_path,
        repo_type=repo_type,  # type: ignore
        revisions=frozenset(cached_revisions),
        size_on_disk=sum(stat.st_size for stat in blob_stats.values()),
        last_accessed=repo_last_accessed,
        last_modified=repo_last_modified,
    )
- def _format_size(num: int) -> str:
- """Format size in bytes into a human-readable string.
- Taken from https://stackoverflow.com/a/1094933
- """
- num_f = float(num)
- for unit in ["", "K", "M", "G", "T", "P", "E", "Z"]:
- if abs(num_f) < 1000.0:
- return f"{num_f:3.1f}{unit}"
- num_f /= 1000.0
- return f"{num_f:.1f}Y"
def _try_delete_path(path: Path, path_type: str) -> None:
    """Try to delete a local file or folder.

    If the path does not exist or cannot be deleted because of permissions, the error
    is logged as a warning and then ignored.

    Args:
        path (`Path`)
            Path to delete. Can be a file, a symlink or a folder.
        path_type (`str`)
            What path are we deleting ? Only for logging purposes. Example: "snapshot".
    """
    logger.info(f"Delete {path_type}: {path}")
    try:
        # `shutil.rmtree` refuses to operate on symbolic links, so links (including
        # broken ones and links to directories) are unlinked like regular files.
        if path.is_file() or path.is_symlink():
            os.remove(path)
        else:
            shutil.rmtree(path)
    except FileNotFoundError:
        logger.warning(f"Couldn't delete {path_type}: file not found ({path})", exc_info=True)
    except PermissionError:
        logger.warning(f"Couldn't delete {path_type}: permission denied ({path})", exc_info=True)
|