| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133 |
- # Copyright 2019-present, the HuggingFace Inc. team.
- #
- # Licensed under the Apache License, Version 2.0 (the "License");
- # you may not use this file except in compliance with the License.
- # You may obtain a copy of the License at
- #
- # http://www.apache.org/licenses/LICENSE-2.0
- #
- # Unless required by applicable law or agreed to in writing, software
- # distributed under the License is distributed on an "AS IS" BASIS,
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- # See the License for the specific language governing permissions and
- # limitations under the License.
- from pathlib import Path
- from ..constants import HF_ASSETS_CACHE
- def cached_assets_path(
- library_name: str,
- namespace: str = "default",
- subfolder: str = "default",
- *,
- assets_dir: str | Path | None = None,
- ) -> Path:
- """Return a folder path to cache arbitrary files.
- `huggingface_hub` provides a canonical folder path to store assets. This is the
- recommended way to integrate cache in a downstream library as it will benefit from
- the builtins tools to scan and delete the cache properly.
- The distinction is made between files cached from the Hub and assets. Files from the
- Hub are cached in a git-aware manner and entirely managed by `huggingface_hub`. See
- [related documentation](https://huggingface.co/docs/huggingface_hub/how-to-cache).
- All other files that a downstream library caches are considered to be "assets"
- (files downloaded from external sources, extracted from a .tar archive, preprocessed
- for training,...).
- Once the folder path is generated, it is guaranteed to exist and to be a directory.
- The path is based on 3 levels of depth: the library name, a namespace and a
- subfolder. Those 3 levels grants flexibility while allowing `huggingface_hub` to
- expect folders when scanning/deleting parts of the assets cache. Within a library,
- it is expected that all namespaces share the same subset of subfolder names but this
- is not a mandatory rule. The downstream library has then full control on which file
- structure to adopt within its cache. Namespace and subfolder are optional (would
- default to a `"default/"` subfolder) but library name is mandatory as we want every
- downstream library to manage its own cache.
- Expected tree:
- ```text
- assets/
- └── datasets/
- │ ├── SQuAD/
- │ │ ├── downloaded/
- │ │ ├── extracted/
- │ │ └── processed/
- │ ├── Helsinki-NLP--tatoeba_mt/
- │ ├── downloaded/
- │ ├── extracted/
- │ └── processed/
- └── transformers/
- ├── default/
- │ ├── something/
- ├── bert-base-cased/
- │ ├── default/
- │ └── training/
- hub/
- └── models--julien-c--EsperBERTo-small/
- ├── blobs/
- │ ├── (...)
- │ ├── (...)
- ├── refs/
- │ └── (...)
- └── [ 128] snapshots/
- ├── 2439f60ef33a0d46d85da5001d52aeda5b00ce9f/
- │ ├── (...)
- └── bbc77c8132af1cc5cf678da3f1ddf2de43606d48/
- └── (...)
- ```
- Args:
- library_name (`str`):
- Name of the library that will manage the cache folder. Example: `"dataset"`.
- namespace (`str`, *optional*, defaults to "default"):
- Namespace to which the data belongs. Example: `"SQuAD"`.
- subfolder (`str`, *optional*, defaults to "default"):
- Subfolder in which the data will be stored. Example: `extracted`.
- assets_dir (`str`, `Path`, *optional*):
- Path to the folder where assets are cached. This must not be the same folder
- where Hub files are cached. Defaults to `HF_HOME / "assets"` if not provided.
- Can also be set with `HF_ASSETS_CACHE` environment variable.
- Returns:
- Path to the cache folder (`Path`).
- Example:
- ```py
- >>> from huggingface_hub import cached_assets_path
- >>> cached_assets_path(library_name="datasets", namespace="SQuAD", subfolder="download")
- PosixPath('/home/wauplin/.cache/huggingface/extra/datasets/SQuAD/download')
- >>> cached_assets_path(library_name="datasets", namespace="SQuAD", subfolder="extracted")
- PosixPath('/home/wauplin/.cache/huggingface/extra/datasets/SQuAD/extracted')
- >>> cached_assets_path(library_name="datasets", namespace="Helsinki-NLP/tatoeba_mt")
- PosixPath('/home/wauplin/.cache/huggingface/extra/datasets/Helsinki-NLP--tatoeba_mt/default')
- >>> cached_assets_path(library_name="datasets", assets_dir="/tmp/tmp123456")
- PosixPath('/tmp/tmp123456/datasets/default/default')
- ```
- """
- # Resolve assets_dir
- if assets_dir is None:
- assets_dir = HF_ASSETS_CACHE
- assets_dir = Path(assets_dir).expanduser().resolve()
- # Avoid names that could create path issues
- for part in (" ", "/", "\\"):
- library_name = library_name.replace(part, "--")
- namespace = namespace.replace(part, "--")
- subfolder = subfolder.replace(part, "--")
- # Path to subfolder is created
- path = assets_dir / library_name / namespace / subfolder
- try:
- path.mkdir(exist_ok=True, parents=True)
- except (FileExistsError, NotADirectoryError):
- raise ValueError(f"Corrupted assets folder: cannot create directory because of an existing file ({path}).")
- # Return
- return path
|