| 1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192939495969798991001011021031041051061071081091101111121131141151161171181191201211221231241251261271281291301311321331341351361371381391401411421431441451461471481491501511521531541551561571581591601611621631641651661671681691701711721731741751761771781791801811821831841851861871881891901911921931941951961971981992002012022032042052062072082092102112122132142152162172182192202212222232242252262272282292302312322332342352362372382392402412422432442452462472482492502512522532542552562572582592602612622632642652662672682692702712722732742752762772782792802812822832842852862872882892902912922932942952962972982993003013023033043053063073083093103113123133143153163173183193203213223233243253263273283293303313323333343353363373383393403413423433443453463473483493503513523533543553563573583593603613623633643653663673683693703713723733743753763773783793803813823833843853863873883893903913923933943953963973983994004014024034044054064074084094104114124134144154164174184194204214224234244254264274284294304314324334344354364374384394404414424434444454464474484494504514524534544554564574584594604614624634644654664674684694704714724734744754764774784794804814824834844854864874884894904914924934944954964974984995005015025035045055065075085095105115125135145155165175185195205215225235245255265275285295305315325335345355365375385395405415425435445455465475485495505515525535545555565575585595605615625635645655665675685695705715725735745755765775785795805815825835845855865875885895905915925935945955965975985996006016026036046056066076086096106116126136146156166176186196206216226236246256266276286296306316326336346356366376386396406416426436446456466476486496506516526536546556566576586596606616626636646656666676686696706716726736746756766776786796806816826836846856866876886896906916926936946956966976986997007017027037047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127712781279128012811282128312841285128612871288128912901291129212931294129512961297129812991300130113021303130413051306130713081309131013111312131313141315131613171318131913201321132213231324132513261327132813291330133113321333133413351336133713381339134013411342134313441345134613471348134913501351135213531354135513561357135813591360136113621363136413651366136713681369137013711372137313741375137613771378137913801381138213831384138513861387138813891390139113921393139413951396139713981399140014011402140314041405140614071408140914101411141214131414141514161417141814191420142114221423142414251426142714281429143014311432143314341435143614371438143914401441144214431444144514461447144814491450145114521453145414551456145714581459146014611462146314641465146614671468146914701471147214731474147514761477147814791480148114821483148414851486148714881489149014911492149314941495149614971498149915001501150215031504150515061507150815091510151115121513151415151516151715181519152015211522152315241525152615271528152915301531153215331534153515361537153815391540154115421543154415451546154715481549155015511552155315541555155615571558155915601561156215631564156515661567156815691570157115721573157415751576157715781579158015811582158315841585158615871588158915901591159215931594159515961597159815991600160116021603160416051606160716081609161016111612161316141615161616171618161916201621162216231624162516261627162816291630163116321633163416351636163716381639164016411642164316441645164616471648164916501651165216531654165516561657165816591660166116621663166416651666166716681669167016711672167316741675167616771678167916801681168216831684168516861687168816891690169116921693169416951696169716981699170017011702170317041705170617071708170917101711171217131714171517161717171817191720172117221723172417251726172717281729173017311732173317341735173617371738173917401741174217431744174517461747174817491750175117521753175417551756175717581759176017611762176317641765176617671768176917701771177217731774177517761777177817791780178117821783178417851786178717881789179017911792179317941795179617971798179918001801180218031804180518061807180818091810181118121813181418151816181718181819182018211822182318241825182618271828182918301831183218331834183518361837183818391840184118421843184418451846184718481849185018511852185318541855185618571858185918601861186218631864186518661867186818691870187118721873187418751876187718781879188018811882188318841885188618871888188918901891189218931894189518961897189818991900190119021903190419051906190719081909191019111912191319141915191619171918191919201921192219231924192519261927192819291930193119321933193419351936 |
- import copy
- import errno
- import os
- import re
- import shutil
- import stat
- import time
- import uuid
- import warnings
- from dataclasses import dataclass
- from pathlib import Path
- from typing import Any, BinaryIO, Literal, NoReturn, overload
- from urllib.parse import quote, urlparse
- import httpx
- from tqdm.auto import tqdm as base_tqdm
- from . import constants
- from ._local_folder import (
- _create_cachedir_tag,
- get_local_download_paths,
- read_download_metadata,
- write_download_metadata,
- )
- from .errors import (
- FileMetadataError,
- GatedRepoError,
- HfHubHTTPError,
- LocalEntryNotFoundError,
- RemoteEntryNotFoundError,
- RepositoryNotFoundError,
- RevisionNotFoundError,
- )
- from .utils import (
- OfflineModeIsEnabled,
- SoftTemporaryDirectory,
- WeakFileLock,
- XetFileData,
- build_hf_headers,
- hf_raise_for_status,
- logging,
- parse_xet_file_data_from_response,
- refresh_xet_connection_info,
- tqdm,
- validate_hf_hub_args,
- )
- from .utils._http import (
- _DEFAULT_RETRY_ON_EXCEPTIONS,
- _DEFAULT_RETRY_ON_STATUS_CODES,
- _adjust_range_header,
- _httpx_follow_relative_redirects_with_backoff,
- http_stream_backoff,
- )
- from .utils._runtime import is_xet_available
- from .utils.sha import sha_fileobj
- from .utils.tqdm import _get_progress_bar_context
- logger = logging.get_logger(__name__)
- # Return value when trying to load a file from cache but the file does not exist in the distant repo.
- _CACHED_NO_EXIST = object()
- _CACHED_NO_EXIST_T = Any
- # Regex to get filename from a "Content-Disposition" header for CDN-served files
- HEADER_FILENAME_PATTERN = re.compile(r'filename="(?P<filename>.*?)";')
- # Regex to check if the revision IS directly a commit_hash
- REGEX_COMMIT_HASH = re.compile(r"^[0-9a-f]{40}$")
- # Regex to check if the file etag IS a valid sha256
- REGEX_SHA256 = re.compile(r"^[0-9a-f]{64}$")
- _are_symlinks_supported_in_dir: dict[str, bool] = {}
- # Internal retry timeout for metadata fetch when no local file exists
- _ETAG_RETRY_TIMEOUT = 60
- def are_symlinks_supported(cache_dir: str | Path | None = None) -> bool:
- """Return whether the symlinks are supported on the machine.
- Since symlinks support can change depending on the mounted disk, we need to check
- on the precise cache folder. By default, the default HF cache directory is checked.
- Args:
- cache_dir (`str`, `Path`, *optional*):
- Path to the folder where cached files are stored.
- Returns: [bool] Whether symlinks are supported in the directory.
- """
- # Defaults to HF cache
- if cache_dir is None:
- cache_dir = constants.HF_HUB_CACHE
- cache_dir = str(Path(cache_dir).expanduser().resolve()) # make it unique
- # If symlinks are explicitly disabled by the user, always return False
- if constants.HF_HUB_DISABLE_SYMLINKS:
- return False
- # Check symlink compatibility only once (per cache directory) at first time use
- if cache_dir not in _are_symlinks_supported_in_dir:
- _are_symlinks_supported_in_dir[cache_dir] = True
- os.makedirs(cache_dir, exist_ok=True)
- with SoftTemporaryDirectory(dir=cache_dir) as tmpdir:
- src_path = Path(tmpdir) / "dummy_file_src"
- src_path.touch()
- dst_path = Path(tmpdir) / "dummy_file_dst"
- # Relative source path as in `_create_symlink``
- relative_src = os.path.relpath(src_path, start=os.path.dirname(dst_path))
- try:
- os.symlink(relative_src, dst_path)
- except OSError:
- # Likely running on Windows
- _are_symlinks_supported_in_dir[cache_dir] = False
- if not constants.HF_HUB_DISABLE_SYMLINKS_WARNING:
- message = (
- "`huggingface_hub` cache-system uses symlinks by default to"
- " efficiently store duplicated files but your machine does not"
- f" support them in {cache_dir}. Caching files will still work"
- " but in a degraded version that might require more space on"
- " your disk. This warning can be disabled by setting the"
- " `HF_HUB_DISABLE_SYMLINKS_WARNING` environment variable. For"
- " more details, see"
- " https://huggingface.co/docs/huggingface_hub/how-to-cache#limitations."
- )
- if os.name == "nt":
- message += (
- "\nTo support symlinks on Windows, you either need to"
- " activate Developer Mode or to run Python as an"
- " administrator. In order to activate developer mode,"
- " see this article:"
- " https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development"
- )
- warnings.warn(message)
- return _are_symlinks_supported_in_dir[cache_dir]
- @dataclass(frozen=True)
- class HfFileMetadata:
- """Data structure containing information about a file versioned on the Hub.
- Returned by [`get_hf_file_metadata`] based on a URL.
- Args:
- commit_hash (`str`, *optional*):
- The commit_hash related to the file.
- etag (`str`, *optional*):
- Etag of the file on the server.
- location (`str`):
- Location where to download the file. Can be a Hub url or not (CDN).
- size (`size`):
- Size of the file. In case of an LFS file, contains the size of the actual
- LFS file, not the pointer.
- xet_file_data (`XetFileData`, *optional*):
- Xet information for the file. This is only set if the file is stored using Xet storage.
- """
- commit_hash: str | None
- etag: str | None
- location: str
- size: int | None
- xet_file_data: XetFileData | None
- @dataclass
- class DryRunFileInfo:
- """Information returned when performing a dry run of a file download.
- Returned by [`hf_hub_download`] when `dry_run=True`.
- Args:
- commit_hash (`str`):
- The commit_hash related to the file.
- file_size (`int`):
- Size of the file. In case of an LFS file, contains the size of the actual LFS file, not the pointer.
- filename (`str`):
- Name of the file in the repo.
- is_cached (`bool`):
- Whether the file is already cached locally.
- will_download (`bool`):
- Whether the file will be downloaded if `hf_hub_download` is called with `dry_run=False`.
- In practice, will_download is `True` if the file is not cached or if `force_download=True`.
- """
- commit_hash: str
- file_size: int
- filename: str
- local_path: str
- is_cached: bool
- will_download: bool
- @validate_hf_hub_args
- def hf_hub_url(
- repo_id: str,
- filename: str,
- *,
- subfolder: str | None = None,
- repo_type: str | None = None,
- revision: str | None = None,
- endpoint: str | None = None,
- ) -> str:
- """Construct the URL of a file from the given information.
- The resolved address can either be a huggingface.co-hosted url, or a link to
- Cloudfront (a Content Delivery Network, or CDN) for large files which are
- more than a few MBs.
- Args:
- repo_id (`str`):
- A namespace (user or an organization) name and a repo name separated
- by a `/`.
- filename (`str`):
- The name of the file in the repo.
- subfolder (`str`, *optional*):
- An optional value corresponding to a folder inside the repo.
- repo_type (`str`, *optional*):
- Set to `"dataset"`, `"space"` or `"kernel"` if downloading from a dataset, space or kernel repo,
- `None` or `"model"` if downloading from a model. Default is `None`.
- revision (`str`, *optional*):
- An optional Git revision id which can be a branch name, a tag, or a
- commit hash.
- Example:
- ```python
- >>> from huggingface_hub import hf_hub_url
- >>> hf_hub_url(
- ... repo_id="julien-c/EsperBERTo-small", filename="pytorch_model.bin"
- ... )
- 'https://huggingface.co/julien-c/EsperBERTo-small/resolve/main/pytorch_model.bin'
- ```
- > [!TIP]
- > Notes:
- >
- > Cloudfront is replicated over the globe so downloads are way faster for
- > the end user (and it also lowers our bandwidth costs).
- >
- > Cloudfront aggressively caches files by default (default TTL is 24
- > hours), however this is not an issue here because we implement a
- > git-based versioning system on huggingface.co, which means that we store
- > the files on S3/Cloudfront in a content-addressable way (i.e., the file
- > name is its hash). Using content-addressable filenames means cache can't
- > ever be stale.
- >
- > In terms of client-side caching from this library, we base our caching
- > on the objects' entity tag (`ETag`), which is an identifier of a
- > specific version of a resource [1]_. An object's ETag is: its git-sha1
- > if stored in git, or its sha256 if stored in git-lfs.
- References:
- - [1] https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/ETag
- """
- if subfolder == "":
- subfolder = None
- if subfolder is not None:
- filename = f"{subfolder}/{filename}"
- if repo_type not in constants.REPO_TYPES_WITH_KERNEL:
- raise ValueError("Invalid repo type")
- if repo_type in constants.REPO_TYPES_URL_PREFIXES:
- repo_id = constants.REPO_TYPES_URL_PREFIXES[repo_type] + repo_id
- if revision is None:
- revision = constants.DEFAULT_REVISION
- url = constants.HUGGINGFACE_CO_URL_TEMPLATE.format(
- repo_id=repo_id, revision=quote(revision, safe=""), filename=quote(filename)
- )
- # Update endpoint if provided
- if endpoint is not None and url.startswith(constants.ENDPOINT):
- url = endpoint + url[len(constants.ENDPOINT) :]
- return url
- def _get_file_length_from_http_response(response: httpx.Response) -> int | None:
- """
- Get the length of the file from the HTTP response headers.
- This function extracts the file size from the HTTP response headers, either from the
- `Content-Range` or `Content-Length` header, if available (in that order).
- Args:
- response (`httpx.Response`):
- The HTTP response object.
- Returns:
- `int` or `None`: The length of the file in bytes, or None if not available.
- """
- # If HTTP response contains compressed body (e.g. gzip), the `Content-Length` header will
- # contain the length of the compressed body, not the uncompressed file size.
- # And at the start of transmission there's no way to know the uncompressed file size for gzip,
- # thus we return None in that case.
- content_encoding = response.headers.get("Content-Encoding", "identity").lower()
- if content_encoding != "identity":
- # gzip/br/deflate/zstd etc
- return None
- content_range = response.headers.get("Content-Range")
- if content_range is not None:
- return int(content_range.rsplit("/")[-1])
- content_length = response.headers.get("Content-Length")
- if content_length is not None:
- return int(content_length)
- return None
- @validate_hf_hub_args
- def http_get(
- url: str,
- temp_file: BinaryIO,
- *,
- resume_size: int = 0,
- headers: dict[str, Any] | None = None,
- expected_size: int | None = None,
- displayed_filename: str | None = None,
- tqdm_class: type[base_tqdm] | None = None,
- _nb_retries: int = 5,
- _tqdm_bar: tqdm | None = None,
- ) -> None:
- """
- Download a remote file. Do not gobble up errors, and will return errors tailored to the Hugging Face Hub.
- If ConnectionError (SSLError) or ReadTimeout happen while streaming data from the server, it is most likely a
- transient error (network outage?). We log a warning message and try to resume the download a few times before
- giving up. The method gives up after 5 attempts if no new data has being received from the server.
- Args:
- url (`str`):
- The URL of the file to download.
- temp_file (`BinaryIO`):
- The file-like object where to save the file.
- resume_size (`int`, *optional*):
- The number of bytes already downloaded. If set to 0 (default), the whole file is download. If set to a
- positive number, the download will resume at the given position.
- headers (`dict`, *optional*):
- Dictionary of HTTP Headers to send with the request.
- expected_size (`int`, *optional*):
- The expected size of the file to download. If set, the download will raise an error if the size of the
- received content is different from the expected one.
- displayed_filename (`str`, *optional*):
- The filename of the file that is being downloaded. Value is used only to display a nice progress bar. If
- not set, the filename is guessed from the URL or the `Content-Disposition` header.
- """
- if expected_size is not None and resume_size == expected_size:
- # If the file is already fully downloaded, we don't need to download it again.
- return
- initial_headers = headers
- headers = copy.deepcopy(headers) or {}
- if resume_size > 0:
- headers["Range"] = _adjust_range_header(headers.get("Range"), resume_size)
- elif expected_size and expected_size > constants.MAX_HTTP_DOWNLOAD_SIZE:
- # Any files over 50GB will not be available through basic http requests.
- raise ValueError(
- "The file is too large to be downloaded using the regular download method. "
- " Install `hf_xet` with `pip install hf_xet` for xet-powered downloads."
- )
- with http_stream_backoff(
- method="GET",
- url=url,
- headers=headers,
- timeout=constants.HF_HUB_DOWNLOAD_TIMEOUT,
- retry_on_exceptions=(),
- retry_on_status_codes=(429,),
- ) as response:
- hf_raise_for_status(response)
- # If we requested a Range but got 200 back, the server ignored our Range header
- # (e.g. CloudFront with Accept-Encoding: gzip). Reset file to avoid corruption.
- if resume_size > 0 and response.status_code == 200:
- temp_file.seek(0)
- temp_file.truncate()
- resume_size = 0
- total: int | None = _get_file_length_from_http_response(response)
- if displayed_filename is None:
- displayed_filename = url
- content_disposition = response.headers.get("Content-Disposition")
- if content_disposition is not None:
- match = HEADER_FILENAME_PATTERN.search(content_disposition)
- if match is not None:
- # Means file is on CDN
- displayed_filename = match.groupdict()["filename"]
- # Truncate filename if too long to display
- if len(displayed_filename) > 40:
- displayed_filename = f"(…){displayed_filename[-40:]}"
- consistency_error_message = (
- f"Consistency check failed: file should be of size {expected_size} but has size"
- f" {{actual_size}} ({displayed_filename}).\nThis is usually due to network issues while downloading the file."
- " Please retry with `force_download=True`."
- )
- progress_cm = _get_progress_bar_context(
- desc=displayed_filename,
- log_level=logger.getEffectiveLevel(),
- total=total,
- initial=resume_size,
- name="huggingface_hub.http_get",
- tqdm_class=tqdm_class,
- _tqdm_bar=_tqdm_bar,
- )
- with progress_cm as progress:
- new_resume_size = resume_size
- try:
- for chunk in response.iter_bytes(chunk_size=constants.DOWNLOAD_CHUNK_SIZE):
- if chunk: # filter out keep-alive new chunks
- progress.update(len(chunk))
- temp_file.write(chunk)
- new_resume_size += len(chunk)
- # Some data has been downloaded from the server so we reset the number of retries.
- _nb_retries = 5
- except (httpx.ConnectError, httpx.TimeoutException) as e:
- # If ConnectionError (SSLError) or ReadTimeout happen while streaming data from the server, it is most likely
- # a transient error (network outage?). We log a warning message and try to resume the download a few times
- # before giving up. Tre retry mechanism is basic but should be enough in most cases.
- if _nb_retries <= 0:
- logger.warning("Error while downloading from %s: %s\nMax retries exceeded.", url, str(e))
- raise
- logger.warning("Error while downloading from %s: %s\nTrying to resume download...", url, str(e))
- time.sleep(1)
- return http_get(
- url=url,
- temp_file=temp_file,
- resume_size=new_resume_size,
- headers=initial_headers,
- expected_size=expected_size,
- tqdm_class=tqdm_class,
- _nb_retries=_nb_retries - 1,
- _tqdm_bar=_tqdm_bar,
- )
- if expected_size is not None and expected_size != temp_file.tell():
- raise OSError(
- consistency_error_message.format(
- actual_size=temp_file.tell(),
- )
- )
- def xet_get(
- *,
- incomplete_path: Path,
- xet_file_data: XetFileData,
- headers: dict[str, str],
- expected_size: int | None = None,
- displayed_filename: str | None = None,
- tqdm_class: type[base_tqdm] | None = None,
- _tqdm_bar: tqdm | None = None,
- ) -> None:
- """
- Download a file using Xet storage service.
- Args:
- incomplete_path (`Path`):
- The path to the file to download.
- xet_file_data (`XetFileData`):
- The file metadata needed to make the request to the xet storage service.
- headers (`dict[str, str]`):
- The headers to send to the xet storage service.
- expected_size (`int`, *optional*):
- The expected size of the file to download. If set, the download will raise an error if the size of the
- received content is different from the expected one.
- displayed_filename (`str`, *optional*):
- The filename of the file that is being downloaded. Value is used only to display a nice progress bar. If
- not set, the filename is guessed from the URL or the `Content-Disposition` header.
- **How it works:**
- The file download system uses Xet storage, which is a content-addressable storage system that breaks files into chunks
- for efficient storage and transfer.
- `hf_xet.download_files` manages downloading files by:
- - Taking a list of files to download (each with its unique content hash)
- - Connecting to a storage server (CAS server) that knows how files are chunked
- - Using authentication to ensure secure access
- - Providing progress updates during download
- Authentication works by regularly refreshing access tokens through `refresh_xet_connection_info` to maintain a valid
- connection to the storage server.
- The download process works like this:
- 1. Create a local cache folder at `~/.cache/huggingface/xet/chunk-cache` to store reusable file chunks
- 2. Download files in parallel:
- 2.1. Prepare to write the file to disk
- 2.2. Ask the server "how is this file split into chunks?" using the file's unique hash
- The server responds with:
- - Which chunks make up the complete file
- - Where each chunk can be downloaded from
- 2.3. For each needed chunk:
- - Checks if we already have it in our local cache
- - If not, download it from cloud storage (S3)
- - Save it to cache for future use
- - Assemble the chunks in order to recreate the original file
- """
- try:
- from hf_xet import PyXetDownloadInfo, download_files # type: ignore[no-redef]
- except ImportError:
- raise ValueError(
- "To use optimized download using Xet storage, you need to install the hf_xet package. "
- 'Try `pip install "huggingface_hub[hf_xet]"` or `pip install hf_xet`.'
- )
- connection_info = refresh_xet_connection_info(file_data=xet_file_data, headers=headers)
- def token_refresher() -> tuple[str, int]:
- connection_info = refresh_xet_connection_info(file_data=xet_file_data, headers=headers)
- if connection_info is None:
- raise ValueError("Failed to refresh token using xet metadata.")
- return connection_info.access_token, connection_info.expiration_unix_epoch
- xet_download_info = [
- PyXetDownloadInfo(
- destination_path=str(incomplete_path.absolute()), hash=xet_file_data.file_hash, file_size=expected_size
- )
- ]
- if not displayed_filename:
- displayed_filename = incomplete_path.name
- # Truncate filename if too long to display
- if len(displayed_filename) > 40:
- displayed_filename = f"{displayed_filename[:40]}(…)"
- progress_cm = _get_progress_bar_context(
- desc=displayed_filename,
- log_level=logger.getEffectiveLevel(),
- total=expected_size,
- initial=0,
- name="huggingface_hub.xet_get",
- tqdm_class=tqdm_class,
- _tqdm_bar=_tqdm_bar,
- )
- xet_headers = headers.copy()
- xet_headers.pop("authorization", None)
- with progress_cm as progress:
- def progress_updater(progress_bytes: float):
- progress.update(progress_bytes)
- download_files(
- xet_download_info,
- endpoint=connection_info.endpoint,
- token_info=(connection_info.access_token, connection_info.expiration_unix_epoch),
- token_refresher=token_refresher,
- progress_updater=[progress_updater],
- request_headers=xet_headers,
- )
- def _normalize_etag(etag: str | None) -> str | None:
- """Normalize ETag HTTP header, so it can be used to create nice filepaths.
- The HTTP spec allows two forms of ETag:
- ETag: W/"<etag_value>"
- ETag: "<etag_value>"
- For now, we only expect the second form from the server, but we want to be future-proof so we support both. For
- more context, see `TestNormalizeEtag` tests and https://github.com/huggingface/huggingface_hub/pull/1428.
- Args:
- etag (`str`, *optional*): HTTP header
- Returns:
- `str` or `None`: string that can be used as a nice directory name.
- Returns `None` if input is None.
- """
- if etag is None:
- return None
- return etag.lstrip("W/").strip('"')
- def _create_relative_symlink(src: str, dst: str, new_blob: bool = False) -> None:
- """Alias method used in `transformers` conversion script."""
- return _create_symlink(src=src, dst=dst, new_blob=new_blob)
- def _create_symlink(src: str, dst: str, new_blob: bool = False) -> None:
- """Create a symbolic link named dst pointing to src.
- By default, it will try to create a symlink using a relative path. Relative paths have 2 advantages:
- - If the cache_folder is moved (example: back-up on a shared drive), relative paths within the cache folder will
- not break.
- - Relative paths seems to be better handled on Windows. Issue was reported 3 times in less than a week when
- changing from relative to absolute paths. See https://github.com/huggingface/huggingface_hub/issues/1398,
- https://github.com/huggingface/diffusers/issues/2729 and https://github.com/huggingface/transformers/pull/22228.
- NOTE: The issue with absolute paths doesn't happen on admin mode.
- When creating a symlink from the cache to a local folder, it is possible that a relative path cannot be created.
- This happens when paths are not on the same volume. In that case, we use absolute paths.
- The result layout looks something like
- └── [ 128] snapshots
- ├── [ 128] 2439f60ef33a0d46d85da5001d52aeda5b00ce9f
- │ ├── [ 52] README.md -> ../../../blobs/d7edf6bd2a681fb0175f7735299831ee1b22b812
- │ └── [ 76] pytorch_model.bin -> ../../../blobs/403450e234d65943a7dcf7e05a771ce3c92faa84dd07db4ac20f592037a1e4bd
- If symlinks cannot be created on this platform (most likely to be Windows), the workaround is to avoid symlinks by
- having the actual file in `dst`. If it is a new file (`new_blob=True`), we move it to `dst`. If it is not a new file
- (`new_blob=False`), we don't know if the blob file is already referenced elsewhere. To avoid breaking existing
- cache, the file is duplicated on the disk.
- In case symlinks are not supported, a warning message is displayed to the user once when loading `huggingface_hub`.
- The warning message can be disabled with the `DISABLE_SYMLINKS_WARNING` environment variable.
- """
- try:
- os.remove(dst)
- except OSError:
- pass
- abs_src = os.path.abspath(os.path.expanduser(src))
- abs_dst = os.path.abspath(os.path.expanduser(dst))
- abs_dst_folder = os.path.dirname(abs_dst)
- # Use relative_dst in priority
- try:
- relative_src = os.path.relpath(abs_src, abs_dst_folder)
- except ValueError:
- # Raised on Windows if src and dst are not on the same volume. This is the case when creating a symlink to a
- # local_dir instead of within the cache directory.
- # See https://docs.python.org/3/library/os.path.html#os.path.relpath
- relative_src = None
- try:
- commonpath = os.path.commonpath([abs_src, abs_dst])
- _support_symlinks = are_symlinks_supported(commonpath)
- except ValueError:
- # Raised if src and dst are not on the same volume. Symlinks will still work on Linux/Macos.
- # See https://docs.python.org/3/library/os.path.html#os.path.commonpath
- _support_symlinks = os.name != "nt" and not constants.HF_HUB_DISABLE_SYMLINKS
- except PermissionError:
- # Permission error means src and dst are not in the same volume (e.g. destination path has been provided
- # by the user via `local_dir`. Let's test symlink support there)
- _support_symlinks = are_symlinks_supported(abs_dst_folder)
- except OSError as e:
- # OS error (errno=30) means that the commonpath is readonly on Linux/MacOS.
- if e.errno == errno.EROFS:
- _support_symlinks = are_symlinks_supported(abs_dst_folder)
- else:
- raise
- # Symlinks are supported => let's create a symlink.
- if _support_symlinks:
- src_rel_or_abs = relative_src or abs_src
- logger.debug(f"Creating pointer from {src_rel_or_abs} to {abs_dst}")
- try:
- os.symlink(src_rel_or_abs, abs_dst)
- return
- except FileExistsError:
- if os.path.islink(abs_dst) and os.path.realpath(abs_dst) == os.path.realpath(abs_src):
- # `abs_dst` already exists and is a symlink to the `abs_src` blob. It is most likely that the file has
- # been cached twice concurrently (exactly between `os.remove` and `os.symlink`). Do nothing.
- return
- else:
- # Very unlikely to happen. Means a file `dst` has been created exactly between `os.remove` and
- # `os.symlink` and is not a symlink to the `abs_src` blob file. Raise exception.
- raise
- except PermissionError:
- # Permission error means src and dst are not in the same volume (e.g. download to local dir) and symlink
- # is supported on both volumes but not between them. Let's just make a hard copy in that case.
- pass
- # Symlinks are not supported => let's move or copy the file.
- if new_blob:
- logger.debug(f"Symlink not supported. Moving file from {abs_src} to {abs_dst}")
- shutil.move(abs_src, abs_dst, copy_function=_copy_no_matter_what)
- else:
- logger.debug(f"Symlink not supported. Copying file from {abs_src} to {abs_dst}")
- shutil.copyfile(abs_src, abs_dst)
- def _cache_commit_hash_for_specific_revision(storage_folder: str, revision: str, commit_hash: str) -> None:
- """Cache reference between a revision (tag, branch or truncated commit hash) and the corresponding commit hash.
- Does nothing if `revision` is already a proper `commit_hash` or reference is already cached.
- """
- if revision != commit_hash:
- ref_path = Path(storage_folder) / "refs" / revision
- ref_path.parent.mkdir(parents=True, exist_ok=True)
- if not ref_path.exists() or commit_hash != ref_path.read_text():
- # Update ref only if has been updated. Could cause useless error in case
- # repo is already cached and user doesn't have write access to cache folder.
- # See https://github.com/huggingface/huggingface_hub/issues/1216.
- ref_path.write_text(commit_hash)
- @validate_hf_hub_args
- def repo_folder_name(*, repo_id: str, repo_type: str) -> str:
- """Return a serialized version of a hf.co repo name and type, safe for disk storage
- as a single non-nested folder.
- Example: models--julien-c--EsperBERTo-small
- """
- # remove all `/` occurrences to correctly convert repo to directory name
- parts = [f"{repo_type}s", *repo_id.split("/")]
- return constants.REPO_ID_SEPARATOR.join(parts)
- def _check_disk_space(expected_size: int, target_dir: str | Path) -> None:
- """Check disk usage and log a warning if there is not enough disk space to download the file.
- Args:
- expected_size (`int`):
- The expected size of the file in bytes.
- target_dir (`str`):
- The directory where the file will be stored after downloading.
- """
- target_dir = Path(target_dir) # format as `Path`
- for path in [target_dir] + list(target_dir.parents): # first check target_dir, then each parents one by one
- try:
- target_dir_free = shutil.disk_usage(path).free
- if target_dir_free < expected_size:
- warnings.warn(
- "Not enough free disk space to download the file. "
- f"The expected file size is: {expected_size / 1e6:.2f} MB. "
- f"The target location {target_dir} only has {target_dir_free / 1e6:.2f} MB free disk space."
- )
- return
- except OSError: # raise on anything: file does not exist or space disk cannot be checked
- pass
- @overload
- def hf_hub_download(
- repo_id: str,
- filename: str,
- *,
- subfolder: str | None = None,
- repo_type: str | None = None,
- revision: str | None = None,
- library_name: str | None = None,
- library_version: str | None = None,
- cache_dir: str | Path | None = None,
- local_dir: str | Path | None = None,
- user_agent: dict | str | None = None,
- force_download: bool = False,
- etag_timeout: float = constants.DEFAULT_ETAG_TIMEOUT,
- token: bool | str | None = None,
- local_files_only: bool = False,
- headers: dict[str, str] | None = None,
- endpoint: str | None = None,
- tqdm_class: type[base_tqdm] | None = None,
- dry_run: Literal[False] = False,
- ) -> str: ...
- @overload
- def hf_hub_download(
- repo_id: str,
- filename: str,
- *,
- subfolder: str | None = None,
- repo_type: str | None = None,
- revision: str | None = None,
- library_name: str | None = None,
- library_version: str | None = None,
- cache_dir: str | Path | None = None,
- local_dir: str | Path | None = None,
- user_agent: dict | str | None = None,
- force_download: bool = False,
- etag_timeout: float = constants.DEFAULT_ETAG_TIMEOUT,
- token: bool | str | None = None,
- local_files_only: bool = False,
- headers: dict[str, str] | None = None,
- endpoint: str | None = None,
- tqdm_class: type[base_tqdm] | None = None,
- dry_run: Literal[True] = True,
- ) -> DryRunFileInfo: ...
- @overload
- def hf_hub_download(
- repo_id: str,
- filename: str,
- *,
- subfolder: str | None = None,
- repo_type: str | None = None,
- revision: str | None = None,
- library_name: str | None = None,
- library_version: str | None = None,
- cache_dir: str | Path | None = None,
- local_dir: str | Path | None = None,
- user_agent: dict | str | None = None,
- force_download: bool = False,
- etag_timeout: float = constants.DEFAULT_ETAG_TIMEOUT,
- token: bool | str | None = None,
- local_files_only: bool = False,
- headers: dict[str, str] | None = None,
- endpoint: str | None = None,
- tqdm_class: type[base_tqdm] | None = None,
- dry_run: bool = False,
- ) -> str | DryRunFileInfo: ...
- @validate_hf_hub_args
- def hf_hub_download(
- repo_id: str,
- filename: str,
- *,
- subfolder: str | None = None,
- repo_type: str | None = None,
- revision: str | None = None,
- library_name: str | None = None,
- library_version: str | None = None,
- cache_dir: str | Path | None = None,
- local_dir: str | Path | None = None,
- user_agent: dict | str | None = None,
- force_download: bool = False,
- etag_timeout: float = constants.DEFAULT_ETAG_TIMEOUT,
- token: bool | str | None = None,
- local_files_only: bool = False,
- headers: dict[str, str] | None = None,
- endpoint: str | None = None,
- tqdm_class: type[base_tqdm] | None = None,
- dry_run: bool = False,
- ) -> str | DryRunFileInfo:
- """Download a given file if it's not already present in the local cache.
- The new cache file layout looks like this:
- - The cache directory contains one subfolder per repo_id (namespaced by repo type)
- - inside each repo folder:
- - refs is a list of the latest known revision => commit_hash pairs
- - blobs contains the actual file blobs (identified by their git-sha or sha256, depending on
- whether they're LFS files or not)
- - snapshots contains one subfolder per commit, each "commit" contains the subset of the files
- that have been resolved at that particular commit. Each filename is a symlink to the blob
- at that particular commit.
- ```
- [ 96] .
- └── [ 160] models--julien-c--EsperBERTo-small
- ├── [ 160] blobs
- │ ├── [321M] 403450e234d65943a7dcf7e05a771ce3c92faa84dd07db4ac20f592037a1e4bd
- │ ├── [ 398] 7cb18dc9bafbfcf74629a4b760af1b160957a83e
- │ └── [1.4K] d7edf6bd2a681fb0175f7735299831ee1b22b812
- ├── [ 96] refs
- │ └── [ 40] main
- └── [ 128] snapshots
- ├── [ 128] 2439f60ef33a0d46d85da5001d52aeda5b00ce9f
- │ ├── [ 52] README.md -> ../../blobs/d7edf6bd2a681fb0175f7735299831ee1b22b812
- │ └── [ 76] pytorch_model.bin -> ../../blobs/403450e234d65943a7dcf7e05a771ce3c92faa84dd07db4ac20f592037a1e4bd
- └── [ 128] bbc77c8132af1cc5cf678da3f1ddf2de43606d48
- ├── [ 52] README.md -> ../../blobs/7cb18dc9bafbfcf74629a4b760af1b160957a83e
- └── [ 76] pytorch_model.bin -> ../../blobs/403450e234d65943a7dcf7e05a771ce3c92faa84dd07db4ac20f592037a1e4bd
- ```
- If `local_dir` is provided, the file structure from the repo will be replicated in this location. When using this
- option, the `cache_dir` will not be used and a `.cache/huggingface/` folder will be created at the root of `local_dir`
- to store some metadata related to the downloaded files. While this mechanism is not as robust as the main
- cache-system, it's optimized for regularly pulling the latest version of a repository.
- Args:
- repo_id (`str`):
- A user or an organization name and a repo name separated by a `/`.
- filename (`str`):
- The name of the file in the repo.
- subfolder (`str`, *optional*):
- An optional value corresponding to a folder inside the model repo.
- repo_type (`str`, *optional*):
- Set to `"dataset"`, `"space"` or `"kernel"` if downloading from a dataset, space or kernel repo,
- `None` or `"model"` if downloading from a model. Default is `None`.
- revision (`str`, *optional*):
- An optional Git revision id which can be a branch name, a tag, or a
- commit hash.
- library_name (`str`, *optional*):
- The name of the library to which the object corresponds.
- library_version (`str`, *optional*):
- The version of the library.
- cache_dir (`str`, `Path`, *optional*):
- Path to the folder where cached files are stored.
- local_dir (`str` or `Path`, *optional*):
- If provided, the downloaded file will be placed under this directory.
- user_agent (`dict`, `str`, *optional*):
- The user-agent info in the form of a dictionary or a string.
- force_download (`bool`, *optional*, defaults to `False`):
- Whether the file should be downloaded even if it already exists in
- the local cache.
- etag_timeout (`float`, *optional*, defaults to `10`):
- When fetching ETag, how many seconds to wait for the server to send
- data before giving up which is passed to `requests.request`.
- token (`str`, `bool`, *optional*):
- A token to be used for the download.
- - If `True`, the token is read from the HuggingFace config
- folder.
- - If a string, it's used as the authentication token.
- local_files_only (`bool`, *optional*, defaults to `False`):
- If `True`, avoid downloading the file and return the path to the
- local cached file if it exists.
- headers (`dict`, *optional*):
- Additional headers to be sent with the request.
- tqdm_class (`tqdm`, *optional*):
- If provided, overwrites the default behavior for the progress bar. Passed
- argument must inherit from `tqdm.auto.tqdm` or at least mimic its behavior.
- Defaults to the custom HF progress bar that can be disabled by setting
- `HF_HUB_DISABLE_PROGRESS_BARS` environment variable.
- dry_run (`bool`, *optional*, defaults to `False`):
- If `True`, perform a dry run without actually downloading the file. Returns a
- [`DryRunFileInfo`] object containing information about what would be downloaded.
- Returns:
- `str` or [`DryRunFileInfo`]:
- - If `dry_run=False`: Local path of file or if networking is off, last version of file cached on disk.
- - If `dry_run=True`: A [`DryRunFileInfo`] object containing download information.
- Raises:
- [`~utils.RepositoryNotFoundError`]
- If the repository to download from cannot be found. This may be because it doesn't exist,
- or because it is set to `private` and you do not have access.
- [`~utils.RevisionNotFoundError`]
- If the revision to download from cannot be found.
- [`~utils.RemoteEntryNotFoundError`]
- If the file to download cannot be found.
- [`~utils.LocalEntryNotFoundError`]
- If network is disabled or unavailable and file is not found in cache.
- [`EnvironmentError`](https://docs.python.org/3/library/exceptions.html#EnvironmentError)
- If `token=True` but the token cannot be found.
- [`OSError`](https://docs.python.org/3/library/exceptions.html#OSError)
- If ETag cannot be determined.
- [`ValueError`](https://docs.python.org/3/library/exceptions.html#ValueError)
- If some parameter value is invalid.
- """
- if constants.HF_HUB_ETAG_TIMEOUT != constants.DEFAULT_ETAG_TIMEOUT:
- # Respect environment variable above user value
- etag_timeout = constants.HF_HUB_ETAG_TIMEOUT
- if cache_dir is None:
- cache_dir = constants.HF_HUB_CACHE
- if revision is None:
- revision = constants.DEFAULT_REVISION
- if isinstance(cache_dir, Path):
- cache_dir = str(cache_dir)
- if isinstance(local_dir, Path):
- local_dir = str(local_dir)
- if subfolder == "":
- subfolder = None
- if subfolder is not None:
- # This is used to create a URL, and not a local path, hence the forward slash.
- filename = f"{subfolder}/{filename}"
- if repo_type is None:
- repo_type = "model"
- if repo_type not in constants.REPO_TYPES_WITH_KERNEL:
- raise ValueError(
- f"Invalid repo type: {repo_type}. Accepted repo types are: {str(constants.REPO_TYPES_WITH_KERNEL)}"
- )
- hf_headers = build_hf_headers(
- token=token,
- library_name=library_name,
- library_version=library_version,
- user_agent=user_agent,
- headers=headers,
- )
- if local_dir is not None:
- return _hf_hub_download_to_local_dir(
- # Destination
- local_dir=local_dir,
- # File info
- repo_id=repo_id,
- repo_type=repo_type,
- filename=filename,
- revision=revision,
- # HTTP info
- endpoint=endpoint,
- etag_timeout=etag_timeout,
- headers=hf_headers,
- token=token,
- # Additional options
- cache_dir=cache_dir,
- force_download=force_download,
- local_files_only=local_files_only,
- tqdm_class=tqdm_class,
- dry_run=dry_run,
- )
- else:
- return _hf_hub_download_to_cache_dir(
- # Destination
- cache_dir=cache_dir,
- # File info
- repo_id=repo_id,
- filename=filename,
- repo_type=repo_type,
- revision=revision,
- # HTTP info
- endpoint=endpoint,
- etag_timeout=etag_timeout,
- headers=hf_headers,
- token=token,
- # Additional options
- local_files_only=local_files_only,
- force_download=force_download,
- tqdm_class=tqdm_class,
- dry_run=dry_run,
- )
- def _hf_hub_download_to_cache_dir(
- *,
- # Destination
- cache_dir: str,
- # File info
- repo_id: str,
- filename: str,
- repo_type: str,
- revision: str,
- # HTTP info
- endpoint: str | None,
- etag_timeout: float,
- headers: dict[str, str],
- token: bool | str | None,
- # Additional options
- local_files_only: bool,
- force_download: bool,
- tqdm_class: type[base_tqdm] | None,
- dry_run: bool,
- ) -> str | DryRunFileInfo:
- """Download a given file to a cache folder, if not already present.
- Method should not be called directly. Please use `hf_hub_download` instead.
- """
- locks_dir = os.path.join(cache_dir, ".locks")
- storage_folder = os.path.join(cache_dir, repo_folder_name(repo_id=repo_id, repo_type=repo_type))
- # cross-platform transcription of filename, to be used as a local file path.
- relative_filename = os.path.join(*filename.split("/"))
- if os.name == "nt":
- if relative_filename.startswith("..\\") or "\\..\\" in relative_filename:
- raise ValueError(
- f"Invalid filename: cannot handle filename '{relative_filename}' on Windows. Please ask the repository"
- " owner to rename this file."
- )
- # if user provides a commit_hash and they already have the file on disk, shortcut everything.
- if REGEX_COMMIT_HASH.match(revision):
- pointer_path = _get_pointer_path(storage_folder, revision, relative_filename)
- if os.path.exists(pointer_path):
- if dry_run:
- return DryRunFileInfo(
- commit_hash=revision,
- file_size=os.path.getsize(pointer_path),
- filename=filename,
- is_cached=True,
- local_path=pointer_path,
- will_download=force_download,
- )
- if not force_download:
- return pointer_path
- # Try to get metadata (etag, commit_hash, url, size) from the server.
- # If we can't, a HEAD request error is returned.
- (url_to_download, etag, commit_hash, expected_size, xet_file_data, head_call_error) = _get_metadata_or_catch_error(
- repo_id=repo_id,
- filename=filename,
- repo_type=repo_type,
- revision=revision,
- endpoint=endpoint,
- etag_timeout=etag_timeout,
- headers=headers,
- token=token,
- local_files_only=local_files_only,
- storage_folder=storage_folder,
- relative_filename=relative_filename,
- )
- # etag can be None for several reasons:
- # 1. we passed local_files_only.
- # 2. we don't have a connection
- # 3. Hub is down (HTTP 500, 503, 504)
- # 4. repo is not found -for example private or gated- and invalid/missing token sent
- # 5. Hub is blocked by a firewall or proxy is not set correctly.
- # => Try to get the last downloaded one from the specified revision.
- #
- # If the specified revision is a commit hash, look inside "snapshots".
- # If the specified revision is a branch or tag, look inside "refs".
- if head_call_error is not None:
- # Couldn't make a HEAD call => let's try to find a local file
- if not force_download:
- commit_hash = None
- if REGEX_COMMIT_HASH.match(revision):
- commit_hash = revision
- else:
- ref_path = os.path.join(storage_folder, "refs", revision)
- if os.path.isfile(ref_path):
- with open(ref_path) as f:
- commit_hash = f.read()
- # Return pointer file if exists
- if commit_hash is not None:
- pointer_path = _get_pointer_path(storage_folder, commit_hash, relative_filename)
- if os.path.exists(pointer_path):
- if dry_run:
- return DryRunFileInfo(
- commit_hash=commit_hash,
- file_size=os.path.getsize(pointer_path),
- filename=filename,
- is_cached=True,
- local_path=pointer_path,
- will_download=force_download,
- )
- if not force_download:
- return pointer_path
- if isinstance(head_call_error, _DEFAULT_RETRY_ON_EXCEPTIONS) or (
- isinstance(head_call_error, HfHubHTTPError)
- and head_call_error.response.status_code in _DEFAULT_RETRY_ON_STATUS_CODES
- ):
- logger.info("No local file found. Retrying..")
- (url_to_download, etag, commit_hash, expected_size, xet_file_data, head_call_error) = (
- _get_metadata_or_catch_error(
- repo_id=repo_id,
- filename=filename,
- repo_type=repo_type,
- revision=revision,
- endpoint=endpoint,
- etag_timeout=_ETAG_RETRY_TIMEOUT,
- headers=headers,
- token=token,
- local_files_only=local_files_only,
- storage_folder=storage_folder,
- relative_filename=relative_filename,
- retry_on_errors=True,
- )
- )
- # If still error, raise
- if head_call_error is not None:
- _raise_on_head_call_error(head_call_error, force_download, local_files_only)
- # From now on, etag, commit_hash, url and size are not None.
- assert etag is not None, "etag must have been retrieved from server"
- assert commit_hash is not None, "commit_hash must have been retrieved from server"
- assert url_to_download is not None, "file location must have been retrieved from server"
- assert expected_size is not None, "expected_size must have been retrieved from server"
- blob_path = os.path.join(storage_folder, "blobs", etag)
- pointer_path = _get_pointer_path(storage_folder, commit_hash, relative_filename)
- if dry_run:
- is_cached = os.path.exists(pointer_path) or os.path.exists(blob_path)
- return DryRunFileInfo(
- commit_hash=commit_hash,
- file_size=expected_size,
- filename=filename,
- is_cached=is_cached,
- local_path=pointer_path,
- will_download=force_download or not is_cached,
- )
- os.makedirs(os.path.dirname(blob_path), exist_ok=True)
- os.makedirs(os.path.dirname(pointer_path), exist_ok=True)
- # Tag cache_dir so backup tools can skip it (CACHEDIR.TAG standard).
- _create_cachedir_tag(Path(cache_dir))
- # if passed revision is not identical to commit_hash
- # then revision has to be a branch name or tag name.
- # In that case store a ref.
- _cache_commit_hash_for_specific_revision(storage_folder, revision, commit_hash)
- # Prevent parallel downloads of the same file with a lock.
- # etag could be duplicated across repos,
- lock_path = os.path.join(locks_dir, repo_folder_name(repo_id=repo_id, repo_type=repo_type), f"{etag}.lock")
- # Some Windows versions do not allow for paths longer than 255 characters.
- # In this case, we must specify it as an extended path by using the "\\?\" prefix.
- if (
- os.name == "nt"
- and len(os.path.abspath(lock_path)) > 255
- and not os.path.abspath(lock_path).startswith("\\\\?\\")
- ):
- lock_path = "\\\\?\\" + os.path.abspath(lock_path)
- if (
- os.name == "nt"
- and len(os.path.abspath(blob_path)) > 255
- and not os.path.abspath(blob_path).startswith("\\\\?\\")
- ):
- blob_path = "\\\\?\\" + os.path.abspath(blob_path)
- Path(lock_path).parent.mkdir(parents=True, exist_ok=True)
- # pointer already exists -> immediate return
- if not force_download and os.path.exists(pointer_path):
- return pointer_path
- # Blob exists but pointer must be (safely) created -> take the lock
- if not force_download and os.path.exists(blob_path):
- with WeakFileLock(lock_path):
- if not os.path.exists(pointer_path):
- _create_symlink(blob_path, pointer_path, new_blob=False)
- return pointer_path
- # Local file doesn't exist or etag isn't a match => retrieve file from remote (or cache)
- with WeakFileLock(lock_path):
- _download_to_tmp_and_move(
- incomplete_path=Path(blob_path + ".incomplete"),
- destination_path=Path(blob_path),
- url_to_download=url_to_download,
- headers=headers,
- expected_size=expected_size,
- filename=filename,
- force_download=force_download,
- etag=etag,
- xet_file_data=xet_file_data,
- tqdm_class=tqdm_class,
- )
- if not os.path.exists(pointer_path):
- _create_symlink(blob_path, pointer_path, new_blob=True)
- return pointer_path
- def _hf_hub_download_to_local_dir(
- *,
- # Destination
- local_dir: str | Path,
- # File info
- repo_id: str,
- repo_type: str,
- filename: str,
- revision: str,
- # HTTP info
- endpoint: str | None,
- etag_timeout: float,
- headers: dict[str, str],
- token: bool | str | None,
- # Additional options
- cache_dir: str,
- force_download: bool,
- local_files_only: bool,
- tqdm_class: type[base_tqdm] | None,
- dry_run: bool,
- ) -> str | DryRunFileInfo:
- """Download a given file to a local folder, if not already present.
- Method should not be called directly. Please use `hf_hub_download` instead.
- """
- # Some Windows versions do not allow for paths longer than 255 characters.
- # In this case, we must specify it as an extended path by using the "\\?\" prefix.
- if os.name == "nt" and len(os.path.abspath(local_dir)) > 255:
- local_dir = "\\\\?\\" + os.path.abspath(local_dir)
- local_dir = Path(local_dir)
- paths = get_local_download_paths(local_dir=local_dir, filename=filename)
- local_metadata = read_download_metadata(local_dir=local_dir, filename=filename)
- # Local file exists + metadata exists + commit_hash matches => return file
- if (
- REGEX_COMMIT_HASH.match(revision)
- and paths.file_path.is_file()
- and local_metadata is not None
- and local_metadata.commit_hash == revision
- ):
- local_file = str(paths.file_path)
- if dry_run:
- return DryRunFileInfo(
- commit_hash=revision,
- file_size=os.path.getsize(local_file),
- filename=filename,
- is_cached=True,
- local_path=local_file,
- will_download=force_download,
- )
- if not force_download:
- return local_file
- # Local file doesn't exist or commit_hash doesn't match => we need the etag
- (url_to_download, etag, commit_hash, expected_size, xet_file_data, head_call_error) = _get_metadata_or_catch_error(
- repo_id=repo_id,
- filename=filename,
- repo_type=repo_type,
- revision=revision,
- endpoint=endpoint,
- etag_timeout=etag_timeout,
- headers=headers,
- token=token,
- local_files_only=local_files_only,
- )
- if head_call_error is not None:
- # No HEAD call but local file exists => default to local file
- if paths.file_path.is_file():
- if dry_run or not force_download:
- logger.warning(
- f"Couldn't access the Hub to check for update but local file already exists. Defaulting to existing file. (error: {head_call_error})"
- )
- local_path = str(paths.file_path)
- if dry_run and local_metadata is not None:
- return DryRunFileInfo(
- commit_hash=local_metadata.commit_hash,
- file_size=os.path.getsize(local_path),
- filename=filename,
- is_cached=True,
- local_path=local_path,
- will_download=force_download,
- )
- if not force_download:
- return local_path
- elif not force_download:
- if isinstance(head_call_error, _DEFAULT_RETRY_ON_EXCEPTIONS) or (
- isinstance(head_call_error, HfHubHTTPError)
- and head_call_error.response.status_code in _DEFAULT_RETRY_ON_STATUS_CODES
- ):
- logger.info("No local file found. Retrying..")
- (url_to_download, etag, commit_hash, expected_size, xet_file_data, head_call_error) = (
- _get_metadata_or_catch_error(
- repo_id=repo_id,
- filename=filename,
- repo_type=repo_type,
- revision=revision,
- endpoint=endpoint,
- etag_timeout=_ETAG_RETRY_TIMEOUT,
- headers=headers,
- token=token,
- local_files_only=local_files_only,
- retry_on_errors=True,
- )
- )
- # If still error, raise
- if head_call_error is not None:
- _raise_on_head_call_error(head_call_error, force_download, local_files_only)
- # From now on, etag, commit_hash, url and size are not None.
- assert etag is not None, "etag must have been retrieved from server"
- assert commit_hash is not None, "commit_hash must have been retrieved from server"
- assert url_to_download is not None, "file location must have been retrieved from server"
- assert expected_size is not None, "expected_size must have been retrieved from server"
- # Local file exists => check if it's up-to-date
- if not force_download and paths.file_path.is_file():
- # etag matches => update metadata and return file
- if local_metadata is not None and local_metadata.etag == etag:
- write_download_metadata(local_dir=local_dir, filename=filename, commit_hash=commit_hash, etag=etag)
- if dry_run:
- return DryRunFileInfo(
- commit_hash=commit_hash,
- file_size=expected_size,
- filename=filename,
- is_cached=True,
- local_path=str(paths.file_path),
- will_download=False,
- )
- return str(paths.file_path)
- # metadata is outdated + etag is a sha256
- # => means it's an LFS file (large)
- # => let's compute local hash and compare
- # => if match, update metadata and return file
- if local_metadata is None and REGEX_SHA256.match(etag) is not None:
- with open(paths.file_path, "rb") as f:
- file_hash = sha_fileobj(f).hex()
- if file_hash == etag:
- write_download_metadata(local_dir=local_dir, filename=filename, commit_hash=commit_hash, etag=etag)
- if dry_run:
- return DryRunFileInfo(
- commit_hash=commit_hash,
- file_size=expected_size,
- filename=filename,
- is_cached=True,
- local_path=str(paths.file_path),
- will_download=False,
- )
- return str(paths.file_path)
- # Local file doesn't exist or etag isn't a match => retrieve file from remote (or cache)
- # If we are lucky enough, the file is already in the cache => copy it
- if not force_download:
- cached_path = try_to_load_from_cache(
- repo_id=repo_id,
- filename=filename,
- cache_dir=cache_dir,
- revision=commit_hash,
- repo_type=repo_type,
- )
- if isinstance(cached_path, str):
- with WeakFileLock(paths.lock_path):
- paths.file_path.parent.mkdir(parents=True, exist_ok=True)
- shutil.copyfile(cached_path, paths.file_path)
- write_download_metadata(local_dir=local_dir, filename=filename, commit_hash=commit_hash, etag=etag)
- if dry_run:
- return DryRunFileInfo(
- commit_hash=commit_hash,
- file_size=expected_size,
- filename=filename,
- is_cached=True,
- local_path=str(paths.file_path),
- will_download=False,
- )
- return str(paths.file_path)
- if dry_run:
- is_cached = paths.file_path.is_file()
- return DryRunFileInfo(
- commit_hash=commit_hash,
- file_size=expected_size,
- filename=filename,
- is_cached=is_cached,
- local_path=str(paths.file_path),
- will_download=force_download or not is_cached,
- )
- # Otherwise, let's download the file!
- with WeakFileLock(paths.lock_path):
- paths.file_path.unlink(missing_ok=True) # delete outdated file first
- _download_to_tmp_and_move(
- incomplete_path=paths.incomplete_path(etag),
- destination_path=paths.file_path,
- url_to_download=url_to_download,
- headers=headers,
- expected_size=expected_size,
- filename=filename,
- force_download=force_download,
- etag=etag,
- xet_file_data=xet_file_data,
- tqdm_class=tqdm_class,
- )
- write_download_metadata(local_dir=local_dir, filename=filename, commit_hash=commit_hash, etag=etag)
- return str(paths.file_path)
- @validate_hf_hub_args
- def try_to_load_from_cache(
- repo_id: str,
- filename: str,
- cache_dir: str | Path | None = None,
- revision: str | None = None,
- repo_type: str | None = None,
- ) -> str | _CACHED_NO_EXIST_T | None:
- """
- Explores the cache to return the latest cached file for a given revision if found.
- This function will not raise any exception if the file in not cached.
- Args:
- cache_dir (`str` or `os.PathLike`):
- The folder where the cached files lie.
- repo_id (`str`):
- The ID of the repo on huggingface.co.
- filename (`str`):
- The filename to look for inside `repo_id`.
- revision (`str`, *optional*):
- The specific model version to use. Will default to `"main"` if it's not provided and no `commit_hash` is
- provided either.
- repo_type (`str`, *optional*):
- The type of the repository. Will default to `"model"`.
- Returns:
- `Optional[str]` or `_CACHED_NO_EXIST`:
- Will return `None` if the file was not cached. Otherwise:
- - The exact path to the cached file if it's found in the cache
- - A special value `_CACHED_NO_EXIST` if the file does not exist at the given commit hash and this fact was
- cached.
- Example:
- ```python
- from huggingface_hub import try_to_load_from_cache, _CACHED_NO_EXIST
- filepath = try_to_load_from_cache()
- if isinstance(filepath, str):
- # file exists and is cached
- ...
- elif filepath is _CACHED_NO_EXIST:
- # non-existence of file is cached
- ...
- else:
- # file is not cached
- ...
- ```
- """
- if revision is None:
- revision = "main"
- if repo_type is None:
- repo_type = "model"
- if repo_type not in constants.REPO_TYPES_WITH_KERNEL:
- raise ValueError(
- f"Invalid repo type: {repo_type}. Accepted repo types are: {str(constants.REPO_TYPES_WITH_KERNEL)}"
- )
- if cache_dir is None:
- cache_dir = constants.HF_HUB_CACHE
- repo_cache = os.path.join(cache_dir, repo_folder_name(repo_id=repo_id, repo_type=repo_type))
- if not os.path.isdir(repo_cache):
- # No cache for this model
- return None
- refs_dir = os.path.join(repo_cache, "refs")
- snapshots_dir = os.path.join(repo_cache, "snapshots")
- no_exist_dir = os.path.join(repo_cache, ".no_exist")
- # Resolve refs (for instance to convert main to the associated commit sha)
- if os.path.isdir(refs_dir):
- revision_file = os.path.join(refs_dir, revision)
- if os.path.isfile(revision_file):
- with open(revision_file) as f:
- revision = f.read()
- # Check if file is cached as "no_exist"
- if os.path.isfile(os.path.join(no_exist_dir, revision, filename)):
- return _CACHED_NO_EXIST
- # Check if revision folder exists
- if not os.path.exists(snapshots_dir):
- return None
- cached_shas = os.listdir(snapshots_dir)
- if revision not in cached_shas:
- # No cache for this revision and we won't try to return a random revision
- return None
- # Check if file exists in cache
- cached_file = os.path.join(snapshots_dir, revision, filename)
- return cached_file if os.path.isfile(cached_file) else None
- @validate_hf_hub_args
- def get_hf_file_metadata(
- url: str,
- token: bool | str | None = None,
- timeout: float | None = constants.HF_HUB_ETAG_TIMEOUT,
- library_name: str | None = None,
- library_version: str | None = None,
- user_agent: dict | str | None = None,
- headers: dict[str, str] | None = None,
- endpoint: str | None = None,
- retry_on_errors: bool = False,
- ) -> HfFileMetadata:
- """Fetch metadata of a file versioned on the Hub for a given url.
- Args:
- url (`str`):
- File url, for example returned by [`hf_hub_url`].
- token (`str` or `bool`, *optional*):
- A token to be used for the download.
- - If `True`, the token is read from the HuggingFace config
- folder.
- - If `False` or `None`, no token is provided.
- - If a string, it's used as the authentication token.
- timeout (`float`, *optional*, defaults to 10):
- How many seconds to wait for the server to send metadata before giving up.
- library_name (`str`, *optional*):
- The name of the library to which the object corresponds.
- library_version (`str`, *optional*):
- The version of the library.
- user_agent (`dict`, `str`, *optional*):
- The user-agent info in the form of a dictionary or a string.
- headers (`dict`, *optional*):
- Additional headers to be sent with the request.
- endpoint (`str`, *optional*):
- Endpoint of the Hub. Defaults to <https://huggingface.co>.
- retry_on_errors (`bool`, *optional*, defaults to `False`):
- Whether to retry on errors (429, 5xx, timeout, network errors).
- If False, no retry for fast fallback to local cache.
- Returns:
- A [`HfFileMetadata`] object containing metadata such as location, etag, size and
- commit_hash.
- """
- hf_headers = build_hf_headers(
- token=token,
- library_name=library_name,
- library_version=library_version,
- user_agent=user_agent,
- headers=headers,
- )
- hf_headers["Accept-Encoding"] = "identity" # prevent any compression => we want to know the real size of the file
- # Retrieve metadata
- response = _httpx_follow_relative_redirects_with_backoff(
- method="HEAD", url=url, headers=hf_headers, timeout=timeout, retry_on_errors=retry_on_errors
- )
- hf_raise_for_status(response)
- # Return
- return HfFileMetadata(
- commit_hash=response.headers.get(constants.HUGGINGFACE_HEADER_X_REPO_COMMIT),
- # We favor a custom header indicating the etag of the linked resource, and we fall back to the regular etag header.
- etag=_normalize_etag(
- response.headers.get(constants.HUGGINGFACE_HEADER_X_LINKED_ETAG) or response.headers.get("ETag")
- ),
- # Either from response headers (if redirected) or defaults to request url
- # Do not use directly `url` as we might have followed relative redirects.
- location=response.headers.get("Location") or str(response.request.url), # type: ignore
- size=_int_or_none(
- response.headers.get(constants.HUGGINGFACE_HEADER_X_LINKED_SIZE) or response.headers.get("Content-Length")
- ),
- xet_file_data=parse_xet_file_data_from_response(response, endpoint=endpoint), # type: ignore
- )
- def _get_metadata_or_catch_error(
- *,
- repo_id: str,
- filename: str,
- repo_type: str,
- revision: str,
- endpoint: str | None,
- etag_timeout: float | None,
- headers: dict[str, str], # mutated inplace!
- token: bool | str | None,
- local_files_only: bool,
- relative_filename: str | None = None, # only used to store `.no_exists` in cache
- storage_folder: str | None = None, # only used to store `.no_exists` in cache
- retry_on_errors: bool = False,
- ) -> (
- # Either an exception is caught and returned
- tuple[None, None, None, None, None, Exception]
- |
- # Or the metadata is returned as
- # `(url_to_download, etag, commit_hash, expected_size, xet_file_data, None)`
- tuple[str, str, str, int, XetFileData | None, None]
- ):
- """Get metadata for a file on the Hub, safely handling network issues.
- Returns either the etag, commit_hash and expected size of the file, or the error
- raised while fetching the metadata.
- NOTE: This function mutates `headers` inplace! It removes the `authorization` header
- if the file is a LFS blob and the domain of the url is different from the
- domain of the location (typically an S3 bucket).
- """
- if local_files_only:
- return (
- None,
- None,
- None,
- None,
- None,
- OfflineModeIsEnabled(
- f"Cannot access file since 'local_files_only=True' as been set. (repo_id: {repo_id}, repo_type: {repo_type}, revision: {revision}, filename: {filename})"
- ),
- )
- url = hf_hub_url(repo_id, filename, repo_type=repo_type, revision=revision, endpoint=endpoint)
- url_to_download: str = url
- etag: str | None = None
- commit_hash: str | None = None
- expected_size: int | None = None
- head_error_call: Exception | None = None
- xet_file_data: XetFileData | None = None
- # Try to get metadata from the server.
- # Do not raise yet if the file is not found or not accessible.
- if not local_files_only:
- try:
- try:
- metadata = get_hf_file_metadata(
- url=url,
- timeout=etag_timeout,
- headers=headers,
- token=token,
- endpoint=endpoint,
- retry_on_errors=retry_on_errors,
- )
- except RemoteEntryNotFoundError as http_error:
- if storage_folder is not None and relative_filename is not None:
- # Cache the non-existence of the file
- commit_hash = http_error.response.headers.get(constants.HUGGINGFACE_HEADER_X_REPO_COMMIT)
- if commit_hash is not None:
- no_exist_file_path = Path(storage_folder) / ".no_exist" / commit_hash / relative_filename
- try:
- no_exist_file_path.parent.mkdir(parents=True, exist_ok=True)
- no_exist_file_path.touch()
- except OSError as e:
- logger.error(
- f"Could not cache non-existence of file. Will ignore error and continue. Error: {e}"
- )
- _cache_commit_hash_for_specific_revision(storage_folder, revision, commit_hash)
- raise
- # Commit hash must exist
- commit_hash = metadata.commit_hash
- if commit_hash is None:
- raise FileMetadataError(
- "Distant resource does not seem to be on huggingface.co. It is possible that a configuration issue"
- " prevents you from downloading resources from https://huggingface.co. Please check your firewall"
- " and proxy settings and make sure your SSL certificates are updated."
- )
- # Etag must exist
- # If we don't have any of those, raise an error.
- etag = metadata.etag
- if etag is None:
- raise FileMetadataError(
- "Distant resource does not have an ETag, we won't be able to reliably ensure reproducibility."
- )
- # Size must exist
- expected_size = metadata.size
- if expected_size is None:
- raise FileMetadataError("Distant resource does not have a Content-Length.")
- xet_file_data = metadata.xet_file_data
- # In case of a redirect, save an extra redirect on the request.get call,
- # and ensure we download the exact atomic version even if it changed
- # between the HEAD and the GET (unlikely, but hey).
- #
- # If url domain is different => we are downloading from a CDN => url is signed => don't send auth
- # If url domain is the same => redirect due to repo rename AND downloading a regular file => keep auth
- if xet_file_data is None and url != metadata.location:
- url_to_download = metadata.location
- if urlparse(url).netloc != urlparse(metadata.location).netloc:
- # Remove authorization header when downloading a LFS blob
- headers.pop("authorization", None)
- except httpx.ProxyError:
- # Actually raise on proxy error
- raise
- except (httpx.ConnectError, httpx.TimeoutException, OfflineModeIsEnabled) as error:
- # Otherwise, our Internet connection is down.
- # etag is None
- head_error_call = error
- except (RevisionNotFoundError, RemoteEntryNotFoundError):
- # The repo was found but the revision or entry doesn't exist on the Hub (never existed or got deleted)
- raise
- except HfHubHTTPError as error:
- # Multiple reasons for an http error:
- # - Repository is private and invalid/missing token sent
- # - Repository is gated and invalid/missing token sent
- # - Hub is down (error 500 or 504)
- # => let's switch to 'local_files_only=True' to check if the files are already cached.
- # (if it's not the case, the error will be re-raised)
- head_error_call = error
- except FileMetadataError as error:
- # Multiple reasons for a FileMetadataError:
- # - Wrong network configuration (proxy, firewall, SSL certificates)
- # - Inconsistency on the Hub
- # => let's switch to 'local_files_only=True' to check if the files are already cached.
- # (if it's not the case, the error will be re-raised)
- head_error_call = error
- if not (local_files_only or etag is not None or head_error_call is not None):
- raise RuntimeError("etag is empty due to uncovered problems")
- return (url_to_download, etag, commit_hash, expected_size, xet_file_data, head_error_call) # type: ignore
- def _raise_on_head_call_error(head_call_error: Exception, force_download: bool, local_files_only: bool) -> NoReturn:
- """Raise an appropriate error when the HEAD call failed and we cannot locate a local file."""
- # No head call => we cannot force download.
- if force_download:
- if local_files_only:
- raise ValueError("Cannot pass 'force_download=True' and 'local_files_only=True' at the same time.")
- elif isinstance(head_call_error, OfflineModeIsEnabled):
- raise ValueError("Cannot pass 'force_download=True' when offline mode is enabled.") from head_call_error
- else:
- raise ValueError("Force download failed due to the above error.") from head_call_error
- # No head call + couldn't find an appropriate file on disk => raise an error.
- if local_files_only:
- raise LocalEntryNotFoundError(
- "Cannot find the requested files in the disk cache and outgoing traffic has been disabled. To enable"
- " hf.co look-ups and downloads online, set 'local_files_only' to False."
- )
- elif isinstance(head_call_error, (RepositoryNotFoundError, GatedRepoError)) or (
- isinstance(head_call_error, HfHubHTTPError) and head_call_error.response.status_code == 401
- ):
- # Repo not found or gated => let's raise the actual error
- # Unauthorized => likely a token issue => let's raise the actual error
- raise head_call_error
- else:
- # Otherwise: most likely a connection issue or Hub downtime => let's warn the user
- raise LocalEntryNotFoundError(
- "An error happened while trying to locate the file on the Hub and we cannot find the requested files"
- " in the local cache. Please check your connection and try again or make sure your Internet connection"
- " is on."
- ) from head_call_error
- def _download_to_tmp_and_move(
- incomplete_path: Path,
- destination_path: Path,
- url_to_download: str,
- headers: dict[str, str],
- expected_size: int | None,
- filename: str,
- force_download: bool,
- etag: str | None,
- xet_file_data: XetFileData | None,
- tqdm_class: type[base_tqdm] | None = None,
- ) -> None:
- """Download content from a URL to a destination path.
- Internal logic:
- - return early if file is already downloaded
- - resume download if possible (from incomplete file)
- - do not resume download if `force_download=True`
- - check disk space before downloading
- - download content to a temporary file
- - set correct permissions on temporary file
- - move the temporary file to the destination path
- Both `incomplete_path` and `destination_path` must be on the same volume to avoid a local copy.
- """
- if destination_path.exists() and not force_download:
- # Do nothing if already exists (except if force_download=True)
- return
- if incomplete_path.exists() and force_download:
- # By default, we will try to resume the download if possible.
- # However, if the user has set `force_download=True`, then we should
- # not resume the download => delete the incomplete file.
- logger.debug(f"Removing incomplete file '{incomplete_path}' (force_download=True)")
- incomplete_path.unlink(missing_ok=True)
- with incomplete_path.open("ab") as f:
- resume_size = f.tell()
- message = f"Downloading '{filename}' to '{incomplete_path}'"
- if resume_size > 0 and expected_size is not None:
- message += f" (resume from {resume_size}/{expected_size})"
- logger.debug(message)
- if expected_size is not None: # might be None if HTTP header not set correctly
- # Check disk space in both tmp and destination path
- _check_disk_space(expected_size, incomplete_path.parent)
- _check_disk_space(expected_size, destination_path.parent)
- if xet_file_data is not None and is_xet_available():
- logger.debug("Xet Storage is enabled for this repo. Downloading file from Xet Storage..")
- xet_get(
- incomplete_path=incomplete_path,
- xet_file_data=xet_file_data,
- headers=headers,
- expected_size=expected_size,
- displayed_filename=filename,
- tqdm_class=tqdm_class,
- )
- else:
- if xet_file_data is not None and not constants.HF_HUB_DISABLE_XET:
- logger.warning(
- "Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. "
- "Falling back to regular HTTP download. "
- "For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`"
- )
- http_get(
- url_to_download,
- f,
- resume_size=resume_size,
- headers=headers,
- expected_size=expected_size,
- tqdm_class=tqdm_class,
- )
- logger.debug(f"Download complete. Moving file to {destination_path}")
- _chmod_and_move(incomplete_path, destination_path)
- def _int_or_none(value: str | None) -> int | None:
- try:
- return int(value) # type: ignore
- except (TypeError, ValueError):
- return None
- def _chmod_and_move(src: Path, dst: Path) -> None:
- """Set correct permission before moving a blob from tmp directory to cache dir.
- Do not take into account the `umask` from the process as there is no convenient way
- to get it that is thread-safe.
- See:
- - About umask: https://docs.python.org/3/library/os.html#os.umask
- - Thread-safety: https://stackoverflow.com/a/70343066
- - About solution: https://github.com/huggingface/huggingface_hub/pull/1220#issuecomment-1326211591
- - Fix issue: https://github.com/huggingface/huggingface_hub/issues/1141
- - Fix issue: https://github.com/huggingface/huggingface_hub/issues/1215
- """
- # Get umask by creating a temporary file in the cached repo folder.
- tmp_file = dst.parent.parent / f"tmp_{uuid.uuid4()}"
- try:
- tmp_file.touch()
- cache_dir_mode = Path(tmp_file).stat().st_mode
- os.chmod(str(src), stat.S_IMODE(cache_dir_mode))
- except OSError as e:
- logger.warning(
- f"Could not set the permissions on the file '{src}'. Error: {e}.\nContinuing without setting permissions."
- )
- finally:
- try:
- tmp_file.unlink()
- except OSError:
- # fails if `tmp_file.touch()` failed => do nothing
- # See https://github.com/huggingface/huggingface_hub/issues/2359
- pass
- shutil.move(str(src), str(dst), copy_function=_copy_no_matter_what)
- def _copy_no_matter_what(src: str, dst: str) -> None:
- """Copy file from src to dst.
- If `shutil.copy2` fails, fallback to `shutil.copyfile`.
- """
- try:
- # Copy file with metadata and permission
- # Can fail e.g. if dst is an S3 mount
- shutil.copy2(src, dst)
- except OSError:
- # Copy only file content
- shutil.copyfile(src, dst)
- def _get_pointer_path(storage_folder: str, revision: str, relative_filename: str) -> str:
- # Using `os.path.abspath` instead of `Path.resolve()` to avoid resolving symlinks
- snapshot_path = os.path.join(storage_folder, "snapshots")
- pointer_path = os.path.join(snapshot_path, revision, relative_filename)
- if Path(os.path.abspath(snapshot_path)) not in Path(os.path.abspath(pointer_path)).parents:
- raise ValueError(
- "Invalid pointer path: cannot create pointer path in snapshot folder if"
- f" `storage_folder='{storage_folder}'`, `revision='{revision}'` and"
- f" `relative_filename='{relative_filename}'`."
- )
- return pointer_path
|