| 12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364 |
- """Utilities to efficiently compute the SHA 256 hash of a bunch of bytes."""
- from typing import BinaryIO
- from .insecure_hashlib import sha1, sha256
- def sha_fileobj(fileobj: BinaryIO, chunk_size: int | None = None) -> bytes:
- """
- Computes the sha256 hash of the given file object, by chunks of size `chunk_size`.
- Args:
- fileobj (file-like object):
- The File object to compute sha256 for, typically obtained with `open(path, "rb")`
- chunk_size (`int`, *optional*):
- The number of bytes to read from `fileobj` at once, defaults to 1MB.
- Returns:
- `bytes`: `fileobj`'s sha256 hash as bytes
- """
- chunk_size = chunk_size if chunk_size is not None else 1024 * 1024
- sha = sha256()
- while True:
- chunk = fileobj.read(chunk_size)
- sha.update(chunk)
- if not chunk:
- break
- return sha.digest()
def git_hash(data: bytes) -> str:
    """
    Compute the git blob hash (sha1) of some bytes, the same way git itself does.

    The result matches what `git hash-object` prints for a file with this content.
    See https://git-scm.com/docs/git-hash-object for more details.

    Note: this only applies to regular files. For an LFS file, the real git hash is meant to be
    computed on the pointer file content rather than on the file content itself. For simplicity,
    LFS files are instead compared through the sha256 of their content.

    Args:
        data (`bytes`):
            The content to compute the git-hash for.

    Returns:
        `str`: the git-hash of `data` as an hexadecimal string.

    Example:
    ```python
    >>> from huggingface_hub.utils.sha import git_hash
    >>> git_hash(b"Hello, World!")
    'b45ef6fec89518d314f546fd6c3025367b721684'
    ```
    """
    # Taken from https://gist.github.com/msabramo/763200
    # Note: the payload is hashed in one go rather than in chunks, as we're not supposed to
    # hash huge files here (5MB maximum).
    # The hashed stream is "blob <size>", a NUL byte, then the raw content.
    blob_header = f"blob {len(data)}\0".encode()
    hasher = sha1()
    hasher.update(blob_header)
    hasher.update(data)
    return hasher.hexdigest()
|