sha.py 2.1 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364
"""Utilities to compute the SHA-256 hash of a file object and the git SHA-1 hash of raw bytes."""
  2. from typing import BinaryIO
  3. from .insecure_hashlib import sha1, sha256
  4. def sha_fileobj(fileobj: BinaryIO, chunk_size: int | None = None) -> bytes:
  5. """
  6. Computes the sha256 hash of the given file object, by chunks of size `chunk_size`.
  7. Args:
  8. fileobj (file-like object):
  9. The File object to compute sha256 for, typically obtained with `open(path, "rb")`
  10. chunk_size (`int`, *optional*):
  11. The number of bytes to read from `fileobj` at once, defaults to 1MB.
  12. Returns:
  13. `bytes`: `fileobj`'s sha256 hash as bytes
  14. """
  15. chunk_size = chunk_size if chunk_size is not None else 1024 * 1024
  16. sha = sha256()
  17. while True:
  18. chunk = fileobj.read(chunk_size)
  19. sha.update(chunk)
  20. if not chunk:
  21. break
  22. return sha.digest()
  23. def git_hash(data: bytes) -> str:
  24. """
  25. Computes the git-sha1 hash of the given bytes, using the same algorithm as git.
  26. This is equivalent to running `git hash-object`. See https://git-scm.com/docs/git-hash-object
  27. for more details.
  28. Note: this method is valid for regular files. For LFS files, the proper git hash is supposed to be computed on the
  29. pointer file content, not the actual file content. However, for simplicity, we directly compare the sha256 of
  30. the LFS file content when we want to compare LFS files.
  31. Args:
  32. data (`bytes`):
  33. The data to compute the git-hash for.
  34. Returns:
  35. `str`: the git-hash of `data` as an hexadecimal string.
  36. Example:
  37. ```python
  38. >>> from huggingface_hub.utils.sha import git_hash
  39. >>> git_hash(b"Hello, World!")
  40. 'b45ef6fec89518d314f546fd6c3025367b721684'
  41. ```
  42. """
  43. # Taken from https://gist.github.com/msabramo/763200
  44. # Note: no need to optimize by reading the file in chunks as we're not supposed to hash huge files (5MB maximum).
  45. sha = sha1()
  46. sha.update(b"blob ")
  47. sha.update(str(len(data)).encode())
  48. sha.update(b"\0")
  49. sha.update(data)
  50. return sha.hexdigest()