numpy.py 4.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187
  1. import os
  2. import sys
  3. from typing import Dict, Optional, Union
  4. import numpy as np
  5. from safetensors import deserialize, safe_open, serialize, serialize_file
  6. def _tobytes(tensor: np.ndarray) -> bytes:
  7. if not _is_little_endian(tensor):
  8. tensor = tensor.byteswap(inplace=False)
  9. return tensor.tobytes()
  10. def save(
  11. tensor_dict: Dict[str, np.ndarray], metadata: Optional[Dict[str, str]] = None
  12. ) -> bytes:
  13. """
  14. Saves a dictionary of tensors into raw bytes in safetensors format.
  15. Args:
  16. tensor_dict (`Dict[str, np.ndarray]`):
  17. The incoming tensors. Tensors need to be contiguous and dense.
  18. metadata (`Dict[str, str]`, *optional*, defaults to `None`):
  19. Optional text only metadata you might want to save in your header.
  20. For instance it can be useful to specify more about the underlying
  21. tensors. This is purely informative and does not affect tensor loading.
  22. Returns:
  23. `bytes`: The raw bytes representing the format
  24. Example:
  25. ```python
  26. from safetensors.numpy import save
  27. import numpy as np
  28. tensors = {"embedding": np.zeros((512, 1024)), "attention": np.zeros((256, 256))}
  29. byte_data = save(tensors)
  30. ```
  31. """
  32. flattened = {
  33. k: {"dtype": v.dtype.name, "shape": v.shape, "data": _tobytes(v)}
  34. for k, v in tensor_dict.items()
  35. }
  36. serialized = serialize(flattened, metadata=metadata)
  37. result = bytes(serialized)
  38. return result
  39. def save_file(
  40. tensor_dict: Dict[str, np.ndarray],
  41. filename: Union[str, os.PathLike],
  42. metadata: Optional[Dict[str, str]] = None,
  43. ) -> None:
  44. """
  45. Saves a dictionary of tensors into raw bytes in safetensors format.
  46. Args:
  47. tensor_dict (`Dict[str, np.ndarray]`):
  48. The incoming tensors. Tensors need to be contiguous and dense.
  49. filename (`str`, or `os.PathLike`)):
  50. The filename we're saving into.
  51. metadata (`Dict[str, str]`, *optional*, defaults to `None`):
  52. Optional text only metadata you might want to save in your header.
  53. For instance it can be useful to specify more about the underlying
  54. tensors. This is purely informative and does not affect tensor loading.
  55. Returns:
  56. `None`
  57. Example:
  58. ```python
  59. from safetensors.numpy import save_file
  60. import numpy as np
  61. tensors = {"embedding": np.zeros((512, 1024)), "attention": np.zeros((256, 256))}
  62. save_file(tensors, "model.safetensors")
  63. ```
  64. """
  65. flattened = {
  66. k: {"dtype": v.dtype.name, "shape": v.shape, "data": _tobytes(v)}
  67. for k, v in tensor_dict.items()
  68. }
  69. serialize_file(flattened, filename, metadata=metadata)
  70. def load(data: bytes) -> Dict[str, np.ndarray]:
  71. """
  72. Loads a safetensors file into numpy format from pure bytes.
  73. Args:
  74. data (`bytes`):
  75. The content of a safetensors file
  76. Returns:
  77. `Dict[str, np.ndarray]`: dictionary that contains name as key, value as `np.ndarray` on cpu
  78. Example:
  79. ```python
  80. from safetensors.numpy import load
  81. file_path = "./my_folder/bert.safetensors"
  82. with open(file_path, "rb") as f:
  83. data = f.read()
  84. loaded = load(data)
  85. ```
  86. """
  87. flat = deserialize(data)
  88. return _view2np(flat)
  89. def load_file(filename: Union[str, os.PathLike]) -> Dict[str, np.ndarray]:
  90. """
  91. Loads a safetensors file into numpy format.
  92. Args:
  93. filename (`str`, or `os.PathLike`)):
  94. The name of the file which contains the tensors
  95. Returns:
  96. `Dict[str, np.ndarray]`: dictionary that contains name as key, value as `np.ndarray`
  97. Example:
  98. ```python
  99. from safetensors.numpy import load_file
  100. file_path = "./my_folder/bert.safetensors"
  101. loaded = load_file(file_path)
  102. ```
  103. """
  104. result = {}
  105. with safe_open(filename, framework="np") as f:
  106. for k in f.offset_keys():
  107. result[k] = f.get_tensor(k)
  108. return result
  109. _TYPES = {
  110. "F64": np.float64,
  111. "F32": np.float32,
  112. "F16": np.float16,
  113. "I64": np.int64,
  114. "U64": np.uint64,
  115. "I32": np.int32,
  116. "U32": np.uint32,
  117. "I16": np.int16,
  118. "U16": np.uint16,
  119. "I8": np.int8,
  120. "U8": np.uint8,
  121. "BOOL": bool,
  122. "C64": np.complex64,
  123. }
  124. def _getdtype(dtype_str: str) -> np.dtype:
  125. return _TYPES[dtype_str]
  126. def _view2np(safeview) -> Dict[str, np.ndarray]:
  127. result = {}
  128. for k, v in safeview:
  129. dtype = _getdtype(v["dtype"])
  130. arr = np.frombuffer(v["data"], dtype=dtype).reshape(v["shape"])
  131. result[k] = arr
  132. return result
  133. def _is_little_endian(tensor: np.ndarray) -> bool:
  134. byteorder = tensor.dtype.byteorder
  135. if byteorder == "=":
  136. if sys.byteorder == "little":
  137. return True
  138. else:
  139. return False
  140. elif byteorder == "|":
  141. return True
  142. elif byteorder == "<":
  143. return True
  144. elif byteorder == ">":
  145. return False
  146. raise ValueError(f"Unexpected byte order {byteorder}")