hashing.py 10 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270
  1. """
  2. Fast cryptographic hash of Python objects, with a special case for fast
  3. hashing of numpy arrays.
  4. """
  5. # Author: Gael Varoquaux <gael dot varoquaux at normalesup dot org>
  6. # Copyright (c) 2009 Gael Varoquaux
  7. # License: BSD Style, 3 clauses.
  8. import decimal
  9. import hashlib
  10. import io
  11. import pickle
  12. import struct
  13. import sys
  14. import types
  15. Pickler = pickle._Pickler
  16. class _ConsistentSet(object):
  17. """Class used to ensure the hash of Sets is preserved
  18. whatever the order of its items.
  19. """
  20. def __init__(self, set_sequence):
  21. # Forces order of elements in set to ensure consistent hash.
  22. try:
  23. # Trying first to order the set assuming the type of elements is
  24. # consistent and orderable.
  25. # This fails on python 3 when elements are unorderable
  26. # but we keep it in a try as it's faster.
  27. self._sequence = sorted(set_sequence)
  28. except (TypeError, decimal.InvalidOperation):
  29. # If elements are unorderable, sorting them using their hash.
  30. # This is slower but works in any case.
  31. self._sequence = sorted((hash(e) for e in set_sequence))
  32. class _MyHash(object):
  33. """Class used to hash objects that won't normally pickle"""
  34. def __init__(self, *args):
  35. self.args = args
  36. class Hasher(Pickler):
  37. """A subclass of pickler, to do cryptographic hashing, rather than
  38. pickling. This is used to produce a unique hash of the given
  39. Python object that is not necessarily cryptographically secure.
  40. """
  41. def __init__(self, hash_name="md5"):
  42. self.stream = io.BytesIO()
  43. # By default we want a pickle protocol that only changes with
  44. # the major python version and not the minor one
  45. protocol = 3
  46. Pickler.__init__(self, self.stream, protocol=protocol)
  47. # Initialise the hash obj
  48. self._hash = hashlib.new(hash_name, usedforsecurity=False)
  49. def hash(self, obj, return_digest=True):
  50. try:
  51. self.dump(obj)
  52. except pickle.PicklingError as e:
  53. e.args += ("PicklingError while hashing %r: %r" % (obj, e),)
  54. raise
  55. dumps = self.stream.getvalue()
  56. self._hash.update(dumps)
  57. if return_digest:
  58. return self._hash.hexdigest()
  59. def save(self, obj):
  60. if isinstance(obj, (types.MethodType, type({}.pop))):
  61. # the Pickler cannot pickle instance methods; here we decompose
  62. # them into components that make them uniquely identifiable
  63. if hasattr(obj, "__func__"):
  64. func_name = obj.__func__.__name__
  65. else:
  66. func_name = obj.__name__
  67. inst = obj.__self__
  68. if type(inst) is type(pickle):
  69. obj = _MyHash(func_name, inst.__name__)
  70. elif inst is None:
  71. # type(None) or type(module) do not pickle
  72. obj = _MyHash(func_name, inst)
  73. else:
  74. cls = obj.__self__.__class__
  75. obj = _MyHash(func_name, inst, cls)
  76. Pickler.save(self, obj)
  77. def memoize(self, obj):
  78. # We want hashing to be sensitive to value instead of reference.
  79. # For example we want ['aa', 'aa'] and ['aa', 'aaZ'[:2]]
  80. # to hash to the same value and that's why we disable memoization
  81. # for strings
  82. if isinstance(obj, (bytes, str)):
  83. return
  84. Pickler.memoize(self, obj)
  85. # The dispatch table of the pickler is not accessible in Python
  86. # 3, as these lines are only bugware for IPython, we skip them.
  87. def save_global(self, obj, name=None, pack=struct.pack):
  88. # We have to override this method in order to deal with objects
  89. # defined interactively in IPython that are not injected in
  90. # __main__
  91. kwargs = dict(name=name, pack=pack)
  92. del kwargs["pack"]
  93. try:
  94. Pickler.save_global(self, obj, **kwargs)
  95. except pickle.PicklingError:
  96. Pickler.save_global(self, obj, **kwargs)
  97. module = getattr(obj, "__module__", None)
  98. if module == "__main__":
  99. my_name = name
  100. if my_name is None:
  101. my_name = obj.__name__
  102. mod = sys.modules[module]
  103. if not hasattr(mod, my_name):
  104. # IPython doesn't inject the variables define
  105. # interactively in __main__
  106. setattr(mod, my_name, obj)
  107. dispatch = Pickler.dispatch.copy()
  108. # builtin
  109. dispatch[type(len)] = save_global
  110. # type
  111. dispatch[type(object)] = save_global
  112. # classobj
  113. dispatch[type(Pickler)] = save_global
  114. # function
  115. dispatch[type(pickle.dump)] = save_global
  116. # We use *args in _batch_setitems signature because _batch_setitems has an
  117. # additional 'obj' argument in Python 3.14
  118. def _batch_setitems(self, items, *args):
  119. # forces order of keys in dict to ensure consistent hash.
  120. try:
  121. # Trying first to compare dict assuming the type of keys is
  122. # consistent and orderable.
  123. # This fails on python 3 when keys are unorderable
  124. # but we keep it in a try as it's faster.
  125. Pickler._batch_setitems(self, iter(sorted(items)), *args)
  126. except TypeError:
  127. # If keys are unorderable, sorting them using their hash. This is
  128. # slower but works in any case.
  129. Pickler._batch_setitems(
  130. self, iter(sorted((hash(k), v) for k, v in items)), *args
  131. )
  132. def save_set(self, set_items):
  133. # forces order of items in Set to ensure consistent hash
  134. Pickler.save(self, _ConsistentSet(set_items))
  135. dispatch[type(set())] = save_set
  136. class NumpyHasher(Hasher):
  137. """Special case the hasher for when numpy is loaded."""
  138. def __init__(self, hash_name="md5", coerce_mmap=False):
  139. """
  140. Parameters
  141. ----------
  142. hash_name: string
  143. The hash algorithm to be used
  144. coerce_mmap: boolean
  145. Make no difference between np.memmap and np.ndarray
  146. objects.
  147. """
  148. self.coerce_mmap = coerce_mmap
  149. Hasher.__init__(self, hash_name=hash_name)
  150. # delayed import of numpy, to avoid tight coupling
  151. import numpy as np
  152. self.np = np
  153. if hasattr(np, "getbuffer"):
  154. self._getbuffer = np.getbuffer
  155. else:
  156. self._getbuffer = memoryview
  157. def save(self, obj):
  158. """Subclass the save method, to hash ndarray subclass, rather
  159. than pickling them. Off course, this is a total abuse of
  160. the Pickler class.
  161. """
  162. if isinstance(obj, self.np.ndarray) and not obj.dtype.hasobject:
  163. # Compute a hash of the object
  164. # The update function of the hash requires a c_contiguous buffer.
  165. if obj.shape == ():
  166. # 0d arrays need to be flattened because viewing them as bytes
  167. # raises a ValueError exception.
  168. obj_c_contiguous = obj.flatten()
  169. elif obj.flags.c_contiguous:
  170. obj_c_contiguous = obj
  171. elif obj.flags.f_contiguous:
  172. obj_c_contiguous = obj.T
  173. else:
  174. # Cater for non-single-segment arrays: this creates a
  175. # copy, and thus alleviates this issue.
  176. # XXX: There might be a more efficient way of doing this
  177. obj_c_contiguous = obj.flatten()
  178. # memoryview is not supported for some dtypes, e.g. datetime64, see
  179. # https://github.com/numpy/numpy/issues/4983. The
  180. # workaround is to view the array as bytes before
  181. # taking the memoryview.
  182. self._hash.update(self._getbuffer(obj_c_contiguous.view(self.np.uint8)))
  183. # We store the class, to be able to distinguish between
  184. # Objects with the same binary content, but different
  185. # classes.
  186. if self.coerce_mmap and isinstance(obj, self.np.memmap):
  187. # We don't make the difference between memmap and
  188. # normal ndarrays, to be able to reload previously
  189. # computed results with memmap.
  190. klass = self.np.ndarray
  191. else:
  192. klass = obj.__class__
  193. # We also return the dtype and the shape, to distinguish
  194. # different views on the same data with different dtypes.
  195. # The object will be pickled by the pickler hashed at the end.
  196. obj = (klass, ("HASHED", obj.dtype, obj.shape, obj.strides))
  197. elif isinstance(obj, self.np.dtype):
  198. # numpy.dtype consistent hashing is tricky to get right. This comes
  199. # from the fact that atomic np.dtype objects are interned:
  200. # ``np.dtype('f4') is np.dtype('f4')``. The situation is
  201. # complicated by the fact that this interning does not resist a
  202. # simple pickle.load/dump roundtrip:
  203. # ``pickle.loads(pickle.dumps(np.dtype('f4'))) is not
  204. # np.dtype('f4') Because pickle relies on memoization during
  205. # pickling, it is easy to
  206. # produce different hashes for seemingly identical objects, such as
  207. # ``[np.dtype('f4'), np.dtype('f4')]``
  208. # and ``[np.dtype('f4'), pickle.loads(pickle.dumps('f4'))]``.
  209. # To prevent memoization from interfering with hashing, we isolate
  210. # the serialization (and thus the pickle memoization) of each dtype
  211. # using each time a different ``pickle.dumps`` call unrelated to
  212. # the current Hasher instance.
  213. self._hash.update("_HASHED_DTYPE".encode("utf-8"))
  214. self._hash.update(pickle.dumps(obj))
  215. return
  216. Hasher.save(self, obj)
  217. def hash(obj, hash_name="md5", coerce_mmap=False):
  218. """Quick calculation of a hash to identify uniquely Python objects
  219. containing numpy arrays.
  220. Parameters
  221. ----------
  222. hash_name: 'md5' or 'sha1'
  223. Hashing algorithm used. sha1 is supposedly safer, but md5 is
  224. faster.
  225. coerce_mmap: boolean
  226. Make no difference between np.memmap and np.ndarray
  227. """
  228. valid_hash_names = ("md5", "sha1")
  229. if hash_name not in valid_hash_names:
  230. raise ValueError(
  231. "Valid options for 'hash_name' are {}. Got hash_name={!r} instead.".format(
  232. valid_hash_names, hash_name
  233. )
  234. )
  235. if "numpy" in sys.modules:
  236. hasher = NumpyHasher(hash_name=hash_name, coerce_mmap=coerce_mmap)
  237. else:
  238. hasher = Hasher(hash_name=hash_name)
  239. return hasher.hash(obj)