| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520 |
- """
- Test the hashing module.
- """
- # Author: Gael Varoquaux <gael dot varoquaux at normalesup dot org>
- # Copyright (c) 2009 Gael Varoquaux
- # License: BSD Style, 3 clauses.
- import collections
- import gc
- import hashlib
- import io
- import itertools
- import pickle
- import random
- import sys
- import time
- from concurrent.futures import ProcessPoolExecutor
- from decimal import Decimal
- from joblib.func_inspect import filter_args
- from joblib.hashing import hash
- from joblib.memory import Memory
- from joblib.test.common import np, with_numpy
- from joblib.testing import fixture, parametrize, raises, skipif
def unicode(s):
    """Identity helper: hand ``s`` back untouched."""
    return s
- ###############################################################################
- # Helper functions for the tests
def time_func(func, *args):
    """Return the best-of-three duration, in seconds, of ``func(*args)``.

    The call is repeated three times and the smallest elapsed time is
    returned. Durations are measured with ``time.perf_counter``, a monotonic
    high-resolution clock, so a system clock adjustment during the run cannot
    produce a negative or wildly wrong interval.
    """
    durations = []
    for _ in range(3):
        start = time.perf_counter()
        func(*args)
        durations.append(time.perf_counter() - start)
    return min(durations)
def relative_time(func1, func2, *args):
    """Return the relative time difference between func1 and func2 on *args.

    Each callable is timed with ``time_func`` and the result is
    ``0.5 * abs(t1 - t2) / (t1 + t2)``.

    When both timings are exactly zero the two callables cannot be told
    apart, so 0.0 is returned rather than dividing by zero.
    """
    time_func1 = time_func(func1, *args)
    time_func2 = time_func(func2, *args)
    total = time_func1 + time_func2
    if total == 0:
        return 0.0
    return 0.5 * (abs(time_func1 - time_func2) / total)
class Klass:
    """Minimal class exposing a single method ``f``."""

    def f(self, x):
        """Return ``x`` unchanged."""
        return x
class KlassWithCachedMethod:
    """Class whose method ``f`` is wrapped with ``Memory.cache`` per instance."""

    def __init__(self, cachedir):
        """Build a ``Memory`` located at ``cachedir`` and cache ``f`` with it."""
        memory = Memory(location=cachedir)
        # The wrapped bound method is stored on the instance, where it
        # shadows the plain method defined on the class.
        self.f = memory.cache(self.f)

    def f(self, x):
        """Return ``x`` unchanged."""
        return x
- ###############################################################################
- # Tests
# Objects of assorted builtin types used as hashing inputs by the tests.
input_list = [
    1,
    2,
    1.0,
    2.0,
    1 + 1j,
    2.0 + 1j,
    "a",
    "b",
    (1,),
    (1, 1),
    [1],
    [1, 1],
    {1: 1},
    {1: 2},
    {2: 1},
    None,
    gc.collect,
    [1].append,
    # The next two sets hold elements that cannot be ordered in Python 3.
    {"a", 1},
    {"a", 1, ("a", 1)},
    # The next two dicts have keys of types that cannot be ordered in
    # Python 3.
    {"a": 1, 1: 2},
    {"a": 1, 1: 2, "d": {"a": 1}},
]
@parametrize("obj1", input_list)
@parametrize("obj2", input_list)
def test_trivial_hash(obj1, obj2):
    """Smoke test hash on every pair of objects taken from ``input_list``."""
    # Two entries may share a hash only when they are the very same object.
    same_object = obj1 is obj2
    same_hash = hash(obj1) == hash(obj2)
    assert same_hash == same_object
def test_hash_methods():
    """Check that hashing instance methods works."""
    stream = io.StringIO(unicode("a"))
    # The same bound method, looked up twice on one object, hashes the same.
    assert hash(stream.flush) == hash(stream.flush)

    # Methods bound to deques with different contents hash differently.
    ten_items = collections.deque(range(10))
    nine_items = collections.deque(range(9))
    assert hash(ten_items.extend) != hash(nine_items.extend)
@fixture(scope="function")
@with_numpy
def three_np_arrays():
    """Provide three (10, 10) arrays: two with equal data and one altered."""
    rng = np.random.RandomState(0)
    base = rng.random_sample((10, 10))
    duplicate = base.copy()
    # The last array differs from the other two in its first row only.
    altered = duplicate.copy()
    altered[0] += 1
    return base, duplicate, altered
def test_hash_numpy_arrays(three_np_arrays):
    """Check that two arrays share a hash exactly when their data is equal."""
    arr1, _, _ = three_np_arrays

    for left, right in itertools.product(three_np_arrays, repeat=2):
        data_matches = np.all(left == right)
        hash_matches = hash(left) == hash(right)
        assert hash_matches == data_matches

    # An array and its transpose must not hash identically.
    assert hash(arr1) != hash(arr1.T)
def test_hash_numpy_dict_of_arrays(three_np_arrays):
    """Check hashing of dictionaries whose values are numpy arrays."""
    arr1, arr2, arr3 = three_np_arrays

    reference = {1: arr1, 2: arr2}
    swapped = {1: arr2, 2: arr1}
    with_third = {1: arr2, 2: arr3}

    # Swapping arr1 and arr2 must keep the hash; bringing in arr3 must
    # change it.
    assert hash(reference) == hash(swapped)
    assert hash(reference) != hash(with_third)
@with_numpy
@parametrize("dtype", ["datetime64[s]", "timedelta64[D]"])
def test_numpy_datetime_array(dtype):
    """Check that arrays with datetime-like dtypes can be hashed.

    memoryview is not supported for some dtypes e.g. datetime64; see
    https://github.com/joblib/joblib/issues/188 for more details.
    """
    typed_array = np.arange(0, 10, dtype=dtype)
    default_hash = hash(np.arange(10))
    assert hash(typed_array) != default_hash
@with_numpy
def test_hash_numpy_noncontiguous():
    """Check that a sliced array hashes differently from its contiguous copies."""
    fortran_ordered = np.asarray(np.arange(6000).reshape((1000, 2, 3)), order="F")
    sliced = fortran_ordered[:, :1, :]

    # Compare the sliced view against its C-ordered conversion ...
    c_ordered = np.ascontiguousarray(sliced)
    assert hash(sliced) != hash(c_ordered)

    # ... and against its Fortran-ordered conversion.
    f_ordered = np.asfortranarray(sliced)
    assert hash(sliced) != hash(f_ordered)
@with_numpy
@parametrize("coerce_mmap", [True, False])
def test_hash_memmap(tmpdir, coerce_mmap):
    """Check that memmap and arrays hash identically if coerce_mmap is True."""
    filename = tmpdir.join("memmap_temp").strpath
    try:
        m = np.memmap(filename, shape=(10, 10), mode="w+")
        a = np.asarray(m)
        array_hash = hash(a, coerce_mmap=coerce_mmap)
        memmap_hash = hash(m, coerce_mmap=coerce_mmap)
        # The two hashes must agree exactly when coercion is requested.
        assert (array_hash == memmap_hash) == coerce_mmap
    finally:
        # ``m`` is only bound if the memmap was actually created.
        if "m" in locals():
            del m
            # Force a garbage-collection cycle, to be certain that the
            # object is deleted, and that we don't run into a problem under
            # Windows with a file handle still open.
            gc.collect()
@with_numpy
@skipif(
    sys.platform == "win32",
    reason="This test is not stable under windows for some reason",
)
def test_hash_numpy_performance():
    """Check the performance of hashing numpy arrays:

    In [22]: a = np.random.random(1000000)

    In [23]: %timeit hashlib.md5(a).hexdigest()
    100 loops, best of 3: 20.7 ms per loop

    In [24]: %timeit hashlib.md5(pickle.dumps(a, protocol=2)).hexdigest()
    1 loops, best of 3: 73.1 ms per loop

    In [25]: %timeit hashlib.md5(cPickle.dumps(a, protocol=2)).hexdigest()
    10 loops, best of 3: 53.9 ms per loop

    In [26]: %timeit hash(a)
    100 loops, best of 3: 20.8 ms per loop
    """
    tolerance = 0.3
    rng = np.random.RandomState(0)
    data = rng.random_sample(1000000)

    def md5_hash(x):
        # Reference timing: md5 straight over the array's buffer.
        return hashlib.md5(memoryview(x)).hexdigest()

    # Hashing one array should cost about the same as a raw md5 of it.
    assert relative_time(md5_hash, hash, data) < tolerance

    # Check that hashing a tuple of 3 arrays takes approximately
    # 3 times as much as hashing one array
    time_hashlib = 3 * time_func(md5_hash, data)
    time_hash = time_func(hash, (data, data, data))
    gap = abs(time_hash - time_hashlib)
    total = time_hash + time_hashlib
    relative_diff = 0.5 * (gap / total)
    assert relative_diff < tolerance
def test_bound_methods_hash():
    """Make sure that calling the same method on two different instances
    of the same class does resolve to the same hashes.
    """
    first, second = Klass(), Klass()
    first_args = filter_args(first.f, [], (1,))
    second_args = filter_args(second.f, [], (1,))
    assert hash(first_args) == hash(second_args)
def test_bound_cached_methods_hash(tmpdir):
    """Make sure that calling the same _cached_ method on two different
    instances of the same class does resolve to the same hashes.
    """
    first = KlassWithCachedMethod(tmpdir.strpath)
    second = KlassWithCachedMethod(tmpdir.strpath)
    # ``.func`` is the callable that the cache wrapper was built around.
    first_args = filter_args(first.f.func, [], (1,))
    second_args = filter_args(second.f.func, [], (1,))
    assert hash(first_args) == hash(second_args)
@with_numpy
def test_hash_object_dtype():
    """Make sure that ndarrays with dtype `object' hash correctly."""

    def make_object_array():
        # Object array whose elements are aranges of lengths 0 to 5.
        return np.array([np.arange(i) for i in range(6)], dtype=object)

    first, second = make_object_array(), make_object_array()
    assert hash(first) == hash(second)
@with_numpy
def test_numpy_scalar():
    """Check that two different numpy scalars do not collide.

    Numpy scalars are built from compiled functions, and lead to
    strange pickling paths explored, that can give hash collisions.
    """
    two, three = np.float64(2.0), np.float64(3.0)
    assert hash(two) != hash(three)
def test_dict_hash(tmpdir):
    """Check that dictionaries hash consistently, even though the ordering
    of the keys is not guaranteed.
    """
    names = (
        "#s12069__c_maps.nii.gz",
        "#s12158__c_maps.nii.gz",
        "#s12258__c_maps.nii.gz",
        "#s12277__c_maps.nii.gz",
        "#s12300__c_maps.nii.gz",
        "#s12401__c_maps.nii.gz",
        "#s12430__c_maps.nii.gz",
        "#s13817__c_maps.nii.gz",
        "#s13903__c_maps.nii.gz",
        "#s13916__c_maps.nii.gz",
        "#s13981__c_maps.nii.gz",
        "#s13982__c_maps.nii.gz",
        "#s13983__c_maps.nii.gz",
    )
    cached = KlassWithCachedMethod(tmpdir.strpath)
    # Every key gets its own fresh ``[33]`` list.
    mapping = {name: [33] for name in names}

    # Pass the dict through the cached method twice and compare the results.
    first_pass = cached.f(mapping)
    second_pass = cached.f(first_pass)
    assert hash(first_pass) == hash(second_pass)
def test_set_hash(tmpdir):
    """Check that sets hash consistently, even though their ordering
    is not guaranteed.
    """
    names = (
        "#s12069__c_maps.nii.gz",
        "#s12158__c_maps.nii.gz",
        "#s12258__c_maps.nii.gz",
        "#s12277__c_maps.nii.gz",
        "#s12300__c_maps.nii.gz",
        "#s12401__c_maps.nii.gz",
        "#s12430__c_maps.nii.gz",
        "#s13817__c_maps.nii.gz",
        "#s13903__c_maps.nii.gz",
        "#s13916__c_maps.nii.gz",
        "#s13981__c_maps.nii.gz",
        "#s13982__c_maps.nii.gz",
        "#s13983__c_maps.nii.gz",
    )
    cached = KlassWithCachedMethod(tmpdir.strpath)
    members = set(names)

    # Pass the set through the cached method twice and compare the results.
    first_pass = cached.f(members)
    second_pass = cached.f(first_pass)
    assert hash(first_pass) == hash(second_pass)
def test_set_decimal_hash():
    """Check that sets containing decimals hash consistently, even though
    ordering is not guaranteed.
    """
    zero_first = {Decimal(0), Decimal("NaN")}
    nan_first = {Decimal("NaN"), Decimal(0)}
    assert hash(zero_first) == hash(nan_first)
def test_string():
    """Test that we obtain the same hash for object owning several strings,
    whatever the past of these strings (which are immutable in Python).
    """
    key = "foo"
    original = {key: "bar"}
    twin = {key: "bar"}
    # Same content as ``twin``, but rebuilt through a pickle round trip.
    roundtripped = pickle.loads(pickle.dumps(twin))
    assert hash([original, twin]) == hash([original, roundtripped])
@with_numpy
def test_numpy_dtype_pickling():
    """Make sure that the pickling of numpy dtypes is robust to object
    identity and object copy.

    numpy dtype hashing is tricky to get right: see #231, #239, #251 #1080,
    #1082, and explanatory comments inside
    ``joblib.hashing.NumpyHasher.save``.
    """

    def roundtrip(obj):
        # Copy ``obj`` by pickling and unpickling it.
        return pickle.loads(pickle.dumps(obj))

    def make_structured_dtype():
        return np.dtype([("name", np.str_, 16), ("grades", np.float64, (2,))])

    simple = np.dtype("f4")
    simple_again = np.dtype("f4")

    # simple dtypes objects are interned
    assert simple is simple_again
    assert hash(simple) == hash(simple_again)

    simple_copy = roundtrip(simple)
    assert simple is not simple_copy
    assert hash(simple) == hash(simple_copy)

    assert hash([simple, simple]) == hash([simple_copy, simple_copy])
    assert hash([simple, simple]) == hash([simple, simple_copy])

    structured = make_structured_dtype()
    structured_again = make_structured_dtype()

    # complex dtypes objects are not interned
    assert hash(structured) == hash(structured_again)

    structured_copy = roundtrip(structured)
    assert structured_copy is not structured
    assert hash(structured) == hash(structured_copy)

    assert hash([structured, structured]) == hash([structured_copy, structured_copy])
    assert hash([structured, structured]) == hash([structured_copy, structured])
@parametrize(
    "to_hash,expected",
    [
        ("This is a string to hash", "71b3f47df22cb19431d85d92d0b230b2"),
        ("C'est l\xe9t\xe9", "2d8d189e9b2b0b2e384d93c868c0e576"),
        ((123456, 54321, -98765), "e205227dd82250871fa25aa0ec690aa3"),
        (
            # A new ``Random(42)`` is built for every element, so the five
            # floats are all the same value.
            [random.Random(42).random() for _ in range(5)],
            "a11ffad81f9682a7d901e6edc3d16c84",
        ),
        ({"abcde": 123, "sadfas": [-9999, 2, 3]}, "aeda150553d4bb5c69f0e69d51b0e2ef"),
    ],
)
def test_hashes_stay_the_same(to_hash, expected):
    """Check that hashes don't change with joblib version.

    For end users, a change would mean that they have to regenerate their
    cache from scratch, which potentially means lengthy recomputations.

    Expected results have been generated with joblib 0.9.2.
    """
    computed = hash(to_hash)
    assert computed == expected
@with_numpy
def test_hashes_are_different_between_c_and_fortran_contiguous_arrays():
    """Check that the c-contiguous and f-contiguous versions of the same
    array produce 2 different hashes.
    """
    generator = np.random.RandomState(0)
    c_ordered = generator.random_sample((10, 10))
    f_ordered = np.asfortranarray(c_ordered)
    assert hash(c_ordered) != hash(f_ordered)
@with_numpy
def test_0d_array():
    """Hash a zero-dimensional array; the call only has to complete."""
    zero_dim = np.array(0)
    hash(zero_dim)
@with_numpy
def test_0d_and_1d_array_hashing_is_different():
    """Check that ``np.array(0)`` and ``np.array([0])`` hash differently."""
    zero_dim = np.array(0)
    one_dim = np.array([0])
    assert hash(zero_dim) != hash(one_dim)
@with_numpy
def test_hashes_stay_the_same_with_numpy_objects():
    """Check that hashes of numpy objects are consistent within one environment.

    Note: joblib used to test numpy objects hashing by comparing the produced
    hash of an object with some hard-coded target value to guarantee that
    hashing remains the same across joblib versions. However, since numpy
    1.20 and joblib 1.0, joblib relies on potentially unstable implementation
    details of numpy to hash np.dtype objects, which makes the stability of
    hash values across different environments hard to guarantee and to test.
    As a result, hashing stability across joblib versions becomes best-effort
    only, and we only test the consistency within a single environment by
    making sure:

    - the hash of two copies of the same objects is the same
    - hashing some object in two different python processes produces the same
      value. This should be viewed as a proxy for testing hash consistency
      through time between Python sessions (provided no change in the
      environment was done between sessions).
    """

    def create_objects_to_hash():
        rng = np.random.RandomState(42)
        # Being explicit about dtypes in order to avoid
        # architecture-related differences. Also using 'f4' rather than
        # 'f8' for float arrays because 'f8' arrays generated by
        # rng.randn don't seem to be bit-identical on 32bit and
        # 64bit machines.
        to_hash_list = [
            rng.randint(-1000, high=1000, size=50).astype("<i8"),
            tuple(rng.randn(3).astype("<f4") for _ in range(5)),
            [rng.randn(3).astype("<f4") for _ in range(5)],
            {
                -3333: rng.randn(3, 5).astype("<f4"),
                0: [
                    rng.randint(10, size=20).astype("<i8"),
                    rng.randn(10).astype("<f4"),
                ],
            },
            # Non regression cases for
            # https://github.com/joblib/joblib/issues/308
            np.arange(100, dtype="<i8").reshape((10, 10)),
            # Fortran contiguous array
            np.asfortranarray(np.arange(100, dtype="<i8").reshape((10, 10))),
            # Non contiguous array
            np.arange(100, dtype="<i8").reshape((10, 10))[:, :2],
        ]
        return to_hash_list

    # Create two lists containing copies of the same objects. joblib.hash
    # should return the same hash for to_hash_list_one[i] and
    # to_hash_list_two[i]
    to_hash_list_one = create_objects_to_hash()
    to_hash_list_two = create_objects_to_hash()

    # Both executors are managed by the ``with`` statement so that each one
    # is shut down on exit, even when creating the second one or an
    # assertion fails.
    with ProcessPoolExecutor(max_workers=1) as e1, ProcessPoolExecutor(
        max_workers=1
    ) as e2:
        for obj_1, obj_2 in zip(to_hash_list_one, to_hash_list_two):
            # testing consistency of hashes across python processes
            hash_1 = e1.submit(hash, obj_1).result()
            hash_2 = e2.submit(hash, obj_1).result()
            assert hash_1 == hash_2

            # testing consistency when hashing two copies of the same objects.
            hash_3 = e1.submit(hash, obj_2).result()
            assert hash_1 == hash_3
def test_hashing_pickling_error():
    """Check that hashing a locally defined function raises PicklingError."""

    def non_picklable():
        return 42

    # The message of the raised error must match the given pattern.
    with raises(pickle.PicklingError, match="PicklingError while hashing"):
        hash(non_picklable)
def test_wrong_hash_name():
    """Check that an unknown ``hash_name`` raises a ValueError."""
    payload = {"foo": "bar"}
    expected_message = "Valid options for 'hash_name' are"
    with raises(ValueError, match=expected_message):
        hash(payload, hash_name="invalid")
|