test_npystrings.py 5.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186
  1. import numpy as np
  2. import pytest
  3. import h5py
  4. from .common import make_name
  5. NUMPY_GE2 = int(np.__version__.split(".")[0]) >= 2
  6. pytestmark = pytest.mark.skipif(not NUMPY_GE2, reason="requires numpy >=2.0")
  7. def test_create_with_dtype_T(writable_file):
  8. name = make_name()
  9. ds = writable_file.create_dataset(name, shape=(2, 2), dtype="T")
  10. data = [["foo", "bar"], ["hello world", ""]]
  11. ds[:] = data
  12. a = ds.asstr()[:]
  13. np.testing.assert_array_equal(a, data)
  14. ds = writable_file[name]
  15. assert ds.dtype == object
  16. np.testing.assert_array_equal(ds.asstr()[:], data)
  17. ds[0, 0] = "baz"
  18. data[0][0] = "baz"
  19. a = ds.astype("T")[:]
  20. assert a.dtype.kind == "T"
  21. np.testing.assert_array_equal(a, data)
  22. ds[0, 0] = np.asarray("123", dtype="O")
  23. data[0][0] = "123"
  24. np.testing.assert_array_equal(ds.asstr()[:], data)
  25. def test_fromdata(writable_file):
  26. nx = make_name("x")
  27. ny = make_name("y")
  28. nz = make_name("z")
  29. data = [["foo", "bar"]]
  30. np_data = np.asarray(data, dtype="T")
  31. x = writable_file.create_dataset(nx, data=data, dtype="T")
  32. y = writable_file.create_dataset(ny, data=data, dtype=np.dtypes.StringDType())
  33. z = writable_file.create_dataset(nz, data=np_data)
  34. for ds in (x, y, z):
  35. assert ds.dtype.kind == "O"
  36. np.testing.assert_array_equal(ds.astype("T")[:], np_data)
  37. for n in (nx, ny, nz):
  38. ds = writable_file[n]
  39. assert ds.dtype == object
  40. np.testing.assert_array_equal(ds.asstr()[:], data)
  41. ds = ds.astype("T")
  42. assert ds.dtype.kind == "T"
  43. a = ds[:]
  44. assert a.dtype.kind == "T"
  45. np.testing.assert_array_equal(a, data)
  46. def test_fixed_to_variable_width(writable_file):
  47. data = ["foo", "longer than 8 bytes"]
  48. x = writable_file.create_dataset(
  49. make_name(), data=data, dtype=h5py.string_dtype(length=20)
  50. )
  51. assert x.dtype == "S20"
  52. # read T <- S
  53. y = x.astype("T")
  54. assert y.dtype.kind == "T"
  55. assert y[:].dtype.kind == "T"
  56. np.testing.assert_array_equal(y[:], data)
  57. # write T -> S
  58. x[0] = np.asarray("1234", dtype="T")
  59. data[0] = "1234"
  60. np.testing.assert_array_equal(y[:], data)
  61. def test_fixed_to_variable_width_too_short(writable_file):
  62. # Note: this test triggers calls to H5Tconvert which are otherwise skipped.
  63. data = ["foo", "bar"]
  64. x = writable_file.create_dataset(
  65. make_name(), data=data, dtype=h5py.string_dtype(length=3)
  66. )
  67. assert x.dtype == "S3"
  68. # write T -> S
  69. x[0] = np.asarray("1234", dtype="T")
  70. np.testing.assert_array_equal(x[:], [b"123", b"bar"])
  71. def test_variable_to_fixed_width(writable_file):
  72. data = ["foo", "longer than 8 bytes"]
  73. bdata = [b"foo", b"longer than 8 bytes"]
  74. x = writable_file.create_dataset(make_name(), data=data, dtype="T")
  75. # read S <- T
  76. y = x.astype("S20")
  77. assert y.dtype == "S20"
  78. assert y[:].dtype == "S20"
  79. np.testing.assert_array_equal(y[:], bdata)
  80. y = x.astype("S3")
  81. assert y.dtype == "S3"
  82. assert y[:].dtype == "S3"
  83. np.testing.assert_array_equal(y[:], [b"foo", b"lon"])
  84. # write S -> T
  85. x[0] = np.asarray(b"1234", dtype="S5")
  86. bdata[0] = b"1234"
  87. np.testing.assert_array_equal(x[:], bdata)
  88. def test_write_object_into_npystrings(writable_file):
  89. x = writable_file.create_dataset(make_name(), data=["foo"], dtype="T")
  90. x[0] = np.asarray("1234", dtype="O")
  91. np.testing.assert_array_equal(x[:], b"1234")
  92. def test_write_npystrings_into_object(writable_file):
  93. x = writable_file.create_dataset(
  94. make_name("x"), data=["foo"], dtype=h5py.string_dtype()
  95. )
  96. assert x.dtype == object
  97. x[0] = np.asarray("1234", dtype="T")
  98. np.testing.assert_array_equal(x[:], b"1234")
  99. # Test with HDF5 variable-length strings with ASCII character set
  100. xa = writable_file.create_dataset(
  101. make_name("xa"), shape=(1,), dtype=h5py.string_dtype('ascii')
  102. )
  103. xa[0] = np.asarray("2345", dtype="T")
  104. np.testing.assert_array_equal(xa[:], b"2345")
  105. def test_fillvalue(writable_file):
  106. # Create as NpyString dtype
  107. x = writable_file.create_dataset(
  108. make_name("x"), shape=(2,), dtype="T", fillvalue="foo"
  109. )
  110. assert isinstance(x.fillvalue, bytes)
  111. assert x.fillvalue == b"foo"
  112. assert x[0] == b"foo"
  113. # Create as object dtype
  114. y = writable_file.create_dataset(
  115. make_name("y"), shape=(2,), dtype=h5py.string_dtype(), fillvalue=b"foo"
  116. )
  117. assert isinstance(y.fillvalue, bytes)
  118. assert y.fillvalue == b"foo"
  119. assert y[0] == b"foo"
  120. # Convert object dtype to NpyString
  121. y = y.astype("T")
  122. assert y[0] == "foo"
  123. def test_empty_string(writable_file):
  124. data = np.array(["", "a", "b"], dtype="T")
  125. x = writable_file.create_dataset(make_name(), data=data)
  126. np.testing.assert_array_equal(x[:], [b"", b"a", b"b"])
  127. np.testing.assert_array_equal(x.astype("T")[:], data)
  128. data[:2] = ["c", ""]
  129. x[:2] = data[:2]
  130. np.testing.assert_array_equal(x[:], [b"c", b"", b"b"])
  131. np.testing.assert_array_equal(x.astype("T")[:], data)
  132. def test_astype_nonstring(writable_file):
  133. x = writable_file.create_dataset(make_name(), shape=(2, ), dtype="i8")
  134. with pytest.raises(TypeError, match="HDF5 string datatype"):
  135. x.astype("T")
  136. def test_resized_read(writable_file):
  137. """Read default values created by resize(). This triggers a special case
  138. where libhdf5 returns a char** containing NULL pointers.
  139. """
  140. l = ["string1", "string2", "string3"]
  141. data = np.array(l, dtype='T')
  142. d = writable_file.create_dataset(make_name(), data=data, maxshape=(None,))
  143. d.resize((10,))
  144. np.testing.assert_array_equal(d[:], np.array(
  145. [s.encode() for s in l] + [b''] * 7, dtype=object
  146. ))
  147. np.testing.assert_array_equal(d.astype('T')[:], np.array(l + [''] * 7, dtype='T'))