dataset.py 44 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252
  1. # This file is part of h5py, a Python interface to the HDF5 library.
  2. #
  3. # http://www.h5py.org
  4. #
  5. # Copyright 2008-2020 Andrew Collette and contributors
  6. #
  7. # License: Standard 3-clause BSD; see "license.txt" for full license terms
  8. # and contributor agreement.
  9. """
  10. Implements support for high-level dataset access.
  11. """
  12. import posixpath as pp
  13. import sys
  14. from abc import ABC, abstractmethod
  15. from warnings import warn
  16. import numpy
  17. import h5py.h5t
  18. from .. import h5, h5s, h5t, h5r, h5d, h5p, h5fd, h5ds, _selector
  19. from ..h5py_warnings import H5pyDeprecationWarning
  20. from .base import (
  21. array_for_new_object, cached_property, Empty, find_item_type, HLObject,
  22. phil, product, with_phil,
  23. )
  24. from . import filters
  25. from . import selections as sel
  26. from . import selections2 as sel2
  27. from .datatype import Datatype
  28. from .compat import filename_decode
  29. from .vds import VDSmap, vds_support
  30. _LEGACY_GZIP_COMPRESSION_VALS = frozenset(range(10))
  31. MPI = h5.get_config().mpi
  32. def make_new_dset(parent, shape=None, dtype=None, data=None, name=None,
  33. chunks=None, compression=None, shuffle=None,
  34. fletcher32=None, maxshape=None, compression_opts=None,
  35. fillvalue=None, scaleoffset=None, track_times=False,
  36. external=None, track_order=None, dcpl=None, dapl=None,
  37. efile_prefix=None, virtual_prefix=None, allow_unknown_filter=False,
  38. rdcc_nslots=None, rdcc_nbytes=None, rdcc_w0=None, *,
  39. fill_time=None):
  40. """ Return a new low-level dataset identifier """
  41. # Convert data to a C-contiguous ndarray
  42. if data is not None and not isinstance(data, Empty):
  43. data = array_for_new_object(data, specified_dtype=dtype)
  44. # Validate shape
  45. if shape is None:
  46. if data is None:
  47. if dtype is None:
  48. raise TypeError("One of data, shape or dtype must be specified")
  49. data = Empty(dtype)
  50. shape = data.shape
  51. else:
  52. shape = (shape,) if isinstance(shape, int) else tuple(shape)
  53. if data is not None and (product(shape) != product(data.shape)):
  54. raise ValueError("Shape tuple is incompatible with data")
  55. if isinstance(maxshape, int):
  56. maxshape = (maxshape,)
  57. tmp_shape = maxshape if maxshape is not None else shape
  58. # Validate chunk shape
  59. if isinstance(chunks, int) and not isinstance(chunks, bool):
  60. chunks = (chunks,)
  61. # Logically, the following `zip` could be strict, but it's happening
  62. # before we've done checks elsewhere that raise more descriptive errors
  63. if isinstance(chunks, tuple) and any(
  64. chunk > dim for dim, chunk in zip(tmp_shape, chunks, strict=False) if dim is not None
  65. ):
  66. errmsg = "Chunk shape must not be greater than data shape in any dimension. "\
  67. "{} is not compatible with {}".format(chunks, shape)
  68. raise ValueError(errmsg)
  69. if isinstance(dtype, Datatype):
  70. # Named types are used as-is
  71. tid = dtype.id
  72. dtype = tid.dtype # Following code needs this
  73. elif isinstance(dtype, h5py.h5t.TypeID): # Low-level HDF5 data type
  74. tid = dtype
  75. dtype = tid.dtype
  76. else:
  77. # Validate dtype
  78. if dtype is None and data is None:
  79. warn(
  80. "Creating a dataset without passing data or dtype is deprecated. "
  81. "Pass an explicit dtype. Using dtype='f4' will keep the "
  82. "current default behaviour.",
  83. category=H5pyDeprecationWarning, stacklevel=3,
  84. )
  85. dtype = numpy.dtype("=f4")
  86. elif dtype is None and data is not None:
  87. dtype = data.dtype
  88. else:
  89. dtype = numpy.dtype(dtype)
  90. tid = h5t.py_create(dtype, logical=1)
  91. # Legacy
  92. if any((compression, shuffle, fletcher32, maxshape, scaleoffset)) and chunks is False:
  93. raise ValueError("Chunked format required for given storage options")
  94. # Legacy
  95. if compression is True:
  96. if compression_opts is None:
  97. compression_opts = 4
  98. compression = 'gzip'
  99. # Legacy
  100. if compression in _LEGACY_GZIP_COMPRESSION_VALS:
  101. if compression_opts is not None:
  102. raise TypeError("Conflict in compression options")
  103. compression_opts = compression
  104. compression = 'gzip'
  105. dcpl = filters.fill_dcpl(
  106. dcpl or h5p.create(h5p.DATASET_CREATE), shape, dtype,
  107. chunks, compression, compression_opts, shuffle, fletcher32,
  108. maxshape, scaleoffset, external, allow_unknown_filter,
  109. fill_time=fill_time)
  110. # Check that compression roundtrips correctly if it was specified
  111. if compression is not None:
  112. if isinstance(compression, filters.FilterRefBase):
  113. compression = compression.filter_id
  114. if isinstance(compression, int):
  115. compression = filters.get_filter_name(compression)
  116. if compression not in filters.get_filters(dcpl):
  117. raise ValueError(f'compression {compression!r} not in filters {filters.get_filters(dcpl)!r}')
  118. if fillvalue is not None:
  119. # prepare string-type dtypes for fillvalue
  120. string_info = h5t.check_string_dtype(dtype)
  121. if string_info is not None:
  122. # fake vlen dtype for fixed len string fillvalue
  123. # to not trigger unwanted encoding
  124. dtype = h5t.string_dtype(string_info.encoding)
  125. fillvalue = numpy.array(fillvalue, dtype=dtype)
  126. else:
  127. fillvalue = numpy.array(fillvalue)
  128. dcpl.set_fill_value(fillvalue)
  129. if track_times is None:
  130. # In case someone explicitly passes None for the default
  131. track_times = False
  132. if track_times in (True, False):
  133. dcpl.set_obj_track_times(track_times)
  134. else:
  135. raise TypeError("track_times must be either True or False")
  136. if track_order is True:
  137. dcpl.set_attr_creation_order(
  138. h5p.CRT_ORDER_TRACKED | h5p.CRT_ORDER_INDEXED)
  139. elif track_order is False:
  140. dcpl.set_attr_creation_order(0)
  141. elif track_order is not None:
  142. raise TypeError("track_order must be either True or False")
  143. if maxshape is not None:
  144. maxshape = tuple(m if m is not None else h5s.UNLIMITED for m in maxshape)
  145. if any([efile_prefix, virtual_prefix, rdcc_nbytes, rdcc_nslots, rdcc_w0]):
  146. dapl = dapl or h5p.create(h5p.DATASET_ACCESS)
  147. if efile_prefix is not None:
  148. dapl.set_efile_prefix(efile_prefix)
  149. if virtual_prefix is not None:
  150. dapl.set_virtual_prefix(virtual_prefix)
  151. if rdcc_nbytes or rdcc_nslots or rdcc_w0:
  152. cache_settings = list(dapl.get_chunk_cache())
  153. if rdcc_nslots is not None:
  154. cache_settings[0] = rdcc_nslots
  155. if rdcc_nbytes is not None:
  156. cache_settings[1] = rdcc_nbytes
  157. if rdcc_w0 is not None:
  158. cache_settings[2] = rdcc_w0
  159. dapl.set_chunk_cache(*cache_settings)
  160. if isinstance(data, Empty):
  161. sid = h5s.create(h5s.NULL)
  162. else:
  163. sid = h5s.create_simple(shape, maxshape)
  164. dset_id = h5d.create(parent.id, name, tid, sid, dcpl=dcpl, dapl=dapl)
  165. if (data is not None) and (not isinstance(data, Empty)):
  166. dset_id.write(h5s.ALL, h5s.ALL, data)
  167. return dset_id
  168. def open_dset(parent, name, dapl=None, efile_prefix=None, virtual_prefix=None,
  169. rdcc_nslots=None, rdcc_nbytes=None, rdcc_w0=None, **kwds):
  170. """ Return an existing low-level dataset identifier """
  171. if any([efile_prefix, virtual_prefix, rdcc_nbytes, rdcc_nslots, rdcc_w0]):
  172. dapl = dapl or h5p.create(h5p.DATASET_ACCESS)
  173. if efile_prefix is not None:
  174. dapl.set_efile_prefix(efile_prefix)
  175. if virtual_prefix is not None:
  176. dapl.set_virtual_prefix(virtual_prefix)
  177. if rdcc_nbytes or rdcc_nslots or rdcc_w0:
  178. cache_settings = list(dapl.get_chunk_cache())
  179. if rdcc_nslots is not None:
  180. cache_settings[0] = rdcc_nslots
  181. if rdcc_nbytes is not None:
  182. cache_settings[1] = rdcc_nbytes
  183. if rdcc_w0 is not None:
  184. cache_settings[2] = rdcc_w0
  185. dapl.set_chunk_cache(*cache_settings)
  186. dset_id = h5d.open(parent.id, name, dapl=dapl)
  187. return dset_id
  188. class AbstractView(ABC):
  189. _dset: "Dataset"
  190. def __init__(self, dset):
  191. self._dset = dset
  192. def __len__(self):
  193. return len(self._dset)
  194. @property
  195. @abstractmethod
  196. def dtype(self):
  197. ... # pragma: nocover
  198. @property
  199. def ndim(self):
  200. return self._dset.ndim
  201. @property
  202. def shape(self):
  203. return self._dset.shape
  204. @property
  205. def size(self):
  206. return self._dset.size
  207. @abstractmethod
  208. def __getitem__(self, idx):
  209. ... # pragma: nocover
  210. def __array__(self, dtype=None, copy=None):
  211. if copy is False:
  212. raise ValueError(
  213. f"{self.__class__.__name__}.__array__ received {copy=} "
  214. "but memory allocation cannot be avoided on read"
  215. )
  216. # If self.ndim == 0, convert np.generic back to np.ndarray
  217. return numpy.asarray(self[()], dtype=dtype or self.dtype)
  218. class AsTypeView(AbstractView):
  219. """Wrapper to convert data on reading from a dataset.
  220. """
  221. def __init__(self, dset, dtype):
  222. super().__init__(dset)
  223. self._dtype = dtype
  224. @property
  225. def dtype(self):
  226. return self._dtype
  227. def __getitem__(self, idx):
  228. return self._dset.__getitem__(idx, new_dtype=self._dtype)
  229. def __array__(self, dtype=None, copy=None):
  230. return self._dset.__array__(dtype or self._dtype, copy)
  231. class AsStrView(AbstractView):
  232. """Wrapper to decode strings on reading the dataset"""
  233. def __init__(self, dset, encoding, errors='strict'):
  234. super().__init__(dset)
  235. self.encoding = encoding
  236. self.errors = errors
  237. @property
  238. def dtype(self):
  239. return numpy.dtype(object)
  240. def __getitem__(self, idx):
  241. bytes_arr = self._dset[idx]
  242. # numpy.char.decode() seems like the obvious thing to use. But it only
  243. # accepts numpy string arrays, not object arrays of bytes (which we
  244. # return from HDF5 variable-length strings). And the numpy
  245. # implementation is not faster than doing it with a loop; in fact, by
  246. # not converting the result to a numpy unicode array, the
  247. # naive way can be faster! (Comparing with numpy 1.18.4, June 2020)
  248. if numpy.isscalar(bytes_arr):
  249. return bytes_arr.decode(self.encoding, self.errors)
  250. return numpy.array([
  251. b.decode(self.encoding, self.errors) for b in bytes_arr.flat
  252. ], dtype=object).reshape(bytes_arr.shape)
  253. class FieldsView(AbstractView):
  254. """Wrapper to extract named fields from a dataset with a struct dtype"""
  255. def __init__(self, dset, prior_dtype, names):
  256. super().__init__(dset)
  257. if isinstance(names, str):
  258. self.extract_field = names
  259. names = [names]
  260. else:
  261. self.extract_field = None
  262. self.read_dtype = readtime_dtype(prior_dtype, names)
  263. @property
  264. def dtype(self):
  265. t = self.read_dtype
  266. if self.extract_field is not None:
  267. t = t[self.extract_field]
  268. return t
  269. def __getitem__(self, idx):
  270. data = self._dset.__getitem__(idx, new_dtype=self.read_dtype)
  271. if self.extract_field is not None:
  272. data = data[self.extract_field]
  273. return data
  274. def readtime_dtype(basetype, names):
  275. """Make a NumPy compound dtype with a subset of available fields"""
  276. if basetype.names is None: # Names provided, but not compound
  277. raise ValueError("Field names only allowed for compound types")
  278. for name in names: # Check all names are legal
  279. if name not in basetype.names:
  280. raise ValueError("Field %s does not appear in this type." % name)
  281. return numpy.dtype([(name, basetype.fields[name][0]) for name in names])
  282. if MPI:
  283. class CollectiveContext:
  284. """ Manages collective I/O in MPI mode """
  285. # We don't bother with _local as threads are forbidden in MPI mode
  286. def __init__(self, dset):
  287. self._dset = dset
  288. def __enter__(self):
  289. # pylint: disable=protected-access
  290. self._dset._dxpl.set_dxpl_mpio(h5fd.MPIO_COLLECTIVE)
  291. def __exit__(self, *args):
  292. # pylint: disable=protected-access
  293. self._dset._dxpl.set_dxpl_mpio(h5fd.MPIO_INDEPENDENT)
  294. class ChunkIterator:
  295. """
  296. Class to iterate through list of chunks of a given dataset
  297. """
  298. def __init__(self, dset, source_sel=None):
  299. self._shape = dset.shape
  300. rank = len(dset.shape)
  301. if not dset.chunks:
  302. # can only use with chunked datasets
  303. raise TypeError("Chunked dataset required")
  304. self._layout = dset.chunks
  305. if source_sel is None:
  306. # select over entire dataset
  307. self._sel = tuple(
  308. slice(0, self._shape[dim]) for dim in range(rank)
  309. )
  310. else:
  311. if isinstance(source_sel, (slice, int)):
  312. sel = [source_sel]
  313. else:
  314. sel = list(source_sel)
  315. if len(sel) != rank:
  316. raise ValueError("Invalid selection - selection region must have same rank as dataset")
  317. for dim, s in enumerate(sel):
  318. start: int | None
  319. stop: int | None
  320. step: int | None
  321. match s:
  322. case int():
  323. start = s
  324. stop = s + 1
  325. step = None
  326. case slice():
  327. start = s.start or 0
  328. stop = s.stop or self._shape[dim]
  329. step = s.step
  330. case _:
  331. # TODO: use typing.assert_never when Python 3.10 is dropped
  332. raise AssertionError(f'{s}: Selection object must be a slice or integer')
  333. sel[dim] = slice(start, stop, step)
  334. self._sel = tuple(sel)
  335. self._chunk_index = []
  336. for dim in range(rank):
  337. s = self._sel[dim]
  338. if s.start < 0 or s.stop > self._shape[dim] or s.stop <= s.start:
  339. raise ValueError("Invalid selection - selection region must be within dataset space")
  340. index = s.start // self._layout[dim]
  341. self._chunk_index.append(index)
  342. def __iter__(self):
  343. return self
  344. def __next__(self):
  345. rank = len(self._shape)
  346. slices = []
  347. if rank == 0 or self._chunk_index[0] * self._layout[0] >= self._sel[0].stop:
  348. # ran past the last chunk, end iteration
  349. raise StopIteration()
  350. for dim in range(rank):
  351. s = self._sel[dim]
  352. start = self._chunk_index[dim] * self._layout[dim]
  353. stop = (self._chunk_index[dim] + 1) * self._layout[dim]
  354. # adjust the start if this is an edge chunk
  355. if start < s.start:
  356. start = s.start
  357. if stop > s.stop:
  358. stop = s.stop # trim to end of the selection
  359. s = slice(start, stop, 1)
  360. slices.append(s)
  361. # bump up the last index and carry forward if we run outside the selection
  362. dim = rank - 1
  363. while dim >= 0:
  364. s = self._sel[dim]
  365. self._chunk_index[dim] += 1
  366. chunk_end = self._chunk_index[dim] * self._layout[dim]
  367. if chunk_end < s.stop:
  368. # we still have room to extend along this dimensions
  369. return tuple(slices)
  370. if dim > 0:
  371. # reset to the start and continue iterating with higher dimension
  372. self._chunk_index[dim] = s.start // self._layout[dim]
  373. dim -= 1
  374. return tuple(slices)
  375. class Dataset(HLObject):
  376. """
  377. Represents an HDF5 dataset
  378. """
  379. def astype(self, dtype):
  380. """ Get a wrapper allowing you to perform reads to a
  381. different destination type, e.g.:
  382. >>> double_precision = dataset.astype('f8')[0:100:2]
  383. """
  384. dtype = numpy.dtype(dtype)
  385. if dtype == self.dtype:
  386. return self
  387. if dtype.kind == "T":
  388. string_info = h5t.check_string_dtype(self.dtype)
  389. if string_info is None:
  390. raise TypeError(
  391. f"dset.astype({dtype}) can only be used on datasets with "
  392. "an HDF5 string datatype"
  393. )
  394. return AsTypeView(self, dtype)
  395. def asstr(self, encoding=None, errors='strict'):
  396. """Get a wrapper to read string data as Python strings:
  397. >>> str_array = dataset.asstr()[:]
  398. The parameters have the same meaning as in ``bytes.decode()``.
  399. If ``encoding`` is unspecified, it will use the encoding in the HDF5
  400. datatype (either ascii or utf-8).
  401. .. note::
  402. On NumPy 2.0 and later, it is recommended to use native NumPy
  403. variable-width strings instead:
  404. >>> str_array = dataset.astype('T')[:]
  405. """
  406. string_info = h5t.check_string_dtype(self.dtype)
  407. if string_info is None:
  408. raise TypeError(
  409. "dset.asstr() can only be used on datasets with "
  410. "an HDF5 string datatype"
  411. )
  412. if encoding is None:
  413. encoding = string_info.encoding
  414. return AsStrView(self, encoding, errors=errors)
  415. def fields(self, names, *, _prior_dtype=None):
  416. """Get a wrapper to read a subset of fields from a compound data type:
  417. >>> 2d_coords = dataset.fields(['x', 'y'])[:]
  418. If names is a string, a single field is extracted, and the resulting
  419. arrays will have that dtype. Otherwise, it should be an iterable,
  420. and the read data will have a compound dtype.
  421. """
  422. if _prior_dtype is None:
  423. _prior_dtype = self.dtype
  424. return FieldsView(self, _prior_dtype, names)
  425. if MPI:
  426. @property
  427. @with_phil
  428. def collective(self):
  429. """ Context manager for MPI collective reads & writes """
  430. return CollectiveContext(self)
  431. @property
  432. def dims(self):
  433. """ Access dimension scales attached to this dataset. """
  434. from .dims import DimensionManager
  435. with phil:
  436. return DimensionManager(self)
  437. @property
  438. @with_phil
  439. def ndim(self):
  440. """Numpy-style attribute giving the number of dimensions"""
  441. return self.id.rank
  442. @property
  443. def shape(self):
  444. """Numpy-style shape tuple giving dataset dimensions"""
  445. if 'shape' in self._cache_props:
  446. return self._cache_props['shape']
  447. with phil:
  448. shape = self.id.shape
  449. # If the file is read-only, cache the shape to speed-up future uses.
  450. # This cache is invalidated by .refresh() when using SWMR.
  451. if self._readonly:
  452. self._cache_props['shape'] = shape
  453. return shape
  454. @shape.setter
  455. @with_phil
  456. def shape(self, shape):
  457. # pylint: disable=missing-docstring
  458. self.resize(shape)
  459. @property
  460. def size(self):
  461. """Numpy-style attribute giving the total dataset size"""
  462. if 'size' in self._cache_props:
  463. return self._cache_props['size']
  464. if self._is_empty:
  465. size = None
  466. else:
  467. size = product(self.shape)
  468. # If the file is read-only, cache the size to speed-up future uses.
  469. # This cache is invalidated by .refresh() when using SWMR.
  470. if self._readonly:
  471. self._cache_props['size'] = size
  472. return size
  473. @property
  474. def nbytes(self):
  475. """Numpy-style attribute giving the raw dataset size as the number of bytes"""
  476. size = self.size
  477. if size is None: # if we are an empty 0-D array, then there are no bytes in the dataset
  478. return 0
  479. return self.dtype.itemsize * size
  480. @property
  481. def _selector(self):
  482. """Internal object for optimised selection of data"""
  483. if '_selector' in self._cache_props:
  484. return self._cache_props['_selector']
  485. slr = _selector.Selector(self.id.get_space())
  486. # If the file is read-only, cache the reader to speed up future uses.
  487. # This cache is invalidated by .refresh() when using SWMR.
  488. if self._readonly:
  489. self._cache_props['_selector'] = slr
  490. return slr
  491. @property
  492. def _fast_reader(self):
  493. """Internal object for optimised reading of data"""
  494. if '_fast_reader' in self._cache_props:
  495. return self._cache_props['_fast_reader']
  496. rdr = _selector.Reader(self.id)
  497. # If the file is read-only, cache the reader to speed up future uses.
  498. # This cache is invalidated by .refresh() when using SWMR.
  499. if self._readonly:
  500. self._cache_props['_fast_reader'] = rdr
  501. return rdr
  502. @property
  503. @with_phil
  504. def dtype(self):
  505. """Numpy dtype representing the datatype"""
  506. return self.id.dtype
  507. @property
  508. @with_phil
  509. def chunks(self):
  510. """Dataset chunks (or None)"""
  511. dcpl = self._dcpl
  512. if dcpl.get_layout() == h5d.CHUNKED:
  513. return dcpl.get_chunk()
  514. return None
  515. @property
  516. @with_phil
  517. def compression(self):
  518. """Compression strategy (or None)"""
  519. for x in ('gzip','lzf','szip'):
  520. if x in self._filters:
  521. return x
  522. if any(f not in filters._COMP_FILTERS for f in self._filters):
  523. return 'unknown' # Filter from a plugin
  524. return None
  525. @property
  526. @with_phil
  527. def compression_opts(self):
  528. """ Compression setting. Int(0-9) for gzip, 2-tuple for szip. """
  529. return self._filters.get(self.compression, None)
  530. @property
  531. @with_phil
  532. def filter_ids(self):
  533. """Numeric IDs of HDF5 filters used for this dataset"""
  534. pl = self._dcpl
  535. return tuple([pl.get_filter(i)[0] for i in range(pl.get_nfilters())])
  536. @property
  537. @with_phil
  538. def filter_names(self):
  539. """Names, as stored in the file, of the filters used for this dataset"""
  540. pl = self._dcpl
  541. return tuple([pl.get_filter(i)[3].decode('utf-8', 'surrogateescape')
  542. for i in range(pl.get_nfilters())])
  543. @property
  544. @with_phil
  545. def shuffle(self):
  546. """Shuffle filter present (T/F)"""
  547. return 'shuffle' in self._filters
  548. @property
  549. @with_phil
  550. def fletcher32(self):
  551. """Fletcher32 filter is present (T/F)"""
  552. return 'fletcher32' in self._filters
  553. @property
  554. @with_phil
  555. def scaleoffset(self):
  556. """Scale/offset filter settings. For integer data types, this is
  557. the number of bits stored, or 0 for auto-detected. For floating
  558. point data types, this is the number of decimal places retained.
  559. If the scale/offset filter is not in use, this is None."""
  560. try:
  561. return self._filters['scaleoffset'][1]
  562. except KeyError:
  563. return None
  564. @property
  565. @with_phil
  566. def external(self):
  567. """External file settings. Returns a list of tuples of
  568. (name, offset, size) for each external file entry, or returns None
  569. if no external files are used."""
  570. count = self._dcpl.get_external_count()
  571. if count<=0:
  572. return None
  573. ext_list = list()
  574. for x in range(count):
  575. (name, offset, size) = self._dcpl.get_external(x)
  576. ext_list.append( (filename_decode(name), offset, size) )
  577. return ext_list
  578. @property
  579. @with_phil
  580. def maxshape(self):
  581. """Shape up to which this dataset can be resized. Axes with value
  582. None have no resize limit. """
  583. space = self.id.get_space()
  584. dims = space.get_simple_extent_dims(True)
  585. if dims is None:
  586. return None
  587. return tuple(x if x != h5s.UNLIMITED else None for x in dims)
  588. @property
  589. @with_phil
  590. def fillvalue(self):
  591. """Fill value for this dataset (0 by default)"""
  592. arr = numpy.zeros((1,), dtype=self.dtype)
  593. self._dcpl.get_fill_value(arr)
  594. return arr[0]
  595. @cached_property
  596. @with_phil
  597. def _extent_type(self):
  598. """Get extent type for this dataset - SIMPLE, SCALAR or NULL"""
  599. return self.id.get_space().get_simple_extent_type()
  600. @cached_property
  601. def _is_empty(self):
  602. """Check if extent type is empty"""
  603. return self._extent_type == h5s.NULL
  604. @cached_property
  605. def _dcpl(self):
  606. """
  607. The dataset creation property list used when this dataset was created.
  608. """
  609. return self.id.get_create_plist()
  610. @cached_property
  611. def _filters(self):
  612. """
  613. The active filters of the dataset.
  614. """
  615. return filters.get_filters(self._dcpl)
  616. @with_phil
  617. def __init__(self, bind, *, readonly=False):
  618. """ Create a new Dataset object by binding to a low-level DatasetID.
  619. """
  620. if not isinstance(bind, h5d.DatasetID):
  621. raise ValueError("%s is not a DatasetID" % bind)
  622. super().__init__(bind)
  623. self._dxpl = h5p.create(h5p.DATASET_XFER)
  624. self._readonly = readonly
  625. self._cache_props = {}
  626. def resize(self, size, axis=None):
  627. """ Resize the dataset, or the specified axis.
  628. The dataset must be stored in chunked format; it can be resized up to
  629. the "maximum shape" (keyword maxshape) specified at creation time.
  630. The rank of the dataset cannot be changed.
  631. "Size" should be a shape tuple, or if an axis is specified, an integer.
  632. BEWARE: This functions differently than the NumPy resize() method!
  633. The data is not "reshuffled" to fit in the new shape; each axis is
  634. grown or shrunk independently. The coordinates of existing data are
  635. fixed.
  636. """
  637. with phil:
  638. if self.chunks is None:
  639. raise TypeError("Only chunked datasets can be resized")
  640. if axis is not None:
  641. if not (axis >=0 and axis < self.id.rank):
  642. raise ValueError("Invalid axis (0 to %s allowed)" % (self.id.rank-1))
  643. try:
  644. newlen = int(size)
  645. except TypeError:
  646. raise TypeError("Argument must be a single int if axis is specified") from None
  647. size = list(self.shape)
  648. size[axis] = newlen
  649. size = tuple(size)
  650. self.id.set_extent(size)
  651. #h5f.flush(self.id) # THG recommends
  652. @with_phil
  653. def __len__(self):
  654. """ The size of the first axis. TypeError if scalar.
  655. Limited to 2**32 on 32-bit systems; Dataset.len() is preferred.
  656. """
  657. size = self.len()
  658. if size > sys.maxsize:
  659. raise OverflowError("Value too big for Python's __len__; use Dataset.len() instead.")
  660. return size
  661. def len(self):
  662. """ The size of the first axis. TypeError if scalar.
  663. Use of this method is preferred to len(dset), as Python's built-in
  664. len() cannot handle values greater then 2**32 on 32-bit systems.
  665. """
  666. with phil:
  667. shape = self.shape
  668. if len(shape) == 0:
  669. raise TypeError("Attempt to take len() of scalar dataset")
  670. return shape[0]
  671. @with_phil
  672. def __iter__(self):
  673. """ Iterate over the first axis. TypeError if scalar.
  674. BEWARE: Modifications to the yielded data are *NOT* written to file.
  675. """
  676. shape = self.shape
  677. if len(shape) == 0:
  678. raise TypeError("Can't iterate over a scalar dataset")
  679. for i in range(shape[0]):
  680. yield self[i]
  681. @with_phil
  682. def iter_chunks(self, sel=None):
  683. """ Return chunk iterator. If set, the sel argument is a slice or
  684. tuple of slices that defines the region to be used. If not set, the
  685. entire dataspace will be used for the iterator.
  686. For each chunk within the given region, the iterator yields a tuple of
  687. slices that gives the intersection of the given chunk with the
  688. selection area.
  689. A TypeError will be raised if the dataset is not chunked.
  690. A ValueError will be raised if the selection region is invalid.
  691. """
  692. return ChunkIterator(self, sel)
  693. @cached_property
  694. def _fast_read_ok(self):
  695. """Is this dataset suitable for simple reading"""
  696. return (
  697. self._extent_type == h5s.SIMPLE
  698. and isinstance(self.id.get_type(), (h5t.TypeIntegerID, h5t.TypeFloatID))
  699. )
  @with_phil
  def __getitem__(self, args, new_dtype=None):
      """ Read a slice from the HDF5 dataset.

      ``args`` may mix indexing arguments with field names (strings) in any
      order; more than one field name is allowed. A single RegionReference
      is also accepted as the only argument.

      new_dtype
          dtype to read the data as. Defaults to the dataset's own dtype.

      Raises TypeError when indexing with None (np.newaxis), and
      ValueError when slicing an empty dataset or when a region reference
      points to a different dataset.
      """
      if not isinstance(args, tuple):
          args = (args,)

      # Test identity element by element: 'None in args' would compare with
      # ==, which fails on arrays.
      for item in args:
          if item is None:
              raise TypeError("Indexing with None (or np.newaxis) is not supported")

      if self._fast_read_ok and (new_dtype is None):
          try:
              return self._fast_reader.read(args)
          except TypeError:
              pass  # Fall back to Python read pathway below

      # "Select everything" means no arguments at all, or a lone Ellipsis.
      # 'is Ellipsis' avoids an equality comparison with an array, which
      # would return an array rather than a boolean.
      wants_everything = args == () or (len(args) == 1 and args[0] is Ellipsis)

      if self._is_empty:
          if wants_everything:
              return Empty(self.dtype)
          raise ValueError("Empty datasets cannot be sliced")

      # Split field names from the remaining indexing arguments.
      field_names = []
      index_args = []
      for item in args:
          if isinstance(item, str):
              field_names.append(item)
          else:
              index_args.append(item)

      if field_names:
          # Read a subset of the fields in this structured dtype. A single
          # name is passed on its own so the simpler dtype of that field
          # is used.
          if len(field_names) == 1:
              wanted = field_names[0]
          else:
              wanted = tuple(field_names)
          return self.fields(wanted, _prior_dtype=new_dtype)[tuple(index_args)]

      if new_dtype is None:
          new_dtype = self.dtype
      mtype = h5t.py_create(new_dtype)

      # === Special-case region references ====

      if len(args) == 1 and isinstance(args[0], h5r.RegionReference):
          region_ref = args[0]
          target = h5r.dereference(region_ref, self.id)
          if target != self.id:
              raise ValueError("Region reference must point to this dataset")

          region_space = h5r.get_region(region_ref, self.id)
          region_shape = sel.guess_shape(region_space)
          if region_shape is None:
              # 0D with no data (NULL or deselected SCALAR)
              return Empty(new_dtype)

          out = numpy.zeros(region_shape, dtype=new_dtype)
          if out.size != 0:
              mem_space = h5s.create_simple(region_shape)
              mem_space.select_all()
              self.id.read(mem_space, region_space, out, mtype)
          return out

      # === Check for zero-sized datasets =====

      if self.size == 0 and wants_everything:
          return numpy.zeros(self.shape, dtype=new_dtype)

      # === Scalar dataspaces =================

      if self.shape == ():
          file_space = self.id.get_space()
          selection = sel2.select_read(file_space, args)
          mem_shape = selection.mshape
          arr = numpy.zeros(() if mem_shape is None else mem_shape, dtype=new_dtype)
          for mem_space, sel_space in selection:
              self.id.read(mem_space, sel_space, arr, mtype)
          # No memory shape: unwrap the 0-d array into a scalar
          return arr[()] if mem_shape is None else arr

      # === Everything else ===================

      # Perform the dataspace selection.
      selection = sel.select(self.shape, args, dataset=self)

      if selection.nselect == 0:
          return numpy.zeros(selection.array_shape, dtype=new_dtype)

      arr = numpy.zeros(selection.array_shape, new_dtype, order='C')

      # Perform the actual read
      mem_space = h5s.create_simple(selection.mshape)
      file_space = selection.id
      self.id.read(mem_space, file_space, arr, mtype, dxpl=self._dxpl)

      # Patch up the output for NumPy: 0 dim array -> numpy scalar
      return arr[()] if arr.shape == () else arr
  @with_phil
  def __setitem__(self, args, val):
      """ Write to the HDF5 dataset from a Numpy array.

      ``args`` may mix indexing arguments with field names (strings).
      NumPy's broadcasting rules are honored for "simple" indexing
      (slices and integers). For advanced indexing, the shapes must
      match.

      Raises TypeError for a trailing-shape mismatch on array dtypes, for
      field names on a non-compound dataset, and for scalar broadcasting
      onto array dtypes; raises ValueError for unknown field names.
      """
      if not isinstance(args, tuple):
          args = (args,)

      # Sort field names from the slicing arguments
      field_items = []
      index_items = []
      for item in args:
          if isinstance(item, str):
              field_items.append(item)
          else:
              index_items.append(item)
      names = tuple(field_items)
      args = tuple(index_items)

      dset_dtype = self.dtype

      # Generally we try to avoid converting the arrays on the Python
      # side.  However, for compound literals this is unavoidable.
      vlen = h5t.check_vlen_dtype(dset_dtype)
      if vlen is not None and vlen not in (bytes, str):
          try:
              val = numpy.asarray(val, dtype=vlen)
          except (ValueError, TypeError):
              # Possibly ragged input: convert each element on its own
              try:
                  val = numpy.array(
                      [numpy.array(element, dtype=vlen) for element in val],
                      dtype=dset_dtype,
                  )
              except (ValueError, TypeError):
                  pass
          if vlen == val.dtype:
              # Box each innermost 1-D array as one object element
              if val.ndim > 1:
                  leading = val.shape[:-1]
                  boxed = numpy.empty(shape=leading, dtype=object)
                  rows = val.reshape((product(leading), val.shape[-1]))
                  boxed.ravel()[:] = list(rows)
              else:
                  boxed = numpy.array([None], dtype=object)
                  boxed[0] = val
              val = boxed
      elif dset_dtype.kind == "O" or (
          dset_dtype.kind == 'V'
          and (not isinstance(val, numpy.ndarray) or val.dtype.kind != 'V')
          and (dset_dtype.subdtype is None)
      ):
          single_field = len(names) == 1 and dset_dtype.fields is not None
          if single_field:
              # Single field selected for write, from a non-array source
              field = names[0]
              if field not in dset_dtype.fields:
                  raise ValueError("No such field for indexing: %s" % field)
              target = dset_dtype.fields[field][0]
          else:
              target = dset_dtype

          val = numpy.asarray(val, dtype=target.base, order='C')
          if single_field:
              val = val.view(numpy.dtype([(names[0], target)]))
              val = val.reshape(val.shape[:len(val.shape) - len(target.shape)])
      elif (
          dset_dtype.kind == 'S'
          and (h5t.check_string_dtype(dset_dtype).encoding == 'utf-8')
          and (find_item_type(val) is str)
      ):
          # Writing str objects to a fixed-length UTF-8 string dataset.
          # Numpy's normal conversion only handles ASCII characters, but
          # when the destination is UTF-8, we want to allow any unicode.
          # This *doesn't* handle numpy fixed-length unicode data ('U' dtype),
          # as HDF5 has no equivalent, and converting fixed length UTF-32
          # to variable length UTF-8 would obscure what's going on.
          str_array = numpy.asarray(val, order='C', dtype=object)
          encoded = [s.encode('utf-8') for s in str_array.flat]
          val = numpy.array(encoded, dtype=dset_dtype).reshape(str_array.shape)
      else:
          # If the input data is already an array, let HDF5 do the conversion.
          # If it's a list or similar, don't make numpy guess a dtype for it.
          if isinstance(val, numpy.ndarray):
              dt = None
          else:
              dt = dset_dtype.base
          val = numpy.asarray(val, order='C', dtype=dt)

      # Check for array dtype compatibility and convert
      if dset_dtype.subdtype is not None:
          shp = dset_dtype.subdtype[1]
          valshp = val.shape[-len(shp):]
          if valshp != shp:  # Last dimension has to match
              raise TypeError("When writing to array types, last N dimensions have to match (got %s, but should be %s)" % (valshp, shp,))
          mtype = h5t.py_create(numpy.dtype((val.dtype, shp)))
          mshape = val.shape[0:len(val.shape)-len(shp)]

      # Make a compound memory type if field-name slicing is required
      elif names:
          mshape = val.shape

          # Catch common errors
          if dset_dtype.fields is None:
              raise TypeError("Illegal slicing argument (not a compound dataset)")
          missing = [x for x in names if x not in dset_dtype.fields]
          if missing:
              quoted = ", ".join('"%s"' % x for x in missing)
              raise ValueError("Illegal slicing argument (fields %s not in dataset type)" % quoted)

          if len(names) == 1 and val.dtype.fields is None:
              # Write non-compound source into a single dataset field
              subtype = h5t.py_create(val.dtype)
              mtype = h5t.create(h5t.COMPOUND, subtype.get_size())
              mtype.insert(self._e(names[0]), 0, subtype)
          else:
              # Make a new source type keeping only the requested fields,
              # in source order
              kept = [x for x in val.dtype.names if x in names]
              mtype = h5t.create(h5t.COMPOUND, val.dtype.itemsize)
              for fieldname in kept:
                  field_info = val.dtype.fields[fieldname]
                  subtype = h5t.py_create(field_info[0])
                  mtype.insert(self._e(fieldname), field_info[1], subtype)

      # Use mtype derived from array (let DatasetID.write figure it out)
      else:
          mshape = val.shape
          mtype = None

      # Perform the dataspace selection
      selection = sel.select(self.shape, args, dataset=self)

      # Broadcast scalars if necessary.
      # Letting the write broadcast a scalar over the destination is slow,
      # so an intermediate array is filled with the scalar first. It has
      # the full selection shape when the dataset is chunked and the
      # selection is no larger than one chunk; otherwise it only spans the
      # last dimension of the selection. The chunk size is taken as a
      # reasonable bound on memory: if an array of that size cannot be
      # allocated, the user program has little hope of going much further.
      # Solves h5py issue #1067
      if mshape == () and selection.array_shape != ():
          if dset_dtype.subdtype is not None:
              raise TypeError("Scalar broadcasting is not supported for array dtypes")
          if self.chunks and (product(self.chunks) >= product(selection.array_shape)):
              fill_shape = selection.array_shape
          else:
              fill_shape = selection.array_shape[-1]
          filled = numpy.empty(fill_shape, dtype=val.dtype)
          filled[...] = val
          val = filled
          mshape = val.shape

      # Perform the write, with broadcasting
      mem_space = h5s.create_simple(selection.expand_shape(mshape))
      for file_space in selection.broadcast(mshape):
          self.id.write(mem_space, file_space, val, mtype, dxpl=self._dxpl)
  def read_direct(self, dest, source_sel=None, dest_sel=None):
      """ Read data directly from HDF5 into an existing NumPy array.

      dest
          Destination array; it must be C-contiguous and writable.
      source_sel
          Selection within this dataset, as produced by numpy.s_[<args>].
          Defaults to the whole dataset.
      dest_sel
          Selection within ``dest``, as produced by numpy.s_[<args>].
          Defaults to the whole destination array.

      Broadcasting is supported for simple indexing.
      Raises TypeError if the dataset is empty.
      """
      with phil:
          if self._is_empty:
              raise TypeError("Empty datasets have no numpy representation")

          # File-side selection
          source_sel = (
              sel.SimpleSelection(self.shape)
              if source_sel is None
              else sel.select(self.shape, source_sel, self)  # for numpy.s_
          )
          file_space = source_sel.id

          # Memory-side selection
          dest_sel = (
              sel.SimpleSelection(dest.shape)
              if dest_sel is None
              else sel.select(dest.shape, dest_sel)
          )

          for mem_space in dest_sel.broadcast(source_sel.array_shape):
              self.id.read(mem_space, file_space, dest, dxpl=self._dxpl)
  def write_direct(self, source, source_sel=None, dest_sel=None):
      """ Write data directly to HDF5 from a NumPy array.

      source
          Source array; it must be C-contiguous.
      source_sel
          Selection within ``source``, as produced by numpy.s_[<args>].
          Defaults to the whole source array.
      dest_sel
          Selection within this dataset, as produced by numpy.s_[<args>].
          Defaults to the whole dataset.

      Broadcasting is supported for simple indexing.
      Raises TypeError if the dataset is empty.
      """
      with phil:
          if self._is_empty:
              raise TypeError("Empty datasets cannot be written to")

          # Memory-side selection
          source_sel = (
              sel.SimpleSelection(source.shape)
              if source_sel is None
              else sel.select(source.shape, source_sel)  # for numpy.s_
          )
          mem_space = source_sel.id

          # File-side selection
          dest_sel = (
              sel.SimpleSelection(self.shape)
              if dest_sel is None
              else sel.select(self.shape, dest_sel, self)
          )

          for file_space in dest_sel.broadcast(source_sel.array_shape):
              self.id.write(mem_space, file_space, source, dxpl=self._dxpl)
  @with_phil
  def __array__(self, dtype=None, copy=None):
      """ Create a Numpy array containing the whole dataset.

      DON'T THINK THIS MEANS DATASETS ARE INTERCHANGEABLE WITH ARRAYS.
      For one thing, the whole dataset is read every time this method is
      called.

      dtype
          dtype of the returned array. Defaults to the dataset's dtype.
      copy
          ``False`` is rejected with ValueError, because a read always
          allocates memory. Other values are accepted.
      """
      if copy is False:
          raise ValueError(
              f"Dataset.__array__ received {copy=} "
              "but memory allocation cannot be avoided on read"
          )

      shape = self.shape
      out_dtype = self.dtype if dtype is None else dtype
      arr = numpy.zeros(shape, dtype=out_dtype)

      # Zero-size datasets have nothing to read; return the empty array
      if self.size != 0:
          self.read_direct(arr)
      return arr
  @with_phil
  def __repr__(self):
      """Return a short description: name, shape and dtype string.

      A falsy dataset object is reported as closed, and a dataset whose
      name is None is shown as "(anonymous)".
      """
      if not self:
          return "<Closed HDF5 dataset>"

      name = self.name
      if name is not None:
          # Show only the last path component, in double quotes
          name = pp.basename(pp.normpath(name))
          name = f'"{name}"'
      else:
          name = "(anonymous)"
      return f'<HDF5 dataset {name}: shape {self.shape}, type "{self.dtype.str}">'
  @with_phil
  def refresh(self):
      """ Reload the dataset metadata from the file.

      Refreshes the low-level dataset id, then clears ``_cache_props`` so
      that stale values are dropped.

      This is part of the SWMR features.
      """
      dataset_id = self._id
      dataset_id.refresh()
      self._cache_props.clear()
  @with_phil
  def flush(self):
      """ Write the dataset's data and metadata out to the file.

      For a chunked dataset, the raw data chunks are written to the file.

      This is part of the SWMR features.
      """
      dataset_id = self._id
      dataset_id.flush()
  # These members are only defined when vds_support is true
  if vds_support:
      @property
      @with_phil
      def is_virtual(self):
          """True if the dataset's layout is VIRTUAL"""
          layout = self._dcpl.get_layout()
          return layout == h5d.VIRTUAL

      @with_phil
      def virtual_sources(self):
          """Return the data mappings of a virtual dataset as a list.

          Each entry is a VDSmap built from the virtual dataspace, source
          file name, source dataset name and source dataspace of one
          mapping. Raises RuntimeError if this is not a virtual dataset.
          """
          if not self.is_virtual:
              raise RuntimeError("Not a virtual dataset")

          dcpl = self._dcpl
          mappings = []
          for idx in range(dcpl.get_virtual_count()):
              mapping = VDSmap(
                  dcpl.get_virtual_vspace(idx),
                  dcpl.get_virtual_filename(idx),
                  dcpl.get_virtual_dsetname(idx),
                  dcpl.get_virtual_srcspace(idx),
              )
              mappings.append(mapping)
          return mappings
  @with_phil
  def make_scale(self, name=''):
      """Turn this dataset into an HDF5 dimension scale.

      Afterwards it can be attached to dimensions of other datasets::

          other_ds.dims[0].attach_scale(ds)

      name
          Optional name to associate with the scale (default: empty).
      """
      dataset_id = self._id
      encoded_name = self._e(name)
      h5ds.set_scale(dataset_id, encoded_name)
  @property
  @with_phil
  def is_scale(self):
      """Whether this dataset is also a dimension scale.

      ``True`` if it is, ``False`` otherwise.
      """
      scale_flag = h5ds.is_scale(self._id)
      return scale_flag