| 1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192939495969798991001011021031041051061071081091101111121131141151161171181191201211221231241251261271281291301311321331341351361371381391401411421431441451461471481491501511521531541551561571581591601611621631641651661671681691701711721731741751761771781791801811821831841851861871881891901911921931941951961971981992002012022032042052062072082092102112122132142152162172182192202212222232242252262272282292302312322332342352362372382392402412422432442452462472482492502512522532542552562572582592602612622632642652662672682692702712722732742752762772782792802812822832842852862872882892902912922932942952962972982993003013023033043053063073083093103113123133143153163173183193203213223233243253263273283293303313323333343353363373383393403413423433443453463473483493503513523533543553563573583593603613623633643653663673683693703713723733743753763773783793803813823833843853863873883893903913923933943953963973983994004014024034044054064074084094104114124134144154164174184194204214224234244254264274284294304314324334344354364374384394404414424434444454464474484494504514524534544554564574584594604614624634644654664674684694704714724734744754764774784794804814824834844854864874884894904914924934944954964974984995005015025035045055065075085095105115125135145155165175185195205215225235245255265275285295305315325335345355365375385395405415425435445455465475485495505515525535545555565575585595605615625635645655665675685695705715725735745755765775785795805815825835845855865875885895905915925935945955965975985996006016026036046056066076086096106116126136146156166176186196206216226236246256266276286296306316326336346356366376386396406416426436446456466476486496506516526536546556566576586596606616626636646656666676686696706716726736746756766776786796806816826836846856866876886896906916926936946956966976986997007017027037047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252 |
- # This file is part of h5py, a Python interface to the HDF5 library.
- #
- # http://www.h5py.org
- #
- # Copyright 2008-2020 Andrew Collette and contributors
- #
- # License: Standard 3-clause BSD; see "license.txt" for full license terms
- # and contributor agreement.
- """
- Implements support for high-level dataset access.
- """
- import posixpath as pp
- import sys
- from abc import ABC, abstractmethod
- from warnings import warn
- import numpy
- import h5py.h5t
- from .. import h5, h5s, h5t, h5r, h5d, h5p, h5fd, h5ds, _selector
- from ..h5py_warnings import H5pyDeprecationWarning
- from .base import (
- array_for_new_object, cached_property, Empty, find_item_type, HLObject,
- phil, product, with_phil,
- )
- from . import filters
- from . import selections as sel
- from . import selections2 as sel2
- from .datatype import Datatype
- from .compat import filename_decode
- from .vds import VDSmap, vds_support
- _LEGACY_GZIP_COMPRESSION_VALS = frozenset(range(10))
- MPI = h5.get_config().mpi
- def make_new_dset(parent, shape=None, dtype=None, data=None, name=None,
- chunks=None, compression=None, shuffle=None,
- fletcher32=None, maxshape=None, compression_opts=None,
- fillvalue=None, scaleoffset=None, track_times=False,
- external=None, track_order=None, dcpl=None, dapl=None,
- efile_prefix=None, virtual_prefix=None, allow_unknown_filter=False,
- rdcc_nslots=None, rdcc_nbytes=None, rdcc_w0=None, *,
- fill_time=None):
- """ Return a new low-level dataset identifier """
- # Convert data to a C-contiguous ndarray
- if data is not None and not isinstance(data, Empty):
- data = array_for_new_object(data, specified_dtype=dtype)
- # Validate shape
- if shape is None:
- if data is None:
- if dtype is None:
- raise TypeError("One of data, shape or dtype must be specified")
- data = Empty(dtype)
- shape = data.shape
- else:
- shape = (shape,) if isinstance(shape, int) else tuple(shape)
- if data is not None and (product(shape) != product(data.shape)):
- raise ValueError("Shape tuple is incompatible with data")
- if isinstance(maxshape, int):
- maxshape = (maxshape,)
- tmp_shape = maxshape if maxshape is not None else shape
- # Validate chunk shape
- if isinstance(chunks, int) and not isinstance(chunks, bool):
- chunks = (chunks,)
- # Logically, the following `zip` could be strict, but it's happening
- # before we've done checks elsewhere that raise more descriptive errors
- if isinstance(chunks, tuple) and any(
- chunk > dim for dim, chunk in zip(tmp_shape, chunks, strict=False) if dim is not None
- ):
- errmsg = "Chunk shape must not be greater than data shape in any dimension. "\
- "{} is not compatible with {}".format(chunks, shape)
- raise ValueError(errmsg)
- if isinstance(dtype, Datatype):
- # Named types are used as-is
- tid = dtype.id
- dtype = tid.dtype # Following code needs this
- elif isinstance(dtype, h5py.h5t.TypeID): # Low-level HDF5 data type
- tid = dtype
- dtype = tid.dtype
- else:
- # Validate dtype
- if dtype is None and data is None:
- warn(
- "Creating a dataset without passing data or dtype is deprecated. "
- "Pass an explicit dtype. Using dtype='f4' will keep the "
- "current default behaviour.",
- category=H5pyDeprecationWarning, stacklevel=3,
- )
- dtype = numpy.dtype("=f4")
- elif dtype is None and data is not None:
- dtype = data.dtype
- else:
- dtype = numpy.dtype(dtype)
- tid = h5t.py_create(dtype, logical=1)
- # Legacy
- if any((compression, shuffle, fletcher32, maxshape, scaleoffset)) and chunks is False:
- raise ValueError("Chunked format required for given storage options")
- # Legacy
- if compression is True:
- if compression_opts is None:
- compression_opts = 4
- compression = 'gzip'
- # Legacy
- if compression in _LEGACY_GZIP_COMPRESSION_VALS:
- if compression_opts is not None:
- raise TypeError("Conflict in compression options")
- compression_opts = compression
- compression = 'gzip'
- dcpl = filters.fill_dcpl(
- dcpl or h5p.create(h5p.DATASET_CREATE), shape, dtype,
- chunks, compression, compression_opts, shuffle, fletcher32,
- maxshape, scaleoffset, external, allow_unknown_filter,
- fill_time=fill_time)
- # Check that compression roundtrips correctly if it was specified
- if compression is not None:
- if isinstance(compression, filters.FilterRefBase):
- compression = compression.filter_id
- if isinstance(compression, int):
- compression = filters.get_filter_name(compression)
- if compression not in filters.get_filters(dcpl):
- raise ValueError(f'compression {compression!r} not in filters {filters.get_filters(dcpl)!r}')
- if fillvalue is not None:
- # prepare string-type dtypes for fillvalue
- string_info = h5t.check_string_dtype(dtype)
- if string_info is not None:
- # fake vlen dtype for fixed len string fillvalue
- # to not trigger unwanted encoding
- dtype = h5t.string_dtype(string_info.encoding)
- fillvalue = numpy.array(fillvalue, dtype=dtype)
- else:
- fillvalue = numpy.array(fillvalue)
- dcpl.set_fill_value(fillvalue)
- if track_times is None:
- # In case someone explicitly passes None for the default
- track_times = False
- if track_times in (True, False):
- dcpl.set_obj_track_times(track_times)
- else:
- raise TypeError("track_times must be either True or False")
- if track_order is True:
- dcpl.set_attr_creation_order(
- h5p.CRT_ORDER_TRACKED | h5p.CRT_ORDER_INDEXED)
- elif track_order is False:
- dcpl.set_attr_creation_order(0)
- elif track_order is not None:
- raise TypeError("track_order must be either True or False")
- if maxshape is not None:
- maxshape = tuple(m if m is not None else h5s.UNLIMITED for m in maxshape)
- if any([efile_prefix, virtual_prefix, rdcc_nbytes, rdcc_nslots, rdcc_w0]):
- dapl = dapl or h5p.create(h5p.DATASET_ACCESS)
- if efile_prefix is not None:
- dapl.set_efile_prefix(efile_prefix)
- if virtual_prefix is not None:
- dapl.set_virtual_prefix(virtual_prefix)
- if rdcc_nbytes or rdcc_nslots or rdcc_w0:
- cache_settings = list(dapl.get_chunk_cache())
- if rdcc_nslots is not None:
- cache_settings[0] = rdcc_nslots
- if rdcc_nbytes is not None:
- cache_settings[1] = rdcc_nbytes
- if rdcc_w0 is not None:
- cache_settings[2] = rdcc_w0
- dapl.set_chunk_cache(*cache_settings)
- if isinstance(data, Empty):
- sid = h5s.create(h5s.NULL)
- else:
- sid = h5s.create_simple(shape, maxshape)
- dset_id = h5d.create(parent.id, name, tid, sid, dcpl=dcpl, dapl=dapl)
- if (data is not None) and (not isinstance(data, Empty)):
- dset_id.write(h5s.ALL, h5s.ALL, data)
- return dset_id
- def open_dset(parent, name, dapl=None, efile_prefix=None, virtual_prefix=None,
- rdcc_nslots=None, rdcc_nbytes=None, rdcc_w0=None, **kwds):
- """ Return an existing low-level dataset identifier """
- if any([efile_prefix, virtual_prefix, rdcc_nbytes, rdcc_nslots, rdcc_w0]):
- dapl = dapl or h5p.create(h5p.DATASET_ACCESS)
- if efile_prefix is not None:
- dapl.set_efile_prefix(efile_prefix)
- if virtual_prefix is not None:
- dapl.set_virtual_prefix(virtual_prefix)
- if rdcc_nbytes or rdcc_nslots or rdcc_w0:
- cache_settings = list(dapl.get_chunk_cache())
- if rdcc_nslots is not None:
- cache_settings[0] = rdcc_nslots
- if rdcc_nbytes is not None:
- cache_settings[1] = rdcc_nbytes
- if rdcc_w0 is not None:
- cache_settings[2] = rdcc_w0
- dapl.set_chunk_cache(*cache_settings)
- dset_id = h5d.open(parent.id, name, dapl=dapl)
- return dset_id
- class AbstractView(ABC):
- _dset: "Dataset"
- def __init__(self, dset):
- self._dset = dset
- def __len__(self):
- return len(self._dset)
- @property
- @abstractmethod
- def dtype(self):
- ... # pragma: nocover
- @property
- def ndim(self):
- return self._dset.ndim
- @property
- def shape(self):
- return self._dset.shape
- @property
- def size(self):
- return self._dset.size
- @abstractmethod
- def __getitem__(self, idx):
- ... # pragma: nocover
- def __array__(self, dtype=None, copy=None):
- if copy is False:
- raise ValueError(
- f"{self.__class__.__name__}.__array__ received {copy=} "
- "but memory allocation cannot be avoided on read"
- )
- # If self.ndim == 0, convert np.generic back to np.ndarray
- return numpy.asarray(self[()], dtype=dtype or self.dtype)
- class AsTypeView(AbstractView):
- """Wrapper to convert data on reading from a dataset.
- """
- def __init__(self, dset, dtype):
- super().__init__(dset)
- self._dtype = dtype
- @property
- def dtype(self):
- return self._dtype
- def __getitem__(self, idx):
- return self._dset.__getitem__(idx, new_dtype=self._dtype)
- def __array__(self, dtype=None, copy=None):
- return self._dset.__array__(dtype or self._dtype, copy)
- class AsStrView(AbstractView):
- """Wrapper to decode strings on reading the dataset"""
- def __init__(self, dset, encoding, errors='strict'):
- super().__init__(dset)
- self.encoding = encoding
- self.errors = errors
- @property
- def dtype(self):
- return numpy.dtype(object)
- def __getitem__(self, idx):
- bytes_arr = self._dset[idx]
- # numpy.char.decode() seems like the obvious thing to use. But it only
- # accepts numpy string arrays, not object arrays of bytes (which we
- # return from HDF5 variable-length strings). And the numpy
- # implementation is not faster than doing it with a loop; in fact, by
- # not converting the result to a numpy unicode array, the
- # naive way can be faster! (Comparing with numpy 1.18.4, June 2020)
- if numpy.isscalar(bytes_arr):
- return bytes_arr.decode(self.encoding, self.errors)
- return numpy.array([
- b.decode(self.encoding, self.errors) for b in bytes_arr.flat
- ], dtype=object).reshape(bytes_arr.shape)
- class FieldsView(AbstractView):
- """Wrapper to extract named fields from a dataset with a struct dtype"""
- def __init__(self, dset, prior_dtype, names):
- super().__init__(dset)
- if isinstance(names, str):
- self.extract_field = names
- names = [names]
- else:
- self.extract_field = None
- self.read_dtype = readtime_dtype(prior_dtype, names)
- @property
- def dtype(self):
- t = self.read_dtype
- if self.extract_field is not None:
- t = t[self.extract_field]
- return t
- def __getitem__(self, idx):
- data = self._dset.__getitem__(idx, new_dtype=self.read_dtype)
- if self.extract_field is not None:
- data = data[self.extract_field]
- return data
- def readtime_dtype(basetype, names):
- """Make a NumPy compound dtype with a subset of available fields"""
- if basetype.names is None: # Names provided, but not compound
- raise ValueError("Field names only allowed for compound types")
- for name in names: # Check all names are legal
- if name not in basetype.names:
- raise ValueError("Field %s does not appear in this type." % name)
- return numpy.dtype([(name, basetype.fields[name][0]) for name in names])
- if MPI:
- class CollectiveContext:
- """ Manages collective I/O in MPI mode """
- # We don't bother with _local as threads are forbidden in MPI mode
- def __init__(self, dset):
- self._dset = dset
- def __enter__(self):
- # pylint: disable=protected-access
- self._dset._dxpl.set_dxpl_mpio(h5fd.MPIO_COLLECTIVE)
- def __exit__(self, *args):
- # pylint: disable=protected-access
- self._dset._dxpl.set_dxpl_mpio(h5fd.MPIO_INDEPENDENT)
- class ChunkIterator:
- """
- Class to iterate through list of chunks of a given dataset
- """
- def __init__(self, dset, source_sel=None):
- self._shape = dset.shape
- rank = len(dset.shape)
- if not dset.chunks:
- # can only use with chunked datasets
- raise TypeError("Chunked dataset required")
- self._layout = dset.chunks
- if source_sel is None:
- # select over entire dataset
- self._sel = tuple(
- slice(0, self._shape[dim]) for dim in range(rank)
- )
- else:
- if isinstance(source_sel, (slice, int)):
- sel = [source_sel]
- else:
- sel = list(source_sel)
- if len(sel) != rank:
- raise ValueError("Invalid selection - selection region must have same rank as dataset")
- for dim, s in enumerate(sel):
- start: int | None
- stop: int | None
- step: int | None
- match s:
- case int():
- start = s
- stop = s + 1
- step = None
- case slice():
- start = s.start or 0
- stop = s.stop or self._shape[dim]
- step = s.step
- case _:
- # TODO: use typing.assert_never when Python 3.10 is dropped
- raise AssertionError(f'{s}: Selection object must be a slice or integer')
- sel[dim] = slice(start, stop, step)
- self._sel = tuple(sel)
- self._chunk_index = []
- for dim in range(rank):
- s = self._sel[dim]
- if s.start < 0 or s.stop > self._shape[dim] or s.stop <= s.start:
- raise ValueError("Invalid selection - selection region must be within dataset space")
- index = s.start // self._layout[dim]
- self._chunk_index.append(index)
- def __iter__(self):
- return self
- def __next__(self):
- rank = len(self._shape)
- slices = []
- if rank == 0 or self._chunk_index[0] * self._layout[0] >= self._sel[0].stop:
- # ran past the last chunk, end iteration
- raise StopIteration()
- for dim in range(rank):
- s = self._sel[dim]
- start = self._chunk_index[dim] * self._layout[dim]
- stop = (self._chunk_index[dim] + 1) * self._layout[dim]
- # adjust the start if this is an edge chunk
- if start < s.start:
- start = s.start
- if stop > s.stop:
- stop = s.stop # trim to end of the selection
- s = slice(start, stop, 1)
- slices.append(s)
- # bump up the last index and carry forward if we run outside the selection
- dim = rank - 1
- while dim >= 0:
- s = self._sel[dim]
- self._chunk_index[dim] += 1
- chunk_end = self._chunk_index[dim] * self._layout[dim]
- if chunk_end < s.stop:
- # we still have room to extend along this dimensions
- return tuple(slices)
- if dim > 0:
- # reset to the start and continue iterating with higher dimension
- self._chunk_index[dim] = s.start // self._layout[dim]
- dim -= 1
- return tuple(slices)
- class Dataset(HLObject):
- """
- Represents an HDF5 dataset
- """
- def astype(self, dtype):
- """ Get a wrapper allowing you to perform reads to a
- different destination type, e.g.:
- >>> double_precision = dataset.astype('f8')[0:100:2]
- """
- dtype = numpy.dtype(dtype)
- if dtype == self.dtype:
- return self
- if dtype.kind == "T":
- string_info = h5t.check_string_dtype(self.dtype)
- if string_info is None:
- raise TypeError(
- f"dset.astype({dtype}) can only be used on datasets with "
- "an HDF5 string datatype"
- )
- return AsTypeView(self, dtype)
- def asstr(self, encoding=None, errors='strict'):
- """Get a wrapper to read string data as Python strings:
- >>> str_array = dataset.asstr()[:]
- The parameters have the same meaning as in ``bytes.decode()``.
- If ``encoding`` is unspecified, it will use the encoding in the HDF5
- datatype (either ascii or utf-8).
- .. note::
- On NumPy 2.0 and later, it is recommended to use native NumPy
- variable-width strings instead:
- >>> str_array = dataset.astype('T')[:]
- """
- string_info = h5t.check_string_dtype(self.dtype)
- if string_info is None:
- raise TypeError(
- "dset.asstr() can only be used on datasets with "
- "an HDF5 string datatype"
- )
- if encoding is None:
- encoding = string_info.encoding
- return AsStrView(self, encoding, errors=errors)
- def fields(self, names, *, _prior_dtype=None):
- """Get a wrapper to read a subset of fields from a compound data type:
- >>> 2d_coords = dataset.fields(['x', 'y'])[:]
- If names is a string, a single field is extracted, and the resulting
- arrays will have that dtype. Otherwise, it should be an iterable,
- and the read data will have a compound dtype.
- """
- if _prior_dtype is None:
- _prior_dtype = self.dtype
- return FieldsView(self, _prior_dtype, names)
- if MPI:
- @property
- @with_phil
- def collective(self):
- """ Context manager for MPI collective reads & writes """
- return CollectiveContext(self)
- @property
- def dims(self):
- """ Access dimension scales attached to this dataset. """
- from .dims import DimensionManager
- with phil:
- return DimensionManager(self)
- @property
- @with_phil
- def ndim(self):
- """Numpy-style attribute giving the number of dimensions"""
- return self.id.rank
- @property
- def shape(self):
- """Numpy-style shape tuple giving dataset dimensions"""
- if 'shape' in self._cache_props:
- return self._cache_props['shape']
- with phil:
- shape = self.id.shape
- # If the file is read-only, cache the shape to speed-up future uses.
- # This cache is invalidated by .refresh() when using SWMR.
- if self._readonly:
- self._cache_props['shape'] = shape
- return shape
- @shape.setter
- @with_phil
- def shape(self, shape):
- # pylint: disable=missing-docstring
- self.resize(shape)
- @property
- def size(self):
- """Numpy-style attribute giving the total dataset size"""
- if 'size' in self._cache_props:
- return self._cache_props['size']
- if self._is_empty:
- size = None
- else:
- size = product(self.shape)
- # If the file is read-only, cache the size to speed-up future uses.
- # This cache is invalidated by .refresh() when using SWMR.
- if self._readonly:
- self._cache_props['size'] = size
- return size
- @property
- def nbytes(self):
- """Numpy-style attribute giving the raw dataset size as the number of bytes"""
- size = self.size
- if size is None: # if we are an empty 0-D array, then there are no bytes in the dataset
- return 0
- return self.dtype.itemsize * size
- @property
- def _selector(self):
- """Internal object for optimised selection of data"""
- if '_selector' in self._cache_props:
- return self._cache_props['_selector']
- slr = _selector.Selector(self.id.get_space())
- # If the file is read-only, cache the reader to speed up future uses.
- # This cache is invalidated by .refresh() when using SWMR.
- if self._readonly:
- self._cache_props['_selector'] = slr
- return slr
- @property
- def _fast_reader(self):
- """Internal object for optimised reading of data"""
- if '_fast_reader' in self._cache_props:
- return self._cache_props['_fast_reader']
- rdr = _selector.Reader(self.id)
- # If the file is read-only, cache the reader to speed up future uses.
- # This cache is invalidated by .refresh() when using SWMR.
- if self._readonly:
- self._cache_props['_fast_reader'] = rdr
- return rdr
- @property
- @with_phil
- def dtype(self):
- """Numpy dtype representing the datatype"""
- return self.id.dtype
- @property
- @with_phil
- def chunks(self):
- """Dataset chunks (or None)"""
- dcpl = self._dcpl
- if dcpl.get_layout() == h5d.CHUNKED:
- return dcpl.get_chunk()
- return None
- @property
- @with_phil
- def compression(self):
- """Compression strategy (or None)"""
- for x in ('gzip','lzf','szip'):
- if x in self._filters:
- return x
- if any(f not in filters._COMP_FILTERS for f in self._filters):
- return 'unknown' # Filter from a plugin
- return None
- @property
- @with_phil
- def compression_opts(self):
- """ Compression setting. Int(0-9) for gzip, 2-tuple for szip. """
- return self._filters.get(self.compression, None)
- @property
- @with_phil
- def filter_ids(self):
- """Numeric IDs of HDF5 filters used for this dataset"""
- pl = self._dcpl
- return tuple([pl.get_filter(i)[0] for i in range(pl.get_nfilters())])
- @property
- @with_phil
- def filter_names(self):
- """Names, as stored in the file, of the filters used for this dataset"""
- pl = self._dcpl
- return tuple([pl.get_filter(i)[3].decode('utf-8', 'surrogateescape')
- for i in range(pl.get_nfilters())])
- @property
- @with_phil
- def shuffle(self):
- """Shuffle filter present (T/F)"""
- return 'shuffle' in self._filters
- @property
- @with_phil
- def fletcher32(self):
- """Fletcher32 filter is present (T/F)"""
- return 'fletcher32' in self._filters
- @property
- @with_phil
- def scaleoffset(self):
- """Scale/offset filter settings. For integer data types, this is
- the number of bits stored, or 0 for auto-detected. For floating
- point data types, this is the number of decimal places retained.
- If the scale/offset filter is not in use, this is None."""
- try:
- return self._filters['scaleoffset'][1]
- except KeyError:
- return None
- @property
- @with_phil
- def external(self):
- """External file settings. Returns a list of tuples of
- (name, offset, size) for each external file entry, or returns None
- if no external files are used."""
- count = self._dcpl.get_external_count()
- if count<=0:
- return None
- ext_list = list()
- for x in range(count):
- (name, offset, size) = self._dcpl.get_external(x)
- ext_list.append( (filename_decode(name), offset, size) )
- return ext_list
- @property
- @with_phil
- def maxshape(self):
- """Shape up to which this dataset can be resized. Axes with value
- None have no resize limit. """
- space = self.id.get_space()
- dims = space.get_simple_extent_dims(True)
- if dims is None:
- return None
- return tuple(x if x != h5s.UNLIMITED else None for x in dims)
- @property
- @with_phil
- def fillvalue(self):
- """Fill value for this dataset (0 by default)"""
- arr = numpy.zeros((1,), dtype=self.dtype)
- self._dcpl.get_fill_value(arr)
- return arr[0]
- @cached_property
- @with_phil
- def _extent_type(self):
- """Get extent type for this dataset - SIMPLE, SCALAR or NULL"""
- return self.id.get_space().get_simple_extent_type()
- @cached_property
- def _is_empty(self):
- """Check if extent type is empty"""
- return self._extent_type == h5s.NULL
- @cached_property
- def _dcpl(self):
- """
- The dataset creation property list used when this dataset was created.
- """
- return self.id.get_create_plist()
- @cached_property
- def _filters(self):
- """
- The active filters of the dataset.
- """
- return filters.get_filters(self._dcpl)
- @with_phil
- def __init__(self, bind, *, readonly=False):
- """ Create a new Dataset object by binding to a low-level DatasetID.
- """
- if not isinstance(bind, h5d.DatasetID):
- raise ValueError("%s is not a DatasetID" % bind)
- super().__init__(bind)
- self._dxpl = h5p.create(h5p.DATASET_XFER)
- self._readonly = readonly
- self._cache_props = {}
- def resize(self, size, axis=None):
- """ Resize the dataset, or the specified axis.
- The dataset must be stored in chunked format; it can be resized up to
- the "maximum shape" (keyword maxshape) specified at creation time.
- The rank of the dataset cannot be changed.
- "Size" should be a shape tuple, or if an axis is specified, an integer.
- BEWARE: This functions differently than the NumPy resize() method!
- The data is not "reshuffled" to fit in the new shape; each axis is
- grown or shrunk independently. The coordinates of existing data are
- fixed.
- """
- with phil:
- if self.chunks is None:
- raise TypeError("Only chunked datasets can be resized")
- if axis is not None:
- if not (axis >=0 and axis < self.id.rank):
- raise ValueError("Invalid axis (0 to %s allowed)" % (self.id.rank-1))
- try:
- newlen = int(size)
- except TypeError:
- raise TypeError("Argument must be a single int if axis is specified") from None
- size = list(self.shape)
- size[axis] = newlen
- size = tuple(size)
- self.id.set_extent(size)
- #h5f.flush(self.id) # THG recommends
- @with_phil
- def __len__(self):
- """ The size of the first axis. TypeError if scalar.
- Limited to 2**32 on 32-bit systems; Dataset.len() is preferred.
- """
- size = self.len()
- if size > sys.maxsize:
- raise OverflowError("Value too big for Python's __len__; use Dataset.len() instead.")
- return size
- def len(self):
- """ The size of the first axis. TypeError if scalar.
- Use of this method is preferred to len(dset), as Python's built-in
- len() cannot handle values greater then 2**32 on 32-bit systems.
- """
- with phil:
- shape = self.shape
- if len(shape) == 0:
- raise TypeError("Attempt to take len() of scalar dataset")
- return shape[0]
- @with_phil
- def __iter__(self):
- """ Iterate over the first axis. TypeError if scalar.
- BEWARE: Modifications to the yielded data are *NOT* written to file.
- """
- shape = self.shape
- if len(shape) == 0:
- raise TypeError("Can't iterate over a scalar dataset")
- for i in range(shape[0]):
- yield self[i]
- @with_phil
- def iter_chunks(self, sel=None):
- """ Return chunk iterator. If set, the sel argument is a slice or
- tuple of slices that defines the region to be used. If not set, the
- entire dataspace will be used for the iterator.
- For each chunk within the given region, the iterator yields a tuple of
- slices that gives the intersection of the given chunk with the
- selection area.
- A TypeError will be raised if the dataset is not chunked.
- A ValueError will be raised if the selection region is invalid.
- """
- return ChunkIterator(self, sel)
- @cached_property
- def _fast_read_ok(self):
- """Is this dataset suitable for simple reading"""
- return (
- self._extent_type == h5s.SIMPLE
- and isinstance(self.id.get_type(), (h5t.TypeIntegerID, h5t.TypeFloatID))
- )
- @with_phil
- def __getitem__(self, args, new_dtype=None):
- """ Read a slice from the HDF5 dataset.
- Takes slices and recarray-style field names (more than one is
- allowed!) in any order. Obeys basic NumPy rules, including
- broadcasting.
- Also supports:
- * Boolean "mask" array indexing
- """
- args = args if isinstance(args, tuple) else (args,)
- if any(a is None for a in args): # 'None in args' would fail on arrays
- raise TypeError("Indexing with None (or np.newaxis) is not supported")
- if self._fast_read_ok and (new_dtype is None):
- try:
- return self._fast_reader.read(args)
- except TypeError:
- pass # Fall back to Python read pathway below
- if self._is_empty:
- # Check 'is Ellipsis' to avoid equality comparison with an array:
- # array equality returns an array, not a boolean.
- if args == () or (len(args) == 1 and args[0] is Ellipsis):
- return Empty(self.dtype)
- raise ValueError("Empty datasets cannot be sliced")
- # Sort field names from the rest of the args.
- names = tuple(x for x in args if isinstance(x, str))
- if names:
- # Read a subset of the fields in this structured dtype
- if len(names) == 1:
- names = names[0] # Read with simpler dtype of this field
- args = tuple(x for x in args if not isinstance(x, str))
- return self.fields(names, _prior_dtype=new_dtype)[args]
- if new_dtype is None:
- new_dtype = self.dtype
- mtype = h5t.py_create(new_dtype)
- # === Special-case region references ====
- if len(args) == 1 and isinstance(args[0], h5r.RegionReference):
- obj = h5r.dereference(args[0], self.id)
- if obj != self.id:
- raise ValueError("Region reference must point to this dataset")
- sid = h5r.get_region(args[0], self.id)
- mshape = sel.guess_shape(sid)
- if mshape is None:
- # 0D with no data (NULL or deselected SCALAR)
- return Empty(new_dtype)
- out = numpy.zeros(mshape, dtype=new_dtype)
- if out.size == 0:
- return out
- sid_out = h5s.create_simple(mshape)
- sid_out.select_all()
- self.id.read(sid_out, sid, out, mtype)
- return out
- # === Check for zero-sized datasets =====
- if self.size == 0:
- # Check 'is Ellipsis' to avoid equality comparison with an array:
- # array equality returns an array, not a boolean.
- if args == () or (len(args) == 1 and args[0] is Ellipsis):
- return numpy.zeros(self.shape, dtype=new_dtype)
- # === Scalar dataspaces =================
- if self.shape == ():
- fspace = self.id.get_space()
- selection = sel2.select_read(fspace, args)
- if selection.mshape is None:
- arr = numpy.zeros((), dtype=new_dtype)
- else:
- arr = numpy.zeros(selection.mshape, dtype=new_dtype)
- for mspace, fspace in selection:
- self.id.read(mspace, fspace, arr, mtype)
- if selection.mshape is None:
- return arr[()]
- return arr
- # === Everything else ===================
- # Perform the dataspace selection.
- selection = sel.select(self.shape, args, dataset=self)
- if selection.nselect == 0:
- return numpy.zeros(selection.array_shape, dtype=new_dtype)
- arr = numpy.zeros(selection.array_shape, new_dtype, order='C')
- # Perform the actual read
- mspace = h5s.create_simple(selection.mshape)
- fspace = selection.id
- self.id.read(mspace, fspace, arr, mtype, dxpl=self._dxpl)
- # Patch up the output for NumPy
- if arr.shape == ():
- return arr[()] # 0 dim array -> numpy scalar
- return arr
- @with_phil
- def __setitem__(self, args, val):
- """ Write to the HDF5 dataset from a Numpy array.
- NumPy's broadcasting rules are honored, for "simple" indexing
- (slices and integers). For advanced indexing, the shapes must
- match.
- """
- args = args if isinstance(args, tuple) else (args,)
- # Sort field indices from the slicing
- names = tuple(x for x in args if isinstance(x, str))
- args = tuple(x for x in args if not isinstance(x, str))
- # Generally we try to avoid converting the arrays on the Python
- # side. However, for compound literals this is unavoidable.
- vlen = h5t.check_vlen_dtype(self.dtype)
- if vlen is not None and vlen not in (bytes, str):
- try:
- val = numpy.asarray(val, dtype=vlen)
- except (ValueError, TypeError):
- try:
- val = numpy.array([numpy.array(x, dtype=vlen)
- for x in val], dtype=self.dtype)
- except (ValueError, TypeError):
- pass
- if vlen == val.dtype:
- if val.ndim > 1:
- tmp = numpy.empty(shape=val.shape[:-1], dtype=object)
- tmp.ravel()[:] = [i for i in val.reshape(
- (product(val.shape[:-1]), val.shape[-1])
- )]
- else:
- tmp = numpy.array([None], dtype=object)
- tmp[0] = val
- val = tmp
- elif self.dtype.kind == "O" or \
- (self.dtype.kind == 'V' and \
- (not isinstance(val, numpy.ndarray) or val.dtype.kind != 'V') and \
- (self.dtype.subdtype is None)):
- if len(names) == 1 and self.dtype.fields is not None:
- # Single field selected for write, from a non-array source
- if not names[0] in self.dtype.fields:
- raise ValueError("No such field for indexing: %s" % names[0])
- dtype = self.dtype.fields[names[0]][0]
- cast_compound = True
- else:
- dtype = self.dtype
- cast_compound = False
- val = numpy.asarray(val, dtype=dtype.base, order='C')
- if cast_compound:
- val = val.view(numpy.dtype([(names[0], dtype)]))
- val = val.reshape(val.shape[:len(val.shape) - len(dtype.shape)])
- elif (self.dtype.kind == 'S'
- and (h5t.check_string_dtype(self.dtype).encoding == 'utf-8')
- and (find_item_type(val) is str)
- ):
- # Writing str objects to a fixed-length UTF-8 string dataset.
- # Numpy's normal conversion only handles ASCII characters, but
- # when the destination is UTF-8, we want to allow any unicode.
- # This *doesn't* handle numpy fixed-length unicode data ('U' dtype),
- # as HDF5 has no equivalent, and converting fixed length UTF-32
- # to variable length UTF-8 would obscure what's going on.
- str_array = numpy.asarray(val, order='C', dtype=object)
- val = numpy.array([
- s.encode('utf-8') for s in str_array.flat
- ], dtype=self.dtype).reshape(str_array.shape)
- else:
- # If the input data is already an array, let HDF5 do the conversion.
- # If it's a list or similar, don't make numpy guess a dtype for it.
- dt = None if isinstance(val, numpy.ndarray) else self.dtype.base
- val = numpy.asarray(val, order='C', dtype=dt)
- # Check for array dtype compatibility and convert
- if self.dtype.subdtype is not None:
- shp = self.dtype.subdtype[1]
- valshp = val.shape[-len(shp):]
- if valshp != shp: # Last dimension has to match
- raise TypeError("When writing to array types, last N dimensions have to match (got %s, but should be %s)" % (valshp, shp,))
- mtype = h5t.py_create(numpy.dtype((val.dtype, shp)))
- mshape = val.shape[0:len(val.shape)-len(shp)]
- # Make a compound memory type if field-name slicing is required
- elif len(names) != 0:
- mshape = val.shape
- # Catch common errors
- if self.dtype.fields is None:
- raise TypeError("Illegal slicing argument (not a compound dataset)")
- mismatch = [x for x in names if x not in self.dtype.fields]
- if len(mismatch) != 0:
- mismatch = ", ".join('"%s"'%x for x in mismatch)
- raise ValueError("Illegal slicing argument (fields %s not in dataset type)" % mismatch)
- # Write non-compound source into a single dataset field
- if len(names) == 1 and val.dtype.fields is None:
- subtype = h5t.py_create(val.dtype)
- mtype = h5t.create(h5t.COMPOUND, subtype.get_size())
- mtype.insert(self._e(names[0]), 0, subtype)
- # Make a new source type keeping only the requested fields
- else:
- fieldnames = [x for x in val.dtype.names if x in names] # Keep source order
- mtype = h5t.create(h5t.COMPOUND, val.dtype.itemsize)
- for fieldname in fieldnames:
- subtype = h5t.py_create(val.dtype.fields[fieldname][0])
- offset = val.dtype.fields[fieldname][1]
- mtype.insert(self._e(fieldname), offset, subtype)
- # Use mtype derived from array (let DatasetID.write figure it out)
- else:
- mshape = val.shape
- mtype = None
- # Perform the dataspace selection
- selection = sel.select(self.shape, args, dataset=self)
- # Broadcast scalars if necessary.
- # In order to avoid slow broadcasting filling the destination by
- # the scalar value, we create an intermediate array of the same
- # size as the destination buffer provided that size is reasonable.
- # We assume as reasonable a size smaller or equal as the used dataset
- # chunk size if any.
- # In case of dealing with a non-chunked destination dataset or with
- # a selection whose size is larger than the dataset chunk size we fall
- # back to using an intermediate array of size equal to the last dimension
- # of the destination buffer.
- # The reasoning behind is that it makes sense to assume the creator of
- # the dataset used an appropriate chunk size according the available
- # memory. In any case, if we cannot afford to create an intermediate
- # array of the same size as the dataset chunk size, the user program has
- # little hope to go much further. Solves h5py issue #1067
- if mshape == () and selection.array_shape != ():
- if self.dtype.subdtype is not None:
- raise TypeError("Scalar broadcasting is not supported for array dtypes")
- if self.chunks and (product(self.chunks) >= product(selection.array_shape)):
- val2 = numpy.empty(selection.array_shape, dtype=val.dtype)
- else:
- val2 = numpy.empty(selection.array_shape[-1], dtype=val.dtype)
- val2[...] = val
- val = val2
- mshape = val.shape
- # Perform the write, with broadcasting
- mspace = h5s.create_simple(selection.expand_shape(mshape))
- for fspace in selection.broadcast(mshape):
- self.id.write(mspace, fspace, val, mtype, dxpl=self._dxpl)
- def read_direct(self, dest, source_sel=None, dest_sel=None):
- """ Read data directly from HDF5 into an existing NumPy array.
- The destination array must be C-contiguous and writable.
- Selections must be the output of numpy.s_[<args>].
- Broadcasting is supported for simple indexing.
- """
- with phil:
- if self._is_empty:
- raise TypeError("Empty datasets have no numpy representation")
- if source_sel is None:
- source_sel = sel.SimpleSelection(self.shape)
- else:
- source_sel = sel.select(self.shape, source_sel, self) # for numpy.s_
- fspace = source_sel.id
- if dest_sel is None:
- dest_sel = sel.SimpleSelection(dest.shape)
- else:
- dest_sel = sel.select(dest.shape, dest_sel)
- for mspace in dest_sel.broadcast(source_sel.array_shape):
- self.id.read(mspace, fspace, dest, dxpl=self._dxpl)
- def write_direct(self, source, source_sel=None, dest_sel=None):
- """ Write data directly to HDF5 from a NumPy array.
- The source array must be C-contiguous. Selections must be
- the output of numpy.s_[<args>].
- Broadcasting is supported for simple indexing.
- """
- with phil:
- if self._is_empty:
- raise TypeError("Empty datasets cannot be written to")
- if source_sel is None:
- source_sel = sel.SimpleSelection(source.shape)
- else:
- source_sel = sel.select(source.shape, source_sel) # for numpy.s_
- mspace = source_sel.id
- if dest_sel is None:
- dest_sel = sel.SimpleSelection(self.shape)
- else:
- dest_sel = sel.select(self.shape, dest_sel, self)
- for fspace in dest_sel.broadcast(source_sel.array_shape):
- self.id.write(mspace, fspace, source, dxpl=self._dxpl)
- @with_phil
- def __array__(self, dtype=None, copy=None):
- """ Create a Numpy array containing the whole dataset. DON'T THINK
- THIS MEANS DATASETS ARE INTERCHANGEABLE WITH ARRAYS. For one thing,
- you have to read the whole dataset every time this method is called.
- """
- if copy is False:
- raise ValueError(
- f"Dataset.__array__ received {copy=} "
- "but memory allocation cannot be avoided on read"
- )
- arr = numpy.zeros(self.shape, dtype=self.dtype if dtype is None else dtype)
- # Special case for (0,)*-shape datasets
- if self.size == 0:
- return arr
- self.read_direct(arr)
- return arr
- @with_phil
- def __repr__(self):
- if not self:
- return "<Closed HDF5 dataset>"
- if self.name is None:
- name = "(anonymous)"
- else:
- name = pp.basename(pp.normpath(self.name))
- name = f'"{name}"'
- return f'<HDF5 dataset {name}: shape {self.shape}, type "{self.dtype.str}">'
- @with_phil
- def refresh(self):
- """ Refresh the dataset metadata by reloading from the file.
- This is part of the SWMR features.
- """
- self._id.refresh()
- self._cache_props.clear()
- @with_phil
- def flush(self):
- """ Flush the dataset data and metadata to the file.
- If the dataset is chunked, raw data chunks are written to the file.
- This is part of the SWMR features.
- """
- self._id.flush()
- if vds_support:
- @property
- @with_phil
- def is_virtual(self):
- """Check if this is a virtual dataset"""
- return self._dcpl.get_layout() == h5d.VIRTUAL
- @with_phil
- def virtual_sources(self):
- """Get a list of the data mappings for a virtual dataset"""
- if not self.is_virtual:
- raise RuntimeError("Not a virtual dataset")
- dcpl = self._dcpl
- return [
- VDSmap(dcpl.get_virtual_vspace(j),
- dcpl.get_virtual_filename(j),
- dcpl.get_virtual_dsetname(j),
- dcpl.get_virtual_srcspace(j))
- for j in range(dcpl.get_virtual_count())]
- @with_phil
- def make_scale(self, name=''):
- """Make this dataset an HDF5 dimension scale.
- You can then attach it to dimensions of other datasets like this::
- other_ds.dims[0].attach_scale(ds)
- You can optionally pass a name to associate with this scale.
- """
- h5ds.set_scale(self._id, self._e(name))
- @property
- @with_phil
- def is_scale(self):
- """Return ``True`` if this dataset is also a dimension scale.
- Return ``False`` otherwise.
- """
- return h5ds.is_scale(self._id)
|