base.py 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535
  1. # This file is part of h5py, a Python interface to the HDF5 library.
  2. #
  3. # http://www.h5py.org
  4. #
  5. # Copyright 2008-2013 Andrew Collette and contributors
  6. #
  7. # License: Standard 3-clause BSD; see "license.txt" for full license terms
  8. # and contributor agreement.
  9. """
  10. Implements operations common to all high-level objects (File, etc.).
  11. """
  12. from collections.abc import (
  13. Mapping, MutableMapping, KeysView, ValuesView, ItemsView
  14. )
  15. import os
  16. import posixpath
  17. import numpy as np
  18. # The high-level interface is serialized; every public API function & method
  19. # is wrapped in a lock. We reuse the low-level lock because (1) it's fast,
  20. # and (2) it eliminates the possibility of deadlocks due to out-of-order
  21. # lock acquisition.
  22. from .._objects import phil, with_phil
  23. from .. import h5d, h5i, h5r, h5p, h5f, h5t, h5s
  24. from .compat import filename_encode
  25. def is_hdf5(fname):
  26. """ Determine if a file is valid HDF5 (False if it doesn't exist). """
  27. with phil:
  28. fname = os.path.abspath(os.fspath(fname))
  29. if os.path.isfile(fname):
  30. return h5f.is_hdf5(filename_encode(fname))
  31. return False
  32. def find_item_type(data):
  33. """Find the item type of a simple object or collection of objects.
  34. E.g. [[['a']]] -> str
  35. The focus is on collections where all items have the same type; we'll return
  36. None if that's not the case.
  37. The aim is to treat numpy arrays of Python objects like normal Python
  38. collections, while treating arrays with specific dtypes differently.
  39. We're also only interested in array-like collections - lists and tuples,
  40. possibly nested - not things like sets or dicts.
  41. """
  42. if isinstance(data, np.ndarray):
  43. if (
  44. data.dtype.kind == 'O'
  45. and not h5t.check_string_dtype(data.dtype)
  46. and not h5t.check_vlen_dtype(data.dtype)
  47. ):
  48. item_types = {type(e) for e in data.flat}
  49. else:
  50. return None
  51. elif isinstance(data, (list, tuple)):
  52. item_types = {find_item_type(e) for e in data}
  53. else:
  54. return type(data)
  55. if len(item_types) != 1:
  56. return None
  57. return item_types.pop()
  58. def guess_dtype(data):
  59. """ Attempt to guess an appropriate dtype for the object, returning None
  60. if nothing is appropriate (or if it should be left up the the array
  61. constructor to figure out)
  62. """
  63. with phil:
  64. item_type = find_item_type(data)
  65. if item_type is h5r.RegionReference:
  66. return h5t.regionref_dtype
  67. if item_type is h5r.Reference:
  68. return h5t.ref_dtype
  69. if item_type is bytes:
  70. return h5t.string_dtype(encoding='ascii')
  71. if item_type is str:
  72. return h5t.string_dtype()
  73. return None
  74. def is_float16_dtype(dt):
  75. if dt is None:
  76. return False
  77. dt = np.dtype(dt) # normalize strings -> np.dtype objects
  78. return dt.kind == 'f' and dt.itemsize == 2
  79. def array_for_new_object(data, specified_dtype=None):
  80. """Prepare an array from data used to create a new dataset or attribute"""
  81. if not isinstance(specified_dtype, (np.dtype, type(None))):
  82. specified_dtype = np.dtype(specified_dtype)
  83. # We mostly let HDF5 convert data as necessary when it's written.
  84. # But if we are going to a float16 datatype, pre-convert in python
  85. # to workaround a bug in the conversion.
  86. # https://github.com/h5py/h5py/issues/819
  87. if is_float16_dtype(specified_dtype):
  88. as_dtype = specified_dtype
  89. elif not isinstance(data, np.ndarray) and (specified_dtype is not None):
  90. # If we need to convert e.g. a list to an array, don't leave numpy
  91. # to guess a dtype we already know.
  92. as_dtype = specified_dtype
  93. else:
  94. as_dtype = guess_dtype(data)
  95. data = np.asarray(data, order="C", dtype=as_dtype)
  96. # In most cases, this does nothing. But if data was already an array,
  97. # and as_dtype is a tagged h5py dtype (e.g. for an object array of strings),
  98. # asarray() doesn't replace its dtype object. This gives it the tagged dtype:
  99. if as_dtype is not None:
  100. data = data.view(dtype=as_dtype)
  101. return data
  102. def default_lapl():
  103. """ Default link access property list """
  104. return None
  105. def default_lcpl():
  106. """ Default link creation property list """
  107. lcpl = h5p.create(h5p.LINK_CREATE)
  108. lcpl.set_create_intermediate_group(True)
  109. return lcpl
  110. dlapl = default_lapl()
  111. dlcpl = default_lcpl()
  112. def is_empty_dataspace(obj):
  113. """ Check if an object's dataspace is empty """
  114. if obj.get_space().get_simple_extent_type() == h5s.NULL:
  115. return True
  116. return False
  117. class CommonStateObject:
  118. """
  119. Mixin class that allows sharing information between objects which
  120. reside in the same HDF5 file. Requires that the host class have
  121. a ".id" attribute which returns a low-level ObjectID subclass.
  122. Also implements Unicode operations.
  123. """
  124. @property
  125. def _lapl(self):
  126. """ Fetch the link access property list appropriate for this object
  127. """
  128. return dlapl
  129. @property
  130. def _lcpl(self):
  131. """ Fetch the link creation property list appropriate for this object
  132. """
  133. return dlcpl
  134. def _e(self, name, lcpl=None):
  135. """ Encode a name according to the current file settings.
  136. Returns name, or 2-tuple (name, lcpl) if lcpl is True
  137. - Binary strings are always passed as-is, h5t.CSET_ASCII
  138. - Unicode strings are encoded utf8, h5t.CSET_UTF8
  139. If name is None, returns either None or (None, None) appropriately.
  140. """
  141. def get_lcpl(coding):
  142. """ Create an appropriate link creation property list """
  143. lcpl = self._lcpl.copy()
  144. lcpl.set_char_encoding(coding)
  145. return lcpl
  146. if name is None:
  147. return (None, None) if lcpl else None
  148. if isinstance(name, bytes):
  149. coding = h5t.CSET_ASCII
  150. elif isinstance(name, str):
  151. try:
  152. name = name.encode('ascii')
  153. coding = h5t.CSET_ASCII
  154. except UnicodeEncodeError:
  155. name = name.encode('utf8')
  156. coding = h5t.CSET_UTF8
  157. else:
  158. raise TypeError(f"A name should be string or bytes, not {type(name)}")
  159. if lcpl:
  160. return name, get_lcpl(coding)
  161. return name
  162. def _d(self, name):
  163. """ Decode a name according to the current file settings.
  164. - Try to decode utf8
  165. - Failing that, return the byte string
  166. If name is None, returns None.
  167. """
  168. if name is None:
  169. return None
  170. try:
  171. return name.decode('utf8')
  172. except UnicodeDecodeError:
  173. pass
  174. return name
  175. class _RegionProxy:
  176. """
  177. Proxy object which handles region references.
  178. To create a new region reference (datasets only), use slicing syntax:
  179. >>> newref = obj.regionref[0:10:2]
  180. To determine the target dataset shape from an existing reference:
  181. >>> shape = obj.regionref.shape(existingref)
  182. where <obj> may be any object in the file. To determine the shape of
  183. the selection in use on the target dataset:
  184. >>> selection_shape = obj.regionref.selection(existingref)
  185. """
  186. def __init__(self, obj):
  187. self.obj = obj
  188. self.id = obj.id
  189. def __getitem__(self, args):
  190. if not isinstance(self.id, h5d.DatasetID):
  191. raise TypeError("Region references can only be made to datasets")
  192. from . import selections
  193. with phil:
  194. selection = selections.select(self.id.shape, args, dataset=self.obj)
  195. return h5r.create(self.id, b'.', h5r.DATASET_REGION, selection.id)
  196. def shape(self, ref):
  197. """ Get the shape of the target dataspace referred to by *ref*. """
  198. with phil:
  199. sid = h5r.get_region(ref, self.id)
  200. return sid.shape
  201. def selection(self, ref):
  202. """ Get the shape of the target dataspace selection referred to by *ref*
  203. """
  204. from . import selections
  205. with phil:
  206. sid = h5r.get_region(ref, self.id)
  207. return selections.guess_shape(sid)
  208. class HLObject(CommonStateObject):
  209. """
  210. Base class for high-level interface objects.
  211. """
  212. @property
  213. def file(self):
  214. """ Return a File instance associated with this object """
  215. from . import files
  216. with phil:
  217. return files.File(self.id)
  218. @property
  219. @with_phil
  220. def name(self):
  221. """ Return the full name of this object. None if anonymous. """
  222. return self._d(h5i.get_name(self.id))
  223. @property
  224. @with_phil
  225. def parent(self):
  226. """Return the parent group of this object.
  227. This is always equivalent to obj.file[posixpath.dirname(obj.name)].
  228. ValueError if this object is anonymous.
  229. """
  230. if self.name is None:
  231. raise ValueError("Parent of an anonymous object is undefined")
  232. return self.file[posixpath.dirname(self.name)]
  233. @property
  234. @with_phil
  235. def id(self):
  236. """ Low-level identifier appropriate for this object """
  237. return self._id
  238. @property
  239. @with_phil
  240. def ref(self):
  241. """ An (opaque) HDF5 reference to this object """
  242. return h5r.create(self.id, b'.', h5r.OBJECT)
  243. @property
  244. @with_phil
  245. def regionref(self):
  246. """Create a region reference (Datasets only).
  247. The syntax is regionref[<slices>]. For example, dset.regionref[...]
  248. creates a region reference in which the whole dataset is selected.
  249. Can also be used to determine the shape of the referenced dataset
  250. (via .shape property), or the shape of the selection (via the
  251. .selection property).
  252. """
  253. return _RegionProxy(self)
  254. @property
  255. def attrs(self):
  256. """ Attributes attached to this object """
  257. from . import attrs
  258. with phil:
  259. return attrs.AttributeManager(self)
  260. @with_phil
  261. def __init__(self, oid):
  262. """ Setup this object, given its low-level identifier """
  263. self._id = oid
  264. @with_phil
  265. def __hash__(self):
  266. return hash(self.id)
  267. @with_phil
  268. def __eq__(self, other):
  269. if hasattr(other, 'id'):
  270. return self.id == other.id
  271. return NotImplemented
  272. def __bool__(self):
  273. with phil:
  274. return bool(self.id)
  275. def __getnewargs__(self):
  276. """Disable pickle.
  277. Handles for HDF5 objects can't be reliably deserialised, because the
  278. recipient may not have access to the same files. So we do this to
  279. fail early.
  280. If you really want to pickle h5py objects and can live with some
  281. limitations, look at the h5pickle project on PyPI.
  282. """
  283. raise TypeError("h5py objects cannot be pickled")
  284. def __getstate__(self):
  285. # Pickle protocols 0 and 1 use this instead of __getnewargs__
  286. raise TypeError("h5py objects cannot be pickled")
  287. # --- Dictionary-style interface ----------------------------------------------
  288. # To implement the dictionary-style interface from groups and attributes,
  289. # we inherit from the appropriate abstract base classes in collections.
  290. #
  291. # All locking is taken care of by the subclasses.
  292. # We have to override ValuesView and ItemsView here because Group and
  293. # AttributeManager can only test for key names.
  294. class KeysViewHDF5(KeysView):
  295. def __str__(self):
  296. return "<KeysViewHDF5 {}>".format(list(self))
  297. def __reversed__(self):
  298. yield from reversed(self._mapping)
  299. __repr__ = __str__
  300. class ValuesViewHDF5(ValuesView):
  301. """
  302. Wraps e.g. a Group or AttributeManager to provide a value view.
  303. Note that __contains__ will have poor performance as it has
  304. to scan all the links or attributes.
  305. """
  306. def __contains__(self, value):
  307. with phil:
  308. for key in self._mapping:
  309. if value == self._mapping.get(key):
  310. return True
  311. return False
  312. def __iter__(self):
  313. with phil:
  314. for key in self._mapping:
  315. yield self._mapping.get(key)
  316. def __reversed__(self):
  317. with phil:
  318. for key in reversed(self._mapping):
  319. yield self._mapping.get(key)
  320. class ItemsViewHDF5(ItemsView):
  321. """
  322. Wraps e.g. a Group or AttributeManager to provide an items view.
  323. """
  324. def __contains__(self, item):
  325. with phil:
  326. key, val = item
  327. if key in self._mapping:
  328. return val == self._mapping.get(key)
  329. return False
  330. def __iter__(self):
  331. with phil:
  332. for key in self._mapping:
  333. yield (key, self._mapping.get(key))
  334. def __reversed__(self):
  335. with phil:
  336. for key in reversed(self._mapping):
  337. yield (key, self._mapping.get(key))
  338. class MappingHDF5(Mapping):
  339. """
  340. Wraps a Group, AttributeManager or DimensionManager object to provide
  341. an immutable mapping interface.
  342. We don't inherit directly from MutableMapping because certain
  343. subclasses, for example DimensionManager, are read-only.
  344. """
  345. def keys(self):
  346. """ Get a view object on member names """
  347. return KeysViewHDF5(self)
  348. def values(self):
  349. """ Get a view object on member objects """
  350. return ValuesViewHDF5(self)
  351. def items(self):
  352. """ Get a view object on member items """
  353. return ItemsViewHDF5(self)
  354. def _ipython_key_completions_(self):
  355. """ Custom tab completions for __getitem__ in IPython >=5.0. """
  356. return sorted(self.keys())
  357. class MutableMappingHDF5(MappingHDF5, MutableMapping):
  358. """
  359. Wraps a Group or AttributeManager object to provide a mutable
  360. mapping interface, in contrast to the read-only mapping of
  361. MappingHDF5.
  362. """
  363. pass
  364. class Empty:
  365. """
  366. Proxy object to represent empty/null dataspaces (a.k.a H5S_NULL).
  367. This can have an associated dtype, but has no shape or data. This is not
  368. the same as an array with shape (0,).
  369. """
  370. shape = None
  371. size = None
  372. def __init__(self, dtype):
  373. self.dtype = np.dtype(dtype)
  374. def __eq__(self, other):
  375. if isinstance(other, Empty) and self.dtype == other.dtype:
  376. return True
  377. return False
  378. def __repr__(self):
  379. return "Empty(dtype={0!r})".format(self.dtype)
  380. def product(nums):
  381. """Calculate a numeric product
  382. For small amounts of data (e.g. shape tuples), this simple code is much
  383. faster than calling numpy.prod().
  384. """
  385. prod = 1
  386. for n in nums:
  387. prod *= n
  388. return prod
  389. # Simple variant of cached_property:
  390. # Unlike functools, this has no locking, so we don't have to worry about
  391. # deadlocks with phil (see issue gh-2064). Unlike cached-property on PyPI, it
  392. # doesn't try to import asyncio (which can be ~100 extra modules).
  393. # Many projects seem to have similar variants of this, often without attribution,
  394. # but to be cautious, this code comes from cached-property (Copyright (c) 2015,
  395. # Daniel Greenfeld, BSD license), where it is attributed to bottle (Copyright
  396. # (c) 2009-2022, Marcel Hellkamp, MIT license).
  397. class cached_property:
  398. def __init__(self, func):
  399. self.__doc__ = getattr(func, "__doc__")
  400. self.func = func
  401. def __get__(self, obj, cls):
  402. if obj is None:
  403. return self
  404. value = obj.__dict__[self.func.__name__] = self.func(obj)
  405. return value