_arrayterator_impl.py 7.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224
  1. """
  2. A buffered iterator for big arrays.
  3. This module solves the problem of iterating over a big file-based array
  4. without having to read it into memory. The `Arrayterator` class wraps
  5. an array object, and when iterated it will return sub-arrays with at most
  6. a user-specified number of elements.
  7. """
  8. from operator import mul
  9. from functools import reduce
  10. __all__ = ['Arrayterator']
  11. class Arrayterator:
  12. """
  13. Buffered iterator for big arrays.
  14. `Arrayterator` creates a buffered iterator for reading big arrays in small
  15. contiguous blocks. The class is useful for objects stored in the
  16. file system. It allows iteration over the object *without* reading
  17. everything in memory; instead, small blocks are read and iterated over.
  18. `Arrayterator` can be used with any object that supports multidimensional
  19. slices. This includes NumPy arrays, but also variables from
  20. Scientific.IO.NetCDF or pynetcdf for example.
  21. Parameters
  22. ----------
  23. var : array_like
  24. The object to iterate over.
  25. buf_size : int, optional
  26. The buffer size. If `buf_size` is supplied, the maximum amount of
  27. data that will be read into memory is `buf_size` elements.
  28. Default is None, which will read as many element as possible
  29. into memory.
  30. Attributes
  31. ----------
  32. var
  33. buf_size
  34. start
  35. stop
  36. step
  37. shape
  38. flat
  39. See Also
  40. --------
  41. numpy.ndenumerate : Multidimensional array iterator.
  42. numpy.flatiter : Flat array iterator.
  43. numpy.memmap : Create a memory-map to an array stored
  44. in a binary file on disk.
  45. Notes
  46. -----
  47. The algorithm works by first finding a "running dimension", along which
  48. the blocks will be extracted. Given an array of dimensions
  49. ``(d1, d2, ..., dn)``, e.g. if `buf_size` is smaller than ``d1``, the
  50. first dimension will be used. If, on the other hand,
  51. ``d1 < buf_size < d1*d2`` the second dimension will be used, and so on.
  52. Blocks are extracted along this dimension, and when the last block is
  53. returned the process continues from the next dimension, until all
  54. elements have been read.
  55. Examples
  56. --------
  57. >>> import numpy as np
  58. >>> a = np.arange(3 * 4 * 5 * 6).reshape(3, 4, 5, 6)
  59. >>> a_itor = np.lib.Arrayterator(a, 2)
  60. >>> a_itor.shape
  61. (3, 4, 5, 6)
  62. Now we can iterate over ``a_itor``, and it will return arrays of size
  63. two. Since `buf_size` was smaller than any dimension, the first
  64. dimension will be iterated over first:
  65. >>> for subarr in a_itor:
  66. ... if not subarr.all():
  67. ... print(subarr, subarr.shape) # doctest: +SKIP
  68. >>> # [[[[0 1]]]] (1, 1, 1, 2)
  69. """
  70. __module__ = "numpy.lib"
  71. def __init__(self, var, buf_size=None):
  72. self.var = var
  73. self.buf_size = buf_size
  74. self.start = [0 for dim in var.shape]
  75. self.stop = list(var.shape)
  76. self.step = [1 for dim in var.shape]
  77. def __getattr__(self, attr):
  78. return getattr(self.var, attr)
  79. def __getitem__(self, index):
  80. """
  81. Return a new arrayterator.
  82. """
  83. # Fix index, handling ellipsis and incomplete slices.
  84. if not isinstance(index, tuple):
  85. index = (index,)
  86. fixed = []
  87. length, dims = len(index), self.ndim
  88. for slice_ in index:
  89. if slice_ is Ellipsis:
  90. fixed.extend([slice(None)] * (dims-length+1))
  91. length = len(fixed)
  92. elif isinstance(slice_, int):
  93. fixed.append(slice(slice_, slice_+1, 1))
  94. else:
  95. fixed.append(slice_)
  96. index = tuple(fixed)
  97. if len(index) < dims:
  98. index += (slice(None),) * (dims-len(index))
  99. # Return a new arrayterator object.
  100. out = self.__class__(self.var, self.buf_size)
  101. for i, (start, stop, step, slice_) in enumerate(
  102. zip(self.start, self.stop, self.step, index)):
  103. out.start[i] = start + (slice_.start or 0)
  104. out.step[i] = step * (slice_.step or 1)
  105. out.stop[i] = start + (slice_.stop or stop-start)
  106. out.stop[i] = min(stop, out.stop[i])
  107. return out
  108. def __array__(self, dtype=None, copy=None):
  109. """
  110. Return corresponding data.
  111. """
  112. slice_ = tuple(slice(*t) for t in zip(
  113. self.start, self.stop, self.step))
  114. return self.var[slice_]
  115. @property
  116. def flat(self):
  117. """
  118. A 1-D flat iterator for Arrayterator objects.
  119. This iterator returns elements of the array to be iterated over in
  120. `~lib.Arrayterator` one by one.
  121. It is similar to `flatiter`.
  122. See Also
  123. --------
  124. lib.Arrayterator
  125. flatiter
  126. Examples
  127. --------
  128. >>> a = np.arange(3 * 4 * 5 * 6).reshape(3, 4, 5, 6)
  129. >>> a_itor = np.lib.Arrayterator(a, 2)
  130. >>> for subarr in a_itor.flat:
  131. ... if not subarr:
  132. ... print(subarr, type(subarr))
  133. ...
  134. 0 <class 'numpy.int64'>
  135. """
  136. for block in self:
  137. yield from block.flat
  138. @property
  139. def shape(self):
  140. """
  141. The shape of the array to be iterated over.
  142. For an example, see `Arrayterator`.
  143. """
  144. return tuple(((stop-start-1)//step+1) for start, stop, step in
  145. zip(self.start, self.stop, self.step))
  146. def __iter__(self):
  147. # Skip arrays with degenerate dimensions
  148. if [dim for dim in self.shape if dim <= 0]:
  149. return
  150. start = self.start[:]
  151. stop = self.stop[:]
  152. step = self.step[:]
  153. ndims = self.var.ndim
  154. while True:
  155. count = self.buf_size or reduce(mul, self.shape)
  156. # iterate over each dimension, looking for the
  157. # running dimension (ie, the dimension along which
  158. # the blocks will be built from)
  159. rundim = 0
  160. for i in range(ndims-1, -1, -1):
  161. # if count is zero we ran out of elements to read
  162. # along higher dimensions, so we read only a single position
  163. if count == 0:
  164. stop[i] = start[i]+1
  165. elif count <= self.shape[i]:
  166. # limit along this dimension
  167. stop[i] = start[i] + count*step[i]
  168. rundim = i
  169. else:
  170. # read everything along this dimension
  171. stop[i] = self.stop[i]
  172. stop[i] = min(self.stop[i], stop[i])
  173. count = count//self.shape[i]
  174. # yield a block
  175. slice_ = tuple(slice(*t) for t in zip(start, stop, step))
  176. yield self.var[slice_]
  177. # Update start position, taking care of overflow to
  178. # other dimensions
  179. start[rundim] = stop[rundim] # start where we stopped
  180. for i in range(ndims-1, 0, -1):
  181. if start[i] >= self.stop[i]:
  182. start[i] = self.start[i]
  183. start[i-1] += self.step[i-1]
  184. if start[0] >= self.stop[0]:
  185. return