memmap.py 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361
  1. from contextlib import nullcontext
  2. import operator
  3. import numpy as np
  4. from .._utils import set_module
  5. from .numeric import uint8, ndarray, dtype
  6. __all__ = ['memmap']
  # Alias for the ``dtype`` constructor: ``memmap.__new__`` takes a parameter
  # that is itself called ``dtype``, so it needs another name to call it by.
  dtypedescr = dtype

  # Short mode strings for which changes are written back to the file.
  writeable_filemodes = ["r+", "w+"]

  # Every short mode string accepted by ``memmap``.
  valid_filemodes = ["r", "c", *writeable_filemodes]

  # Long-form aliases mapped to the short mode string they stand for.
  mode_equivalents = dict(
      readonly="r",
      copyonwrite="c",
      readwrite="r+",
      write="w+",
  )
  16. @set_module('numpy')
  17. class memmap(ndarray):
  18. """Create a memory-map to an array stored in a *binary* file on disk.
  19. Memory-mapped files are used for accessing small segments of large files
  20. on disk, without reading the entire file into memory. NumPy's
  21. memmap's are array-like objects. This differs from Python's ``mmap``
  22. module, which uses file-like objects.
  23. This subclass of ndarray has some unpleasant interactions with
  24. some operations, because it doesn't quite fit properly as a subclass.
  25. An alternative to using this subclass is to create the ``mmap``
  26. object yourself, then create an ndarray with ndarray.__new__ directly,
  27. passing the object created in its 'buffer=' parameter.
  28. This class may at some point be turned into a factory function
  29. which returns a view into an mmap buffer.
  30. Flush the memmap instance to write the changes to the file. Currently there
  31. is no API to close the underlying ``mmap``. It is tricky to ensure the
  32. resource is actually closed, since it may be shared between different
  33. memmap instances.
  34. Parameters
  35. ----------
  36. filename : str, file-like object, or pathlib.Path instance
  37. The file name or file object to be used as the array data buffer.
  38. dtype : data-type, optional
  39. The data-type used to interpret the file contents.
  40. Default is `uint8`.
  41. mode : {'r+', 'r', 'w+', 'c'}, optional
  42. The file is opened in this mode:
  43. +------+-------------------------------------------------------------+
  44. | 'r' | Open existing file for reading only. |
  45. +------+-------------------------------------------------------------+
  46. | 'r+' | Open existing file for reading and writing. |
  47. +------+-------------------------------------------------------------+
  48. | 'w+' | Create or overwrite existing file for reading and writing. |
  49. | | If ``mode == 'w+'`` then `shape` must also be specified. |
  50. +------+-------------------------------------------------------------+
  51. | 'c' | Copy-on-write: assignments affect data in memory, but |
  52. | | changes are not saved to disk. The file on disk is |
  53. | | read-only. |
  54. +------+-------------------------------------------------------------+
  55. Default is 'r+'.
  56. offset : int, optional
  57. In the file, array data starts at this offset. Since `offset` is
  58. measured in bytes, it should normally be a multiple of the byte-size
  59. of `dtype`. When ``mode != 'r'``, even positive offsets beyond end of
  60. file are valid; The file will be extended to accommodate the
  61. additional data. By default, ``memmap`` will start at the beginning of
  62. the file, even if ``filename`` is a file pointer ``fp`` and
  63. ``fp.tell() != 0``.
  64. shape : int or sequence of ints, optional
  65. The desired shape of the array. If ``mode == 'r'`` and the number
  66. of remaining bytes after `offset` is not a multiple of the byte-size
  67. of `dtype`, you must specify `shape`. By default, the returned array
  68. will be 1-D with the number of elements determined by file size
  69. and data-type.
  70. .. versionchanged:: 2.0
  71. The shape parameter can now be any integer sequence type, previously
  72. types were limited to tuple and int.
  73. order : {'C', 'F'}, optional
  74. Specify the order of the ndarray memory layout:
  75. :term:`row-major`, C-style or :term:`column-major`,
  76. Fortran-style. This only has an effect if the shape is
  77. greater than 1-D. The default order is 'C'.
  78. Attributes
  79. ----------
  80. filename : str or pathlib.Path instance
  81. Path to the mapped file.
  82. offset : int
  83. Offset position in the file.
  84. mode : str
  85. File mode.
  86. Methods
  87. -------
  88. flush
  89. Flush any changes in memory to file on disk.
  90. When you delete a memmap object, flush is called first to write
  91. changes to disk.
  92. See also
  93. --------
  94. lib.format.open_memmap : Create or load a memory-mapped ``.npy`` file.
  95. Notes
  96. -----
  97. The memmap object can be used anywhere an ndarray is accepted.
  98. Given a memmap ``fp``, ``isinstance(fp, numpy.ndarray)`` returns
  99. ``True``.
  100. Memory-mapped files cannot be larger than 2GB on 32-bit systems.
  101. When a memmap causes a file to be created or extended beyond its
  102. current size in the filesystem, the contents of the new part are
  103. unspecified. On systems with POSIX filesystem semantics, the extended
  104. part will be filled with zero bytes.
  105. Examples
  106. --------
  107. >>> import numpy as np
  108. >>> data = np.arange(12, dtype='float32')
  109. >>> data.resize((3,4))
  110. This example uses a temporary file so that doctest doesn't write
  111. files to your directory. You would use a 'normal' filename.
  112. >>> from tempfile import mkdtemp
  113. >>> import os.path as path
  114. >>> filename = path.join(mkdtemp(), 'newfile.dat')
  115. Create a memmap with dtype and shape that matches our data:
  116. >>> fp = np.memmap(filename, dtype='float32', mode='w+', shape=(3,4))
  117. >>> fp
  118. memmap([[0., 0., 0., 0.],
  119. [0., 0., 0., 0.],
  120. [0., 0., 0., 0.]], dtype=float32)
  121. Write data to memmap array:
  122. >>> fp[:] = data[:]
  123. >>> fp
  124. memmap([[ 0., 1., 2., 3.],
  125. [ 4., 5., 6., 7.],
  126. [ 8., 9., 10., 11.]], dtype=float32)
  127. >>> fp.filename == path.abspath(filename)
  128. True
  129. Flushes memory changes to disk in order to read them back
  130. >>> fp.flush()
  131. Load the memmap and verify data was stored:
  132. >>> newfp = np.memmap(filename, dtype='float32', mode='r', shape=(3,4))
  133. >>> newfp
  134. memmap([[ 0., 1., 2., 3.],
  135. [ 4., 5., 6., 7.],
  136. [ 8., 9., 10., 11.]], dtype=float32)
  137. Read-only memmap:
  138. >>> fpr = np.memmap(filename, dtype='float32', mode='r', shape=(3,4))
  139. >>> fpr.flags.writeable
  140. False
  141. Copy-on-write memmap:
  142. >>> fpc = np.memmap(filename, dtype='float32', mode='c', shape=(3,4))
  143. >>> fpc.flags.writeable
  144. True
  145. It's possible to assign to copy-on-write array, but values are only
  146. written into the memory copy of the array, and not written to disk:
  147. >>> fpc
  148. memmap([[ 0., 1., 2., 3.],
  149. [ 4., 5., 6., 7.],
  150. [ 8., 9., 10., 11.]], dtype=float32)
  151. >>> fpc[0,:] = 0
  152. >>> fpc
  153. memmap([[ 0., 0., 0., 0.],
  154. [ 4., 5., 6., 7.],
  155. [ 8., 9., 10., 11.]], dtype=float32)
  156. File on disk is unchanged:
  157. >>> fpr
  158. memmap([[ 0., 1., 2., 3.],
  159. [ 4., 5., 6., 7.],
  160. [ 8., 9., 10., 11.]], dtype=float32)
  161. Offset into a memmap:
  162. >>> fpo = np.memmap(filename, dtype='float32', mode='r', offset=16)
  163. >>> fpo
  164. memmap([ 4., 5., 6., 7., 8., 9., 10., 11.], dtype=float32)
  165. """
  166. __array_priority__ = -100.0
  167. def __new__(subtype, filename, dtype=uint8, mode='r+', offset=0,
  168. shape=None, order='C'):
  169. # Import here to minimize 'import numpy' overhead
  170. import mmap
  171. import os.path
  172. try:
  173. mode = mode_equivalents[mode]
  174. except KeyError as e:
  175. if mode not in valid_filemodes:
  176. raise ValueError(
  177. "mode must be one of {!r} (got {!r})"
  178. .format(valid_filemodes + list(mode_equivalents.keys()), mode)
  179. ) from None
  180. if mode == 'w+' and shape is None:
  181. raise ValueError("shape must be given if mode == 'w+'")
  182. if hasattr(filename, 'read'):
  183. f_ctx = nullcontext(filename)
  184. else:
  185. f_ctx = open(
  186. os.fspath(filename),
  187. ('r' if mode == 'c' else mode)+'b'
  188. )
  189. with f_ctx as fid:
  190. fid.seek(0, 2)
  191. flen = fid.tell()
  192. descr = dtypedescr(dtype)
  193. _dbytes = descr.itemsize
  194. if shape is None:
  195. bytes = flen - offset
  196. if bytes % _dbytes:
  197. raise ValueError("Size of available data is not a "
  198. "multiple of the data-type size.")
  199. size = bytes // _dbytes
  200. shape = (size,)
  201. else:
  202. if type(shape) not in (tuple, list):
  203. try:
  204. shape = [operator.index(shape)]
  205. except TypeError:
  206. pass
  207. shape = tuple(shape)
  208. size = np.intp(1) # avoid default choice of np.int_, which might overflow
  209. for k in shape:
  210. size *= k
  211. bytes = int(offset + size*_dbytes)
  212. if mode in ('w+', 'r+'):
  213. # gh-27723
  214. # if bytes == 0, we write out 1 byte to allow empty memmap.
  215. bytes = max(bytes, 1)
  216. if flen < bytes:
  217. fid.seek(bytes - 1, 0)
  218. fid.write(b'\0')
  219. fid.flush()
  220. if mode == 'c':
  221. acc = mmap.ACCESS_COPY
  222. elif mode == 'r':
  223. acc = mmap.ACCESS_READ
  224. else:
  225. acc = mmap.ACCESS_WRITE
  226. start = offset - offset % mmap.ALLOCATIONGRANULARITY
  227. bytes -= start
  228. # bytes == 0 is problematic as in mmap length=0 maps the full file.
  229. # See PR gh-27723 for a more detailed explanation.
  230. if bytes == 0 and start > 0:
  231. bytes += mmap.ALLOCATIONGRANULARITY
  232. start -= mmap.ALLOCATIONGRANULARITY
  233. array_offset = offset - start
  234. mm = mmap.mmap(fid.fileno(), bytes, access=acc, offset=start)
  235. self = ndarray.__new__(subtype, shape, dtype=descr, buffer=mm,
  236. offset=array_offset, order=order)
  237. self._mmap = mm
  238. self.offset = offset
  239. self.mode = mode
  240. if isinstance(filename, os.PathLike):
  241. # special case - if we were constructed with a pathlib.path,
  242. # then filename is a path object, not a string
  243. self.filename = filename.resolve()
  244. elif hasattr(fid, "name") and isinstance(fid.name, str):
  245. # py3 returns int for TemporaryFile().name
  246. self.filename = os.path.abspath(fid.name)
  247. # same as memmap copies (e.g. memmap + 1)
  248. else:
  249. self.filename = None
  250. return self
  251. def __array_finalize__(self, obj):
  252. if hasattr(obj, '_mmap') and np.may_share_memory(self, obj):
  253. self._mmap = obj._mmap
  254. self.filename = obj.filename
  255. self.offset = obj.offset
  256. self.mode = obj.mode
  257. else:
  258. self._mmap = None
  259. self.filename = None
  260. self.offset = None
  261. self.mode = None
  262. def flush(self):
  263. """
  264. Write any changes in the array to the file on disk.
  265. For further information, see `memmap`.
  266. Parameters
  267. ----------
  268. None
  269. See Also
  270. --------
  271. memmap
  272. """
  273. if self.base is not None and hasattr(self.base, 'flush'):
  274. self.base.flush()
  275. def __array_wrap__(self, arr, context=None, return_scalar=False):
  276. arr = super().__array_wrap__(arr, context)
  277. # Return a memmap if a memmap was given as the output of the
  278. # ufunc. Leave the arr class unchanged if self is not a memmap
  279. # to keep original memmap subclasses behavior
  280. if self is arr or type(self) is not memmap:
  281. return arr
  282. # Return scalar instead of 0d memmap, e.g. for np.sum with
  283. # axis=None (note that subclasses will not reach here)
  284. if return_scalar:
  285. return arr[()]
  286. # Return ndarray otherwise
  287. return arr.view(np.ndarray)
  288. def __getitem__(self, index):
  289. res = super().__getitem__(index)
  290. if type(res) is memmap and res._mmap is None:
  291. return res.view(type=ndarray)
  292. return res