_mio4.py 21 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632
  1. ''' Classes for read / write of matlab (TM) 4 files
  2. '''
  3. import sys
  4. import warnings
  5. import math
  6. from operator import mul
  7. import numpy as np
  8. import scipy.sparse
  9. from ._miobase import (MatFileReader, docfiller, matdims, read_dtype,
  10. convert_dtypes, arr_to_chars, arr_dtype_number)
  11. from ._mio_utils import squeeze_element, chars_to_strings
  12. from functools import reduce
  13. __all__ = [
  14. 'MatFile4Reader', 'MatFile4Writer', 'SYS_LITTLE_ENDIAN',
  15. 'VarHeader4', 'VarReader4', 'VarWriter4', 'arr_to_2d', 'mclass_info',
  16. 'mdtypes_template', 'miDOUBLE', 'miINT16', 'miINT32', 'miSINGLE',
  17. 'miUINT16', 'miUINT8', 'mxCHAR_CLASS', 'mxFULL_CLASS', 'mxSPARSE_CLASS',
  18. 'np_to_mtypes', 'order_codes'
  19. ]
  20. SYS_LITTLE_ENDIAN = sys.byteorder == 'little'
  21. miDOUBLE = 0
  22. miSINGLE = 1
  23. miINT32 = 2
  24. miINT16 = 3
  25. miUINT16 = 4
  26. miUINT8 = 5
  27. mdtypes_template = {
  28. miDOUBLE: 'f8',
  29. miSINGLE: 'f4',
  30. miINT32: 'i4',
  31. miINT16: 'i2',
  32. miUINT16: 'u2',
  33. miUINT8: 'u1',
  34. 'header': [('mopt', 'i4'),
  35. ('mrows', 'i4'),
  36. ('ncols', 'i4'),
  37. ('imagf', 'i4'),
  38. ('namlen', 'i4')],
  39. 'U1': 'U1',
  40. }
  41. np_to_mtypes = {
  42. 'f8': miDOUBLE,
  43. 'c32': miDOUBLE,
  44. 'c24': miDOUBLE,
  45. 'c16': miDOUBLE,
  46. 'f4': miSINGLE,
  47. 'c8': miSINGLE,
  48. 'i4': miINT32,
  49. 'i2': miINT16,
  50. 'u2': miUINT16,
  51. 'u1': miUINT8,
  52. 'S1': miUINT8,
  53. }
  54. # matrix classes
  55. mxFULL_CLASS = 0
  56. mxCHAR_CLASS = 1
  57. mxSPARSE_CLASS = 2
  58. order_codes = {
  59. 0: '<',
  60. 1: '>',
  61. 2: 'VAX D-float', # !
  62. 3: 'VAX G-float',
  63. 4: 'Cray', # !!
  64. }
  65. mclass_info = {
  66. mxFULL_CLASS: 'double',
  67. mxCHAR_CLASS: 'char',
  68. mxSPARSE_CLASS: 'sparse',
  69. }
  70. _MAX_INTP = np.iinfo(np.intp).max
  71. class VarHeader4:
  72. # Mat4 variables never logical or global
  73. is_logical = False
  74. is_global = False
  75. def __init__(self,
  76. name,
  77. dtype,
  78. mclass,
  79. dims,
  80. is_complex):
  81. self.name = name
  82. self.dtype = dtype
  83. self.mclass = mclass
  84. self.dims = dims
  85. self.is_complex = is_complex
  86. class VarReader4:
  87. ''' Class to read matlab 4 variables '''
  88. def __init__(self, file_reader):
  89. self.file_reader = file_reader
  90. self.mat_stream = file_reader.mat_stream
  91. self.dtypes = file_reader.dtypes
  92. self.chars_as_strings = file_reader.chars_as_strings
  93. self.squeeze_me = file_reader.squeeze_me
  94. def read_header(self):
  95. ''' Read and return header for variable '''
  96. data = read_dtype(self.mat_stream, self.dtypes['header'])
  97. name = self.mat_stream.read(int(data['namlen'])).strip(b'\x00')
  98. if data['mopt'] < 0 or data['mopt'] > 5000:
  99. raise ValueError('Mat 4 mopt wrong format, byteswapping problem?')
  100. M, rest = divmod(data['mopt'], 1000) # order code
  101. if M not in (0, 1):
  102. warnings.warn(f"We do not support byte ordering '{order_codes[M]}';"
  103. " returned data may be corrupt",
  104. UserWarning, stacklevel=3)
  105. O, rest = divmod(rest, 100) # unused, should be 0
  106. if O != 0:
  107. raise ValueError('O in MOPT integer should be 0, wrong format?')
  108. P, rest = divmod(rest, 10) # data type code e.g miDOUBLE (see above)
  109. T = rest # matrix type code e.g., mxFULL_CLASS (see above)
  110. dims = (data['mrows'], data['ncols'])
  111. is_complex = data['imagf'] == 1
  112. dtype = self.dtypes[P]
  113. return VarHeader4(
  114. name,
  115. dtype,
  116. T,
  117. dims,
  118. is_complex)
  119. def array_from_header(self, hdr, process=True):
  120. mclass = hdr.mclass
  121. if mclass == mxFULL_CLASS:
  122. arr = self.read_full_array(hdr)
  123. elif mclass == mxCHAR_CLASS:
  124. arr = self.read_char_array(hdr)
  125. if process and self.chars_as_strings:
  126. arr = chars_to_strings(arr)
  127. elif mclass == mxSPARSE_CLASS:
  128. # no current processing (below) makes sense for sparse
  129. return self.read_sparse_array(hdr)
  130. else:
  131. raise TypeError(f'No reader for class code {mclass}')
  132. if process and self.squeeze_me:
  133. return squeeze_element(arr)
  134. return arr
  135. def read_sub_array(self, hdr, copy=True):
  136. ''' Mat4 read using header `hdr` dtype and dims
  137. Parameters
  138. ----------
  139. hdr : object
  140. object with attributes ``dtype``, ``dims``. dtype is assumed to be
  141. the correct endianness
  142. copy : bool, optional
  143. copies array before return if True (default True)
  144. (buffer is usually read only)
  145. Returns
  146. -------
  147. arr : ndarray
  148. of dtype given by `hdr` ``dtype`` and shape given by `hdr` ``dims``
  149. '''
  150. dt = hdr.dtype
  151. # Fast product for large (>2GB) arrays.
  152. num_bytes = reduce(mul, hdr.dims, np.int64(dt.itemsize))
  153. if num_bytes > _MAX_INTP:
  154. raise ValueError(
  155. f"Variable '{hdr.name.decode('latin1')}' has byte length "
  156. f"longer than largest possible NumPy array on this platform.")
  157. buffer = self.mat_stream.read(num_bytes)
  158. if len(buffer) != num_bytes:
  159. raise ValueError(
  160. f"Not enough bytes to read matrix "
  161. f"'{hdr.name.decode('latin1')}'; is this a badly-formed file? "
  162. f"Consider listing matrices with `whosmat` and loading named "
  163. f"matrices with `variable_names` kwarg to `loadmat`")
  164. arr = np.ndarray(shape=hdr.dims,
  165. dtype=dt,
  166. buffer=buffer,
  167. order='F')
  168. if copy:
  169. arr = arr.copy()
  170. return arr
  171. def read_full_array(self, hdr):
  172. ''' Full (rather than sparse) matrix getter
  173. Read matrix (array) can be real or complex
  174. Parameters
  175. ----------
  176. hdr : ``VarHeader4`` instance
  177. Returns
  178. -------
  179. arr : ndarray
  180. complex array if ``hdr.is_complex`` is True, otherwise a real
  181. numeric array
  182. '''
  183. if hdr.is_complex:
  184. # avoid array copy to save memory
  185. res = self.read_sub_array(hdr, copy=False)
  186. res_j = self.read_sub_array(hdr, copy=False)
  187. return res + (res_j * 1j)
  188. return self.read_sub_array(hdr)
  189. def read_char_array(self, hdr):
  190. ''' latin-1 text matrix (char matrix) reader
  191. Parameters
  192. ----------
  193. hdr : ``VarHeader4`` instance
  194. Returns
  195. -------
  196. arr : ndarray
  197. with dtype 'U1', shape given by `hdr` ``dims``
  198. '''
  199. arr = self.read_sub_array(hdr).astype(np.uint8)
  200. S = arr.tobytes().decode('latin-1')
  201. return np.ndarray(shape=hdr.dims,
  202. dtype=np.dtype('U1'),
  203. buffer=np.array(S)).copy()
  204. def read_sparse_array(self, hdr):
  205. ''' Read and return sparse matrix type
  206. Parameters
  207. ----------
  208. hdr : ``VarHeader4`` instance
  209. Returns
  210. -------
  211. arr : coo_array
  212. with dtype ``float`` and shape read from the sparse array data
  213. Notes
  214. -----
  215. MATLAB 4 real sparse arrays are saved in a N+1 by 3 array format, where
  216. N is the number of non-zero values. Column 1 values [0:N] are the
  217. (1-based) row indices of the each non-zero value, column 2 [0:N] are the
  218. column indices, column 3 [0:N] are the (real) values. The last values
  219. [-1,0:2] of the rows, column indices are shape[0] and shape[1]
  220. respectively of the output matrix. The last value for the values column
  221. is a padding 0. mrows and ncols values from the header give the shape of
  222. the stored matrix, here [N+1, 3]. Complex data are saved as a 4 column
  223. matrix, where the fourth column contains the imaginary component; the
  224. last value is again 0. Complex sparse data do *not* have the header
  225. ``imagf`` field set to True; the fact that the data are complex is only
  226. detectable because there are 4 storage columns.
  227. '''
  228. res = self.read_sub_array(hdr)
  229. tmp = res[:-1,:]
  230. # All numbers are float64 in Matlab, but SciPy sparse expects int shape
  231. dims = (int(res[-1,0]), int(res[-1,1]))
  232. I = np.ascontiguousarray(tmp[:,0],dtype='intc') # fixes byte order also
  233. J = np.ascontiguousarray(tmp[:,1],dtype='intc')
  234. I -= 1 # for 1-based indexing
  235. J -= 1
  236. if res.shape[1] == 3:
  237. V = np.ascontiguousarray(tmp[:,2],dtype='float')
  238. else:
  239. V = np.ascontiguousarray(tmp[:,2],dtype='complex')
  240. V.imag = tmp[:,3]
  241. return scipy.sparse.coo_array((V,(I,J)), dims)
  242. def shape_from_header(self, hdr):
  243. '''Read the shape of the array described by the header.
  244. The file position after this call is unspecified.
  245. '''
  246. mclass = hdr.mclass
  247. if mclass == mxFULL_CLASS:
  248. shape = tuple(map(int, hdr.dims))
  249. elif mclass == mxCHAR_CLASS:
  250. shape = tuple(map(int, hdr.dims))
  251. if self.chars_as_strings:
  252. shape = shape[:-1]
  253. elif mclass == mxSPARSE_CLASS:
  254. dt = hdr.dtype
  255. dims = hdr.dims
  256. if not (len(dims) == 2 and dims[0] >= 1 and dims[1] >= 1):
  257. return ()
  258. # Read only the row and column counts
  259. self.mat_stream.seek(dt.itemsize * (dims[0] - 1), 1)
  260. rows = np.ndarray(shape=(), dtype=dt,
  261. buffer=self.mat_stream.read(dt.itemsize))
  262. self.mat_stream.seek(dt.itemsize * (dims[0] - 1), 1)
  263. cols = np.ndarray(shape=(), dtype=dt,
  264. buffer=self.mat_stream.read(dt.itemsize))
  265. shape = (int(rows), int(cols))
  266. else:
  267. raise TypeError(f'No reader for class code {mclass}')
  268. if self.squeeze_me:
  269. shape = tuple([x for x in shape if x != 1])
  270. return shape
  271. class MatFile4Reader(MatFileReader):
  272. ''' Reader for Mat4 files '''
  273. @docfiller
  274. def __init__(self, mat_stream, *args, **kwargs):
  275. ''' Initialize matlab 4 file reader
  276. %(matstream_arg)s
  277. %(load_args)s
  278. '''
  279. super().__init__(mat_stream, *args, **kwargs)
  280. self._matrix_reader = None
  281. def guess_byte_order(self):
  282. self.mat_stream.seek(0)
  283. mopt = read_dtype(self.mat_stream, np.dtype('i4'))
  284. self.mat_stream.seek(0)
  285. if mopt == 0:
  286. return '<'
  287. if mopt < 0 or mopt > 5000:
  288. # Number must have been byteswapped
  289. return SYS_LITTLE_ENDIAN and '>' or '<'
  290. # Not byteswapped
  291. return SYS_LITTLE_ENDIAN and '<' or '>'
  292. def initialize_read(self):
  293. ''' Run when beginning read of variables
  294. Sets up readers from parameters in `self`
  295. '''
  296. self.dtypes = convert_dtypes(mdtypes_template, self.byte_order)
  297. self._matrix_reader = VarReader4(self)
  298. def read_var_header(self):
  299. ''' Read and return header, next position
  300. Parameters
  301. ----------
  302. None
  303. Returns
  304. -------
  305. header : object
  306. object that can be passed to self.read_var_array, and that
  307. has attributes ``name`` and ``is_global``
  308. next_position : int
  309. position in stream of next variable
  310. '''
  311. hdr = self._matrix_reader.read_header()
  312. # Fast product for large (>2GB) arrays.
  313. remaining_bytes = reduce(mul, hdr.dims, np.int64(hdr.dtype.itemsize))
  314. if hdr.is_complex and not hdr.mclass == mxSPARSE_CLASS:
  315. remaining_bytes *= 2
  316. next_position = self.mat_stream.tell() + remaining_bytes
  317. return hdr, next_position
  318. def read_var_array(self, header, process=True):
  319. ''' Read array, given `header`
  320. Parameters
  321. ----------
  322. header : header object
  323. object with fields defining variable header
  324. process : {True, False}, optional
  325. If True, apply recursive post-processing during loading of array.
  326. Returns
  327. -------
  328. arr : array
  329. array with post-processing applied or not according to
  330. `process`.
  331. '''
  332. return self._matrix_reader.array_from_header(header, process)
  333. def get_variables(self, variable_names=None):
  334. ''' get variables from stream as dictionary
  335. Parameters
  336. ----------
  337. variable_names : None or str or sequence of str, optional
  338. variable name, or sequence of variable names to get from Mat file /
  339. file stream. If None, then get all variables in file.
  340. '''
  341. if isinstance(variable_names, str):
  342. variable_names = [variable_names]
  343. elif variable_names is not None:
  344. variable_names = list(variable_names)
  345. self.mat_stream.seek(0)
  346. # set up variable reader
  347. self.initialize_read()
  348. mdict = {}
  349. while not self.end_of_stream():
  350. hdr, next_position = self.read_var_header()
  351. name = 'None' if hdr.name is None else hdr.name.decode('latin1')
  352. if variable_names is not None and name not in variable_names:
  353. self.mat_stream.seek(next_position)
  354. continue
  355. mdict[name] = self.read_var_array(hdr)
  356. self.mat_stream.seek(next_position)
  357. if variable_names is not None:
  358. variable_names.remove(name)
  359. if len(variable_names) == 0:
  360. break
  361. return mdict
  362. def list_variables(self):
  363. ''' list variables from stream '''
  364. self.mat_stream.seek(0)
  365. # set up variable reader
  366. self.initialize_read()
  367. vars = []
  368. while not self.end_of_stream():
  369. hdr, next_position = self.read_var_header()
  370. name = 'None' if hdr.name is None else hdr.name.decode('latin1')
  371. shape = self._matrix_reader.shape_from_header(hdr)
  372. info = mclass_info.get(hdr.mclass, 'unknown')
  373. vars.append((name, shape, info))
  374. self.mat_stream.seek(next_position)
  375. return vars
  376. def arr_to_2d(arr, oned_as='row'):
  377. ''' Make ``arr`` exactly two dimensional
  378. If `arr` has more than 2 dimensions, raise a ValueError
  379. Parameters
  380. ----------
  381. arr : array
  382. oned_as : {'row', 'column'}, optional
  383. Whether to reshape 1-D vectors as row vectors or column vectors.
  384. See documentation for ``matdims`` for more detail
  385. Returns
  386. -------
  387. arr2d : array
  388. 2-D version of the array
  389. '''
  390. dims = matdims(arr, oned_as)
  391. if len(dims) > 2:
  392. raise ValueError('Matlab 4 files cannot save arrays with more than '
  393. '2 dimensions')
  394. return arr.reshape(dims)
  395. class VarWriter4:
  396. def __init__(self, file_writer):
  397. self.file_stream = file_writer.file_stream
  398. self.oned_as = file_writer.oned_as
  399. def write_bytes(self, arr):
  400. self.file_stream.write(arr.tobytes(order='F'))
  401. def write_string(self, s):
  402. self.file_stream.write(s)
  403. def write_header(self, name, shape, P=miDOUBLE, T=mxFULL_CLASS, imagf=0):
  404. ''' Write header for given data options
  405. Parameters
  406. ----------
  407. name : str
  408. name of variable
  409. shape : sequence
  410. Shape of array as it will be read in matlab
  411. P : int, optional
  412. code for mat4 data type, one of ``miDOUBLE, miSINGLE, miINT32,
  413. miINT16, miUINT16, miUINT8``
  414. T : int, optional
  415. code for mat4 matrix class, one of ``mxFULL_CLASS, mxCHAR_CLASS,
  416. mxSPARSE_CLASS``
  417. imagf : int, optional
  418. flag indicating complex
  419. '''
  420. header = np.empty((), mdtypes_template['header'])
  421. M = not SYS_LITTLE_ENDIAN
  422. O = 0
  423. header['mopt'] = (M * 1000 +
  424. O * 100 +
  425. P * 10 +
  426. T)
  427. header['mrows'] = shape[0]
  428. header['ncols'] = shape[1]
  429. header['imagf'] = imagf
  430. header['namlen'] = len(name) + 1
  431. self.write_bytes(header)
  432. data = name + '\0'
  433. self.write_string(data.encode('latin1'))
  434. def write(self, arr, name):
  435. ''' Write matrix `arr`, with name `name`
  436. Parameters
  437. ----------
  438. arr : array_like
  439. array to write
  440. name : str
  441. name in matlab workspace
  442. '''
  443. # we need to catch sparse first, because np.asarray returns an
  444. # an object array for scipy.sparse
  445. if scipy.sparse.issparse(arr):
  446. self.write_sparse(arr, name)
  447. return
  448. arr = np.asarray(arr)
  449. dt = arr.dtype
  450. if not dt.isnative:
  451. arr = arr.astype(dt.newbyteorder('='))
  452. dtt = dt.type
  453. if dtt is np.object_:
  454. raise TypeError('Cannot save object arrays in Mat4')
  455. elif dtt is np.void:
  456. raise TypeError('Cannot save void type arrays')
  457. elif dtt in (np.str_, np.bytes_):
  458. self.write_char(arr, name)
  459. return
  460. self.write_numeric(arr, name)
  461. def write_numeric(self, arr, name):
  462. arr = arr_to_2d(arr, self.oned_as)
  463. imagf = arr.dtype.kind == 'c'
  464. try:
  465. P = np_to_mtypes[arr.dtype.str[1:]]
  466. except KeyError:
  467. if imagf:
  468. arr = arr.astype('c128')
  469. else:
  470. arr = arr.astype('f8')
  471. P = miDOUBLE
  472. self.write_header(name,
  473. arr.shape,
  474. P=P,
  475. T=mxFULL_CLASS,
  476. imagf=imagf)
  477. if imagf:
  478. self.write_bytes(arr.real)
  479. self.write_bytes(arr.imag)
  480. else:
  481. self.write_bytes(arr)
  482. def write_char(self, arr, name):
  483. if arr.dtype.type == np.str_ and arr.dtype.itemsize != np.dtype('U1').itemsize:
  484. arr = arr_to_chars(arr)
  485. arr = arr_to_2d(arr, self.oned_as)
  486. dims = arr.shape
  487. self.write_header(
  488. name,
  489. dims,
  490. P=miUINT8,
  491. T=mxCHAR_CLASS)
  492. if arr.dtype.kind == 'U':
  493. # Recode unicode to latin1
  494. n_chars = math.prod(dims)
  495. st_arr = np.ndarray(shape=(),
  496. dtype=arr_dtype_number(arr, n_chars),
  497. buffer=arr)
  498. st = st_arr.item().encode('latin-1')
  499. arr = np.ndarray(shape=dims, dtype='S1', buffer=st)
  500. self.write_bytes(arr)
  501. def write_sparse(self, arr, name):
  502. ''' Sparse matrices are 2-D
  503. See docstring for VarReader4.read_sparse_array
  504. '''
  505. A = arr.tocoo() # convert to sparse COO format (ijv)
  506. imagf = A.dtype.kind == 'c'
  507. ijv = np.zeros((A.nnz + 1, 3+imagf), dtype='f8')
  508. ijv[:-1,0] = A.row
  509. ijv[:-1,1] = A.col
  510. ijv[:-1,0:2] += 1 # 1 based indexing
  511. if imagf:
  512. ijv[:-1,2] = A.data.real
  513. ijv[:-1,3] = A.data.imag
  514. else:
  515. ijv[:-1,2] = A.data
  516. ijv[-1,0:2] = A.shape
  517. self.write_header(
  518. name,
  519. ijv.shape,
  520. P=miDOUBLE,
  521. T=mxSPARSE_CLASS)
  522. self.write_bytes(ijv)
  523. class MatFile4Writer:
  524. ''' Class for writing matlab 4 format files '''
  525. def __init__(self, file_stream, oned_as=None):
  526. self.file_stream = file_stream
  527. if oned_as is None:
  528. oned_as = 'row'
  529. self.oned_as = oned_as
  530. self._matrix_writer = None
  531. def put_variables(self, mdict, write_header=None):
  532. ''' Write variables in `mdict` to stream
  533. Parameters
  534. ----------
  535. mdict : mapping
  536. mapping with method ``items`` return name, contents pairs
  537. where ``name`` which will appeak in the matlab workspace in
  538. file load, and ``contents`` is something writeable to a
  539. matlab file, such as a NumPy array.
  540. write_header : {None, True, False}
  541. If True, then write the matlab file header before writing the
  542. variables. If None (the default) then write the file header
  543. if we are at position 0 in the stream. By setting False
  544. here, and setting the stream position to the end of the file,
  545. you can append variables to a matlab file
  546. '''
  547. # there is no header for a matlab 4 mat file, so we ignore the
  548. # ``write_header`` input argument. It's there for compatibility
  549. # with the matlab 5 version of this method
  550. self._matrix_writer = VarWriter4(self)
  551. for name, var in mdict.items():
  552. self._matrix_writer.write(var, name)