_miobase.py 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435
# Authors: Travis Oliphant, Matthew Brett

"""
Base classes for MATLAB file stream reading.

MATLAB is a registered trademark of The MathWorks, Inc.
"""
  6. from typing import Final
  7. import numpy as np
  8. from scipy._lib import doccer
  9. from . import _byteordercodes as boc
  10. __all__ = [
  11. 'MatReadError', 'MatReadWarning', 'MatWriteError', 'MatWriteWarning',
  12. ]
  13. class MatReadError(Exception):
  14. """Exception indicating a read issue."""
  15. class MatWriteError(Exception):
  16. """Exception indicating a write issue."""
  17. class MatReadWarning(UserWarning):
  18. """Warning class for read issues."""
  19. class MatWriteWarning(UserWarning):
  20. """Warning class for write issues."""
  21. doc_dict = \
  22. {'file_arg':
  23. '''file_name : str
  24. Name of the mat file (do not need .mat extension if
  25. appendmat==True) Can also pass open file-like object.''',
  26. 'append_arg':
  27. '''appendmat : bool, optional
  28. True to append the .mat extension to the end of the given
  29. filename, if not already present. Default is True.''',
  30. 'load_args':
  31. '''byte_order : str or None, optional
  32. None by default, implying byte order guessed from mat
  33. file. Otherwise can be one of ('native', '=', 'little', '<',
  34. 'BIG', '>').
  35. mat_dtype : bool, optional
  36. If True, return arrays in same dtype as would be loaded into
  37. MATLAB (instead of the dtype with which they are saved).
  38. squeeze_me : bool, optional
  39. Whether to squeeze unit matrix dimensions or not.
  40. chars_as_strings : bool, optional
  41. Whether to convert char arrays to string arrays.
  42. matlab_compatible : bool, optional
  43. Returns matrices as would be loaded by MATLAB (implies
  44. squeeze_me=False, chars_as_strings=False, mat_dtype=True,
  45. struct_as_record=True).''',
  46. 'struct_arg':
  47. '''struct_as_record : bool, optional
  48. Whether to load MATLAB structs as NumPy record arrays, or as
  49. old-style NumPy arrays with dtype=object. Setting this flag to
  50. False replicates the behavior of SciPy version 0.7.x (returning
  51. numpy object arrays). The default setting is True, because it
  52. allows easier round-trip load and save of MATLAB files.''',
  53. 'matstream_arg':
  54. '''mat_stream : file-like
  55. Object with file API, open for reading.''',
  56. 'long_fields':
  57. '''long_field_names : bool, optional
  58. * False - maximum field name length in a structure is 31 characters
  59. which is the documented maximum length. This is the default.
  60. * True - maximum field name length in a structure is 63 characters
  61. which works for MATLAB 7.6''',
  62. 'do_compression':
  63. '''do_compression : bool, optional
  64. Whether to compress matrices on write. Default is False.''',
  65. 'oned_as':
  66. '''oned_as : {'row', 'column'}, optional
  67. If 'column', write 1-D NumPy arrays as column vectors.
  68. If 'row', write 1D NumPy arrays as row vectors.''',
  69. 'unicode_strings':
  70. '''unicode_strings : bool, optional
  71. If True, write strings as Unicode, else MATLAB usual encoding.'''}
  72. docfiller: Final = doccer.filldoc(doc_dict)
'''
Note on architecture
====================

There are three sets of parameters relevant for reading files. The
first are *file read parameters* - containing options that are common
for reading the whole file, and therefore every variable within that
file. At the moment these are:

* mat_stream
* dtypes (derived from byte code)
* byte_order
* chars_as_strings
* squeeze_me
* struct_as_record (MATLAB 5 files)
* class_dtypes (derived from order code, MATLAB 5 files)
* codecs (MATLAB 5 files)
* uint16_codec (MATLAB 5 files)

Another set of parameters are those that apply only to the current
variable being read - the *header*:

* header related variables (different for v4 and v5 mat files)
* is_complex
* mclass
* var_stream

With the header, we need ``next_position`` to tell us where the next
variable in the stream is.

Then, for each element in a matrix, there can be *element read
parameters*. An element is, for example, one element in a MATLAB cell
array. At the moment, these are:

* mat_dtype

The file-reading object contains the *file read parameters*. The
*header* is passed around as a data object, or may be read and discarded
in a single function. The *element read parameters* - the mat_dtype in
this instance - are passed into a general post-processing function - see
``mio_utils`` for details.
'''
  107. def convert_dtypes(dtype_template, order_code):
  108. ''' Convert dtypes in mapping to given order
  109. Parameters
  110. ----------
  111. dtype_template : mapping
  112. mapping with values returning numpy dtype from ``np.dtype(val)``
  113. order_code : str
  114. an order code suitable for using in ``dtype.newbyteorder()``
  115. Returns
  116. -------
  117. dtypes : mapping
  118. mapping where values have been replaced by
  119. ``np.dtype(val).newbyteorder(order_code)``
  120. '''
  121. dtypes = dtype_template.copy()
  122. for k in dtypes:
  123. dtypes[k] = np.dtype(dtypes[k]).newbyteorder(order_code)
  124. return dtypes
  125. def read_dtype(mat_stream, a_dtype):
  126. """
  127. Generic get of byte stream data of known type
  128. Parameters
  129. ----------
  130. mat_stream : file_like object
  131. MATLAB (tm) mat file stream
  132. a_dtype : dtype
  133. dtype of array to read. `a_dtype` is assumed to be correct
  134. endianness.
  135. Returns
  136. -------
  137. arr : ndarray
  138. Array of dtype `a_dtype` read from stream.
  139. """
  140. num_bytes = a_dtype.itemsize
  141. arr = np.ndarray(shape=(),
  142. dtype=a_dtype,
  143. buffer=mat_stream.read(num_bytes),
  144. order='F')
  145. return arr
  146. def matfile_version(file_name, *, appendmat=True):
  147. """
  148. Return major, minor tuple depending on apparent mat file type
  149. Where:
  150. #. 0,x -> version 4 format mat files
  151. #. 1,x -> version 5 format mat files
  152. #. 2,x -> version 7.3 format mat files (HDF format)
  153. Parameters
  154. ----------
  155. file_name : str
  156. Name of the mat file (do not need .mat extension if
  157. appendmat==True). Can also pass open file-like object.
  158. appendmat : bool, optional
  159. True to append the .mat extension to the end of the given
  160. filename, if not already present. Default is True.
  161. Returns
  162. -------
  163. major_version : {0, 1, 2}
  164. major MATLAB File format version
  165. minor_version : int
  166. minor MATLAB file format version
  167. Raises
  168. ------
  169. MatReadError
  170. If the file is empty.
  171. ValueError
  172. The matfile version is unknown.
  173. Notes
  174. -----
  175. Has the side effect of setting the file read pointer to 0
  176. """
  177. from ._mio import _open_file_context
  178. with _open_file_context(file_name, appendmat=appendmat) as fileobj:
  179. return _get_matfile_version(fileobj)
  180. get_matfile_version = matfile_version
  181. _HDR_N_BYTES = 20
  182. def _get_matfile_version(fileobj):
  183. # Mat4 files have a zero somewhere in first 4 bytes
  184. fileobj.seek(0)
  185. hdr_bytes = fileobj.read(_HDR_N_BYTES)
  186. if len(hdr_bytes) < _HDR_N_BYTES:
  187. raise MatReadError("Mat file appears to be truncated")
  188. if hdr_bytes.count(0) == _HDR_N_BYTES:
  189. raise MatReadError("Mat file appears to be corrupt "
  190. f"(first {_HDR_N_BYTES} bytes == 0)")
  191. mopt_ints = np.ndarray(shape=(4,), dtype=np.uint8, buffer=hdr_bytes[:4])
  192. if 0 in mopt_ints:
  193. fileobj.seek(0)
  194. return (0,0)
  195. # For 5 format or 7.3 format we need to read an integer in the
  196. # header. Bytes 124 through 128 contain a version integer and an
  197. # endian test string
  198. fileobj.seek(124)
  199. tst_str = fileobj.read(4)
  200. fileobj.seek(0)
  201. maj_ind = int(tst_str[2] == b'I'[0])
  202. maj_val = int(tst_str[maj_ind])
  203. min_val = int(tst_str[1 - maj_ind])
  204. ret = (maj_val, min_val)
  205. if maj_val in (1, 2):
  206. return ret
  207. raise ValueError('Unknown mat file type, version {}, {}'.format(*ret))
  208. def matdims(arr, oned_as='column'):
  209. """
  210. Determine equivalent MATLAB dimensions for given array
  211. Parameters
  212. ----------
  213. arr : ndarray
  214. Input array
  215. oned_as : {'column', 'row'}, optional
  216. Whether 1-D arrays are returned as MATLAB row or column matrices.
  217. Default is 'column'.
  218. Returns
  219. -------
  220. dims : tuple
  221. Shape tuple, in the form MATLAB expects it.
  222. Notes
  223. -----
  224. We had to decide what shape a 1 dimensional array would be by
  225. default. ``np.atleast_2d`` thinks it is a row vector. The
  226. default for a vector in MATLAB (e.g., ``>> 1:12``) is a row vector.
  227. Versions of scipy up to and including 0.11 resulted (accidentally)
  228. in 1-D arrays being read as column vectors. For the moment, we
  229. maintain the same tradition here.
  230. Examples
  231. --------
  232. >>> import numpy as np
  233. >>> from scipy.io.matlab._miobase import matdims
  234. >>> matdims(np.array(1)) # NumPy scalar
  235. (1, 1)
  236. >>> matdims(np.array([1])) # 1-D array, 1 element
  237. (1, 1)
  238. >>> matdims(np.array([1,2])) # 1-D array, 2 elements
  239. (2, 1)
  240. >>> matdims(np.array([[2],[3]])) # 2-D array, column vector
  241. (2, 1)
  242. >>> matdims(np.array([[2,3]])) # 2-D array, row vector
  243. (1, 2)
  244. >>> matdims(np.array([[[2,3]]])) # 3-D array, rowish vector
  245. (1, 1, 2)
  246. >>> matdims(np.array([])) # empty 1-D array
  247. (0, 0)
  248. >>> matdims(np.array([[]])) # empty 2-D array
  249. (0, 0)
  250. >>> matdims(np.array([[[]]])) # empty 3-D array
  251. (0, 0, 0)
  252. Optional argument flips 1-D shape behavior.
  253. >>> matdims(np.array([1,2]), 'row') # 1-D array, 2 elements
  254. (1, 2)
  255. The argument has to make sense though
  256. >>> matdims(np.array([1,2]), 'bizarre')
  257. Traceback (most recent call last):
  258. ...
  259. ValueError: 1-D option "bizarre" is strange
  260. """
  261. shape = arr.shape
  262. if shape == (): # scalar
  263. return (1, 1)
  264. if len(shape) == 1: # 1D
  265. if shape[0] == 0:
  266. return (0, 0)
  267. elif oned_as == 'column':
  268. return shape + (1,)
  269. elif oned_as == 'row':
  270. return (1,) + shape
  271. else:
  272. raise ValueError(f'1-D option "{oned_as}" is strange')
  273. return shape
  274. class MatVarReader:
  275. ''' Abstract class defining required interface for var readers'''
  276. def __init__(self, file_reader):
  277. pass
  278. def read_header(self):
  279. ''' Returns header '''
  280. pass
  281. def array_from_header(self, header):
  282. ''' Reads array given header '''
  283. pass
  284. class MatFileReader:
  285. """ Base object for reading mat files
  286. To make this class functional, you will need to override the
  287. following methods:
  288. matrix_getter_factory - gives object to fetch next matrix from stream
  289. guess_byte_order - guesses file byte order from file
  290. """
  291. @docfiller
  292. def __init__(self, mat_stream,
  293. byte_order=None,
  294. mat_dtype=False,
  295. squeeze_me=False,
  296. chars_as_strings=True,
  297. matlab_compatible=False,
  298. struct_as_record=True,
  299. verify_compressed_data_integrity=True,
  300. simplify_cells=False):
  301. '''
  302. Initializer for mat file reader
  303. mat_stream : file-like
  304. object with file API, open for reading
  305. %(load_args)s
  306. '''
  307. # Initialize stream
  308. self.mat_stream = mat_stream
  309. self.dtypes = {}
  310. if not byte_order:
  311. byte_order = self.guess_byte_order()
  312. else:
  313. byte_order = boc.to_numpy_code(byte_order)
  314. self.byte_order = byte_order
  315. self.struct_as_record = struct_as_record
  316. if matlab_compatible:
  317. self.set_matlab_compatible()
  318. else:
  319. self.squeeze_me = squeeze_me
  320. self.chars_as_strings = chars_as_strings
  321. self.mat_dtype = mat_dtype
  322. self.verify_compressed_data_integrity = verify_compressed_data_integrity
  323. self.simplify_cells = simplify_cells
  324. if simplify_cells:
  325. self.squeeze_me = True
  326. self.struct_as_record = False
  327. def set_matlab_compatible(self):
  328. ''' Sets options to return arrays as MATLAB loads them '''
  329. self.mat_dtype = True
  330. self.squeeze_me = False
  331. self.chars_as_strings = False
  332. def guess_byte_order(self):
  333. ''' As we do not know what file type we have, assume native '''
  334. return boc.native_code
  335. def end_of_stream(self):
  336. b = self.mat_stream.read(1)
  337. curpos = self.mat_stream.tell()
  338. self.mat_stream.seek(curpos-1)
  339. return len(b) == 0
  340. def arr_dtype_number(arr, num):
  341. ''' Return dtype for given number of items per element'''
  342. return np.dtype(arr.dtype.str[:2] + str(num))
  343. def arr_to_chars(arr):
  344. ''' Convert string array to char array '''
  345. dims = list(arr.shape)
  346. if not dims:
  347. dims = [1]
  348. dims.append(int(arr.dtype.str[2:]))
  349. arr = np.ndarray(shape=dims,
  350. dtype=arr_dtype_number(arr, 1),
  351. buffer=arr)
  352. empties = [arr == np.array('', dtype=arr.dtype)]
  353. if not np.any(empties):
  354. return arr
  355. arr = arr.copy()
  356. arr[tuple(empties)] = ' '
  357. return arr