hb.py 19 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571
  1. """
  2. Implementation of Harwell-Boeing read/write.
  3. At the moment not the full Harwell-Boeing format is supported. Supported
  4. features are:
  5. - assembled, non-symmetric, real matrices
  6. - integer for pointer/indices
  7. - exponential format for float values, and int format
  8. """
  9. # TODO:
  10. # - Add more support (symmetric/complex matrices, non-assembled matrices ?)
  11. # XXX: reading is reasonably efficient (>= 85 % is in numpy.fromstring), but
  12. # takes a lot of memory. Being faster would require compiled code.
  13. # write is not efficient. Although not a terribly exciting task,
  14. # having reusable facilities to efficiently read/write fortran-formatted files
  15. # would be useful outside this module.
  16. import warnings
  17. import numpy as np
  18. from scipy.sparse import csc_array, csc_matrix
  19. from ._fortran_format_parser import FortranFormatParser, IntFormat, ExpFormat
  20. __all__ = ["hb_read", "hb_write"]
  21. class MalformedHeader(Exception):
  22. pass
  23. class LineOverflow(Warning):
  24. pass
  25. def _nbytes_full(fmt, nlines):
  26. """Return the number of bytes to read to get every full lines for the
  27. given parsed fortran format."""
  28. return (fmt.repeat * fmt.width + 1) * (nlines - 1)
  29. class HBInfo:
  30. @classmethod
  31. def from_data(cls, m, title="Default title", key="0", mxtype=None, fmt=None):
  32. """Create a HBInfo instance from an existing sparse matrix.
  33. Parameters
  34. ----------
  35. m : sparse array or matrix
  36. the HBInfo instance will derive its parameters from m
  37. title : str
  38. Title to put in the HB header
  39. key : str
  40. Key
  41. mxtype : HBMatrixType
  42. type of the input matrix
  43. fmt : dict
  44. not implemented
  45. Returns
  46. -------
  47. hb_info : HBInfo instance
  48. """
  49. m = m.tocsc(copy=False)
  50. pointer = m.indptr
  51. indices = m.indices
  52. values = m.data
  53. nrows, ncols = m.shape
  54. nnon_zeros = m.nnz
  55. if fmt is None:
  56. # +1 because HB use one-based indexing (Fortran), and we will write
  57. # the indices /pointer as such
  58. pointer_fmt = IntFormat.from_number(np.max(pointer+1))
  59. indices_fmt = IntFormat.from_number(np.max(indices+1))
  60. if values.dtype.kind in np.typecodes["AllFloat"]:
  61. values_fmt = ExpFormat.from_number(-np.max(np.abs(values)))
  62. elif values.dtype.kind in np.typecodes["AllInteger"]:
  63. values_fmt = IntFormat.from_number(-np.max(np.abs(values)))
  64. else:
  65. message = f"type {values.dtype.kind} not implemented yet"
  66. raise NotImplementedError(message)
  67. else:
  68. raise NotImplementedError("fmt argument not supported yet.")
  69. if mxtype is None:
  70. if not np.isrealobj(values):
  71. raise ValueError("Complex values not supported yet")
  72. if values.dtype.kind in np.typecodes["AllInteger"]:
  73. tp = "integer"
  74. elif values.dtype.kind in np.typecodes["AllFloat"]:
  75. tp = "real"
  76. else:
  77. raise NotImplementedError(
  78. f"type {values.dtype} for values not implemented")
  79. mxtype = HBMatrixType(tp, "unsymmetric", "assembled")
  80. else:
  81. raise ValueError("mxtype argument not handled yet.")
  82. def _nlines(fmt, size):
  83. nlines = size // fmt.repeat
  84. if nlines * fmt.repeat != size:
  85. nlines += 1
  86. return nlines
  87. pointer_nlines = _nlines(pointer_fmt, pointer.size)
  88. indices_nlines = _nlines(indices_fmt, indices.size)
  89. values_nlines = _nlines(values_fmt, values.size)
  90. total_nlines = pointer_nlines + indices_nlines + values_nlines
  91. return cls(title, key,
  92. total_nlines, pointer_nlines, indices_nlines, values_nlines,
  93. mxtype, nrows, ncols, nnon_zeros,
  94. pointer_fmt.fortran_format, indices_fmt.fortran_format,
  95. values_fmt.fortran_format)
  96. @classmethod
  97. def from_file(cls, fid):
  98. """Create a HBInfo instance from a file object containing a matrix in the
  99. HB format.
  100. Parameters
  101. ----------
  102. fid : file-like matrix
  103. File or file-like object containing a matrix in the HB format.
  104. Returns
  105. -------
  106. hb_info : HBInfo instance
  107. """
  108. # First line
  109. line = fid.readline().strip("\n")
  110. if not len(line) > 72:
  111. raise ValueError("Expected at least 72 characters for first line, "
  112. f"got: \n{line}")
  113. title = line[:72]
  114. key = line[72:]
  115. # Second line
  116. line = fid.readline().strip("\n")
  117. if not len(line.rstrip()) >= 56:
  118. raise ValueError("Expected at least 56 characters for second line, "
  119. f"got: \n{line}")
  120. total_nlines = _expect_int(line[:14])
  121. pointer_nlines = _expect_int(line[14:28])
  122. indices_nlines = _expect_int(line[28:42])
  123. values_nlines = _expect_int(line[42:56])
  124. rhs_nlines = line[56:72].strip()
  125. if rhs_nlines == '':
  126. rhs_nlines = 0
  127. else:
  128. rhs_nlines = _expect_int(rhs_nlines)
  129. if not rhs_nlines == 0:
  130. raise ValueError("Only files without right hand side supported for "
  131. "now.")
  132. # Third line
  133. line = fid.readline().strip("\n")
  134. if not len(line) >= 70:
  135. raise ValueError(f"Expected at least 72 character for third line, "
  136. f"got:\n{line}")
  137. mxtype_s = line[:3].upper()
  138. if not len(mxtype_s) == 3:
  139. raise ValueError("mxtype expected to be 3 characters long")
  140. mxtype = HBMatrixType.from_fortran(mxtype_s)
  141. if mxtype.value_type not in ["real", "integer"]:
  142. raise ValueError("Only real or integer matrices supported for "
  143. f"now (detected {mxtype})")
  144. if not mxtype.structure == "unsymmetric":
  145. raise ValueError("Only unsymmetric matrices supported for "
  146. f"now (detected {mxtype})")
  147. if not mxtype.storage == "assembled":
  148. raise ValueError("Only assembled matrices supported for now")
  149. if not line[3:14] == " " * 11:
  150. raise ValueError(f"Malformed data for third line: {line}")
  151. nrows = _expect_int(line[14:28])
  152. ncols = _expect_int(line[28:42])
  153. nnon_zeros = _expect_int(line[42:56])
  154. nelementals = _expect_int(line[56:70])
  155. if not nelementals == 0:
  156. raise ValueError(
  157. f"Unexpected value {nelementals} for nltvl (last entry of line 3)"
  158. )
  159. # Fourth line
  160. line = fid.readline().strip("\n")
  161. ct = line.split()
  162. if not len(ct) == 3:
  163. raise ValueError(f"Expected 3 formats, got {ct}")
  164. return cls(title, key,
  165. total_nlines, pointer_nlines, indices_nlines, values_nlines,
  166. mxtype, nrows, ncols, nnon_zeros,
  167. ct[0], ct[1], ct[2],
  168. rhs_nlines, nelementals)
  169. def __init__(self, title, key,
  170. total_nlines, pointer_nlines, indices_nlines, values_nlines,
  171. mxtype, nrows, ncols, nnon_zeros,
  172. pointer_format_str, indices_format_str, values_format_str,
  173. right_hand_sides_nlines=0, nelementals=0):
  174. """Do not use this directly, but the class ctrs (from_* functions)."""
  175. if title is None:
  176. title = "No Title"
  177. if len(title) > 72:
  178. raise ValueError("title cannot be > 72 characters")
  179. if key is None:
  180. key = "|No Key"
  181. if len(key) > 8:
  182. warnings.warn(f"key is > 8 characters (key is {key})",
  183. LineOverflow, stacklevel=3)
  184. self.title = title
  185. self.key = key
  186. self.total_nlines = total_nlines
  187. self.pointer_nlines = pointer_nlines
  188. self.indices_nlines = indices_nlines
  189. self.values_nlines = values_nlines
  190. parser = FortranFormatParser()
  191. pointer_format = parser.parse(pointer_format_str)
  192. if not isinstance(pointer_format, IntFormat):
  193. raise ValueError("Expected int format for pointer format, got "
  194. f"{pointer_format}")
  195. indices_format = parser.parse(indices_format_str)
  196. if not isinstance(indices_format, IntFormat):
  197. raise ValueError("Expected int format for indices format, got "
  198. f"{indices_format}")
  199. values_format = parser.parse(values_format_str)
  200. if isinstance(values_format, ExpFormat):
  201. if mxtype.value_type not in ["real", "complex"]:
  202. raise ValueError(f"Inconsistency between matrix type {mxtype} and "
  203. f"value type {values_format}")
  204. values_dtype = np.float64
  205. elif isinstance(values_format, IntFormat):
  206. if mxtype.value_type not in ["integer"]:
  207. raise ValueError(f"Inconsistency between matrix type {mxtype} and "
  208. f"value type {values_format}")
  209. # XXX: fortran int -> dtype association ?
  210. values_dtype = int
  211. else:
  212. raise ValueError(f"Unsupported format for values {values_format!r}")
  213. self.pointer_format = pointer_format
  214. self.indices_format = indices_format
  215. self.values_format = values_format
  216. self.pointer_dtype = np.int32
  217. self.indices_dtype = np.int32
  218. self.values_dtype = values_dtype
  219. self.pointer_nlines = pointer_nlines
  220. self.pointer_nbytes_full = _nbytes_full(pointer_format, pointer_nlines)
  221. self.indices_nlines = indices_nlines
  222. self.indices_nbytes_full = _nbytes_full(indices_format, indices_nlines)
  223. self.values_nlines = values_nlines
  224. self.values_nbytes_full = _nbytes_full(values_format, values_nlines)
  225. self.nrows = nrows
  226. self.ncols = ncols
  227. self.nnon_zeros = nnon_zeros
  228. self.nelementals = nelementals
  229. self.mxtype = mxtype
  230. def dump(self):
  231. """Gives the header corresponding to this instance as a string."""
  232. header = [self.title.ljust(72) + self.key.ljust(8)]
  233. header.append(f"{self.total_nlines:14d}{self.pointer_nlines:14d}{self.indices_nlines:14d}{self.values_nlines:14d}")
  234. header.append(f"{self.mxtype.fortran_format.ljust(14):14s}{self.nrows:14d}{self.ncols:14d}{self.nnon_zeros:14d}{0:14d}")
  235. pffmt = self.pointer_format.fortran_format
  236. iffmt = self.indices_format.fortran_format
  237. vffmt = self.values_format.fortran_format
  238. header.append(f"{pffmt.ljust(16):16s}{iffmt.ljust(16):16s}{vffmt.ljust(20):20s}")
  239. return "\n".join(header)
  240. def _expect_int(value, msg=None):
  241. try:
  242. return int(value)
  243. except ValueError as e:
  244. if msg is None:
  245. msg = "Expected an int, got %s"
  246. raise ValueError(msg % value) from e
  247. def _read_hb_data(content, header):
  248. # XXX: look at a way to reduce memory here (big string creation)
  249. ptr_string = "".join([content.read(header.pointer_nbytes_full),
  250. content.readline()])
  251. ptr = np.fromstring(ptr_string,
  252. dtype=int, sep=' ')
  253. ind_string = "".join([content.read(header.indices_nbytes_full),
  254. content.readline()])
  255. ind = np.fromstring(ind_string,
  256. dtype=int, sep=' ')
  257. val_string = "".join([content.read(header.values_nbytes_full),
  258. content.readline()])
  259. val = np.fromstring(val_string,
  260. dtype=header.values_dtype, sep=' ')
  261. return csc_array((val, ind-1, ptr-1), shape=(header.nrows, header.ncols))
  262. def _write_data(m, fid, header):
  263. m = m.tocsc(copy=False)
  264. def write_array(f, ar, nlines, fmt):
  265. # ar_nlines is the number of full lines, n is the number of items per
  266. # line, ffmt the fortran format
  267. pyfmt = fmt.python_format
  268. pyfmt_full = pyfmt * fmt.repeat
  269. # for each array to write, we first write the full lines, and special
  270. # case for partial line
  271. full = ar[:(nlines - 1) * fmt.repeat]
  272. for row in full.reshape((nlines-1, fmt.repeat)):
  273. f.write(pyfmt_full % tuple(row) + "\n")
  274. nremain = ar.size - full.size
  275. if nremain > 0:
  276. f.write((pyfmt * nremain) % tuple(ar[ar.size - nremain:]) + "\n")
  277. fid.write(header.dump())
  278. fid.write("\n")
  279. # +1 is for Fortran one-based indexing
  280. write_array(fid, m.indptr+1, header.pointer_nlines,
  281. header.pointer_format)
  282. write_array(fid, m.indices+1, header.indices_nlines,
  283. header.indices_format)
  284. write_array(fid, m.data, header.values_nlines,
  285. header.values_format)
  286. class HBMatrixType:
  287. """Class to hold the matrix type."""
  288. # q2f* translates qualified names to Fortran character
  289. _q2f_type = {
  290. "real": "R",
  291. "complex": "C",
  292. "pattern": "P",
  293. "integer": "I",
  294. }
  295. _q2f_structure = {
  296. "symmetric": "S",
  297. "unsymmetric": "U",
  298. "hermitian": "H",
  299. "skewsymmetric": "Z",
  300. "rectangular": "R"
  301. }
  302. _q2f_storage = {
  303. "assembled": "A",
  304. "elemental": "E",
  305. }
  306. _f2q_type = {j: i for i, j in _q2f_type.items()}
  307. _f2q_structure = {j: i for i, j in _q2f_structure.items()}
  308. _f2q_storage = {j: i for i, j in _q2f_storage.items()}
  309. @classmethod
  310. def from_fortran(cls, fmt):
  311. if not len(fmt) == 3:
  312. raise ValueError("Fortran format for matrix type should be 3 "
  313. "characters long")
  314. try:
  315. value_type = cls._f2q_type[fmt[0]]
  316. structure = cls._f2q_structure[fmt[1]]
  317. storage = cls._f2q_storage[fmt[2]]
  318. return cls(value_type, structure, storage)
  319. except KeyError as e:
  320. raise ValueError(f"Unrecognized format {fmt}") from e
  321. def __init__(self, value_type, structure, storage="assembled"):
  322. self.value_type = value_type
  323. self.structure = structure
  324. self.storage = storage
  325. if value_type not in self._q2f_type:
  326. raise ValueError(f"Unrecognized type {value_type}")
  327. if structure not in self._q2f_structure:
  328. raise ValueError(f"Unrecognized structure {structure}")
  329. if storage not in self._q2f_storage:
  330. raise ValueError(f"Unrecognized storage {storage}")
  331. @property
  332. def fortran_format(self):
  333. return self._q2f_type[self.value_type] + \
  334. self._q2f_structure[self.structure] + \
  335. self._q2f_storage[self.storage]
  336. def __repr__(self):
  337. return f"HBMatrixType({self.value_type}, {self.structure}, {self.storage})"
  338. class HBFile:
  339. def __init__(self, file, hb_info=None):
  340. """Create a HBFile instance.
  341. Parameters
  342. ----------
  343. file : file-object
  344. StringIO work as well
  345. hb_info : HBInfo, optional
  346. Should be given as an argument for writing, in which case the file
  347. should be writable.
  348. """
  349. self._fid = file
  350. if hb_info is None:
  351. self._hb_info = HBInfo.from_file(file)
  352. else:
  353. #raise OSError("file %s is not writable, and hb_info "
  354. # "was given." % file)
  355. self._hb_info = hb_info
  356. @property
  357. def title(self):
  358. return self._hb_info.title
  359. @property
  360. def key(self):
  361. return self._hb_info.key
  362. @property
  363. def type(self):
  364. return self._hb_info.mxtype.value_type
  365. @property
  366. def structure(self):
  367. return self._hb_info.mxtype.structure
  368. @property
  369. def storage(self):
  370. return self._hb_info.mxtype.storage
  371. def read_matrix(self):
  372. return _read_hb_data(self._fid, self._hb_info)
  373. def write_matrix(self, m):
  374. return _write_data(m, self._fid, self._hb_info)
  375. def hb_read(path_or_open_file, *, spmatrix=True):
  376. """Read HB-format file.
  377. Parameters
  378. ----------
  379. path_or_open_file : path-like or file-like
  380. If a file-like object, it is used as-is. Otherwise, it is opened
  381. before reading.
  382. spmatrix : bool, optional (default: True)
  383. If ``True``, return sparse matrix. Otherwise return sparse array.
  384. Returns
  385. -------
  386. data : csc_array or csc_matrix
  387. The data read from the HB file as a sparse array.
  388. Notes
  389. -----
  390. At the moment not the full Harwell-Boeing format is supported. Supported
  391. features are:
  392. - assembled, non-symmetric, real matrices
  393. - integer for pointer/indices
  394. - exponential format for float values, and int format
  395. Examples
  396. --------
  397. We can read and write a harwell-boeing format file:
  398. >>> from scipy.io import hb_read, hb_write
  399. >>> from scipy.sparse import csr_array, eye
  400. >>> data = csr_array(eye(3)) # create a sparse array
  401. >>> hb_write("data.hb", data) # write a hb file
  402. >>> print(hb_read("data.hb", spmatrix=False)) # read a hb file
  403. <Compressed Sparse Column sparse array of dtype 'float64'
  404. with 3 stored elements and shape (3, 3)>
  405. Coords Values
  406. (0, 0) 1.0
  407. (1, 1) 1.0
  408. (2, 2) 1.0
  409. """
  410. def _get_matrix(fid):
  411. hb = HBFile(fid)
  412. return hb.read_matrix()
  413. if hasattr(path_or_open_file, 'read'):
  414. data = _get_matrix(path_or_open_file)
  415. else:
  416. with open(path_or_open_file) as f:
  417. data = _get_matrix(f)
  418. if spmatrix:
  419. return csc_matrix(data)
  420. return data
  421. def hb_write(path_or_open_file, m, hb_info=None):
  422. """Write HB-format file.
  423. Parameters
  424. ----------
  425. path_or_open_file : path-like or file-like
  426. If a file-like object, it is used as-is. Otherwise, it is opened
  427. before writing.
  428. m : sparse array or matrix
  429. the sparse array to write
  430. hb_info : HBInfo
  431. contains the meta-data for write
  432. Returns
  433. -------
  434. None
  435. Notes
  436. -----
  437. At the moment not the full Harwell-Boeing format is supported. Supported
  438. features are:
  439. - assembled, non-symmetric, real matrices
  440. - integer for pointer/indices
  441. - exponential format for float values, and int format
  442. Examples
  443. --------
  444. We can read and write a harwell-boeing format file:
  445. >>> from scipy.io import hb_read, hb_write
  446. >>> from scipy.sparse import csr_array, eye
  447. >>> data = csr_array(eye(3)) # create a sparse array
  448. >>> hb_write("data.hb", data) # write a hb file
  449. >>> print(hb_read("data.hb", spmatrix=False)) # read a hb file
  450. <Compressed Sparse Column sparse array of dtype 'float64'
  451. with 3 stored elements and shape (3, 3)>
  452. Coords Values
  453. (0, 0) 1.0
  454. (1, 1) 1.0
  455. (2, 2) 1.0
  456. """
  457. m = m.tocsc(copy=False)
  458. if hb_info is None:
  459. hb_info = HBInfo.from_data(m)
  460. def _set_matrix(fid):
  461. hb = HBFile(fid, hb_info)
  462. return hb.write_matrix(m)
  463. if hasattr(path_or_open_file, 'write'):
  464. return _set_matrix(path_or_open_file)
  465. else:
  466. with open(path_or_open_file, 'w') as f:
  467. return _set_matrix(f)