_csr.py 18 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568
  1. """Compressed Sparse Row matrix format"""
  2. __docformat__ = "restructuredtext en"
  3. __all__ = ['csr_array', 'csr_matrix', 'isspmatrix_csr']
  4. import numpy as np
  5. from ._matrix import spmatrix
  6. from ._base import _spbase, sparray
  7. from ._sparsetools import (csr_tocsc, csr_tobsr, csr_count_blocks,
  8. get_csr_submatrix, csr_sample_values)
  9. from ._sputils import upcast
  10. from ._compressed import _cs_matrix
  11. class _csr_base(_cs_matrix):
  12. _format = 'csr'
  13. _allow_nd = (1, 2)
  14. def transpose(self, axes=None, copy=False):
  15. if axes is not None and axes != (1, 0):
  16. raise ValueError("Sparse arrays/matrices do not support "
  17. "an 'axes' parameter because swapping "
  18. "dimensions is the only logical permutation.")
  19. if self.ndim == 1:
  20. return self.copy() if copy else self
  21. M, N = self.shape
  22. return self._csc_container((self.data, self.indices,
  23. self.indptr), shape=(N, M), copy=copy)
  24. transpose.__doc__ = _spbase.transpose.__doc__
  25. def tolil(self, copy=False):
  26. if self.ndim != 2:
  27. raise ValueError("Cannot convert a 1d sparse array to lil format")
  28. lil = self._lil_container(self.shape, dtype=self.dtype)
  29. self.sum_duplicates()
  30. ptr,ind,dat = self.indptr,self.indices,self.data
  31. rows, data = lil.rows, lil.data
  32. for n in range(self.shape[0]):
  33. start = ptr[n]
  34. end = ptr[n+1]
  35. rows[n] = ind[start:end].tolist()
  36. data[n] = dat[start:end].tolist()
  37. return lil
  38. tolil.__doc__ = _spbase.tolil.__doc__
  39. def tocsr(self, copy=False):
  40. if copy:
  41. return self.copy()
  42. else:
  43. return self
  44. tocsr.__doc__ = _spbase.tocsr.__doc__
  45. def tocoo(self, copy=False):
  46. A = super().tocoo(copy=copy)
  47. # CSR-to-COO conversion always preserves [non-]canonicity
  48. # (indices sorting, presense of duplicate elements).
  49. # Handled here instead of _cs_matrix because CSC-to-COO generally does not.
  50. A.has_canonical_format = self.has_canonical_format
  51. return A
  52. tocoo.__doc__ = _spbase.tocoo.__doc__
  53. def tocsc(self, copy=False):
  54. if self.ndim != 2:
  55. raise ValueError("Cannot convert a 1d sparse array to csc format")
  56. M, N = self.shape
  57. idx_dtype = self._get_index_dtype((self.indptr, self.indices),
  58. maxval=max(self.nnz, M))
  59. indptr = np.empty(N + 1, dtype=idx_dtype)
  60. indices = np.empty(self.nnz, dtype=idx_dtype)
  61. data = np.empty(self.nnz, dtype=upcast(self.dtype))
  62. csr_tocsc(M, N,
  63. self.indptr.astype(idx_dtype, copy=False),
  64. self.indices.astype(idx_dtype, copy=False),
  65. self.data,
  66. indptr,
  67. indices,
  68. data)
  69. A = self._csc_container((data, indices, indptr), shape=self.shape)
  70. A.has_sorted_indices = True
  71. return A
  72. tocsc.__doc__ = _spbase.tocsc.__doc__
  73. def tobsr(self, blocksize=None, copy=True):
  74. if self.ndim != 2:
  75. raise ValueError("Cannot convert a 1d sparse array to bsr format")
  76. if blocksize is None:
  77. from ._spfuncs import estimate_blocksize
  78. return self.tobsr(blocksize=estimate_blocksize(self))
  79. elif blocksize == (1,1):
  80. arg1 = (self.data.reshape(-1,1,1),self.indices,self.indptr)
  81. return self._bsr_container(arg1, shape=self.shape, copy=copy)
  82. else:
  83. R,C = blocksize
  84. M,N = self.shape
  85. if R < 1 or C < 1 or M % R != 0 or N % C != 0:
  86. raise ValueError(f'invalid blocksize {blocksize}')
  87. blks = csr_count_blocks(M,N,R,C,self.indptr,self.indices)
  88. idx_dtype = self._get_index_dtype((self.indptr, self.indices),
  89. maxval=max(N//C, blks))
  90. indptr = np.empty(M//R+1, dtype=idx_dtype)
  91. indices = np.empty(blks, dtype=idx_dtype)
  92. data = np.zeros((blks,R,C), dtype=self.dtype)
  93. csr_tobsr(M, N, R, C,
  94. self.indptr.astype(idx_dtype, copy=False),
  95. self.indices.astype(idx_dtype, copy=False),
  96. self.data,
  97. indptr, indices, data.ravel())
  98. return self._bsr_container(
  99. (data, indices, indptr), shape=self.shape
  100. )
  101. tobsr.__doc__ = _spbase.tobsr.__doc__
  102. # these functions are used by the parent class (_cs_matrix)
  103. # to remove redundancy between csc_matrix and csr_array
  104. @staticmethod
  105. def _swap(x):
  106. """swap the members of x if this is a column-oriented matrix
  107. """
  108. return x
  109. def __iter__(self):
  110. if self.ndim == 1:
  111. zero = self.dtype.type(0)
  112. u = 0
  113. for v, d in zip(self.indices, self.data):
  114. for _ in range(v - u):
  115. yield zero
  116. yield d
  117. u = v + 1
  118. for _ in range(self.shape[0] - u):
  119. yield zero
  120. return
  121. indptr = np.zeros(2, dtype=self.indptr.dtype)
  122. # return 1d (sparray) or 2drow (spmatrix)
  123. shape = self.shape[1:] if isinstance(self, sparray) else (1, self.shape[1])
  124. i0 = 0
  125. for i1 in self.indptr[1:]:
  126. indptr[1] = i1 - i0
  127. indices = self.indices[i0:i1]
  128. data = self.data[i0:i1]
  129. yield self.__class__((data, indices, indptr), shape=shape, copy=True)
  130. i0 = i1
  131. def _getrow(self, i):
  132. """Returns a copy of row i of the matrix, as a (1 x n)
  133. CSR matrix (row vector).
  134. """
  135. if self.ndim == 1:
  136. if i not in (0, -1):
  137. raise IndexError(f'index ({i}) out of range')
  138. return self.reshape((1, self.shape[0]), copy=True)
  139. M, N = self.shape
  140. i = int(i)
  141. if i < 0:
  142. i += M
  143. if i < 0 or i >= M:
  144. raise IndexError(f'index ({i}) out of range')
  145. indptr, indices, data = get_csr_submatrix(
  146. M, N, self.indptr, self.indices, self.data, i, i + 1, 0, N)
  147. return self.__class__((data, indices, indptr), shape=(1, N),
  148. dtype=self.dtype, copy=False)
  149. def _getcol(self, i):
  150. """Returns a copy of column i. A (m x 1) sparse array (column vector).
  151. """
  152. if self.ndim == 1:
  153. raise ValueError("getcol not provided for 1d arrays. Use indexing A[j]")
  154. M, N = self.shape
  155. i = int(i)
  156. if i < 0:
  157. i += N
  158. if i < 0 or i >= N:
  159. raise IndexError(f'index ({i}) out of range')
  160. indptr, indices, data = get_csr_submatrix(
  161. M, N, self.indptr, self.indices, self.data, 0, M, i, i + 1)
  162. return self.__class__((data, indices, indptr), shape=(M, 1),
  163. dtype=self.dtype, copy=False)
  164. def _get_int(self, idx):
  165. spot = np.flatnonzero(self.indices == idx)
  166. if spot.size:
  167. return self.data[spot[0]]
  168. return self.data.dtype.type(0)
  169. def _get_slice(self, idx):
  170. if idx == slice(None):
  171. return self.copy()
  172. if idx.step in (1, None):
  173. ret = self._get_submatrix(0, idx, copy=True)
  174. return ret.reshape(ret.shape[-1])
  175. return self._minor_slice(idx)
  176. def _get_array(self, idx):
  177. idx_dtype = self._get_index_dtype(self.indices)
  178. idx = np.asarray(idx, dtype=idx_dtype)
  179. if idx.size == 0:
  180. return self.__class__([], dtype=self.dtype)
  181. M, N = 1, self.shape[0]
  182. row = np.zeros_like(idx, dtype=idx_dtype)
  183. col = np.asarray(idx, dtype=idx_dtype)
  184. val = np.empty(row.size, dtype=self.dtype)
  185. csr_sample_values(M, N, self.indptr, self.indices, self.data,
  186. row.size, row, col, val)
  187. new_shape = col.shape if col.shape[0] > 1 else (col.shape[0],)
  188. return self.__class__(val.reshape(new_shape))
  189. def _get_intXarray(self, row, col):
  190. return self._getrow(row)._minor_index_fancy(col)
  191. def _get_intXslice(self, row, col):
  192. if col.step in (1, None):
  193. return self._get_submatrix(row, col, copy=True)
  194. # TODO: uncomment this once it's faster:
  195. # return self._getrow(row)._minor_slice(col)
  196. M, N = self.shape
  197. start, stop, stride = col.indices(N)
  198. ii, jj = self.indptr[row:row+2]
  199. row_indices = self.indices[ii:jj]
  200. row_data = self.data[ii:jj]
  201. if stride > 0:
  202. ind = (row_indices >= start) & (row_indices < stop)
  203. else:
  204. ind = (row_indices <= start) & (row_indices > stop)
  205. if abs(stride) > 1:
  206. ind &= (row_indices - start) % stride == 0
  207. row_indices = (row_indices[ind] - start) // stride
  208. row_data = row_data[ind]
  209. row_indptr = np.array([0, len(row_indices)])
  210. if stride < 0:
  211. row_data = row_data[::-1]
  212. row_indices = abs(row_indices[::-1])
  213. shape = (1, max(0, int(np.ceil(float(stop - start) / stride))))
  214. return self.__class__((row_data, row_indices, row_indptr), shape=shape,
  215. dtype=self.dtype, copy=False)
  216. def _get_sliceXint(self, row, col):
  217. if row.step in (1, None):
  218. return self._get_submatrix(row, col, copy=True)
  219. return self._major_slice(row)._get_submatrix(minor=col)
  220. def _get_sliceXarray(self, row, col):
  221. return self._major_slice(row)._minor_index_fancy(col)
  222. def _get_arrayXint(self, row, col):
  223. res = self._major_index_fancy(row)._get_submatrix(minor=col)
  224. if row.ndim > 1:
  225. return res.reshape(row.shape)
  226. return res
  227. def _get_arrayXslice(self, row, col):
  228. if col.step not in (1, None):
  229. col = np.arange(*col.indices(self.shape[1]))
  230. return self._get_arrayXarray(row, col)
  231. return self._major_index_fancy(row)._get_submatrix(minor=col)
  232. def _set_int(self, idx, x):
  233. self._set_many(0, idx, x)
  234. def _set_array(self, idx, x):
  235. x = np.broadcast_to(x, idx.shape)
  236. self._set_many(np.zeros_like(idx), idx, x)
  237. def isspmatrix_csr(x):
  238. """Is `x` of csr_matrix type?
  239. Parameters
  240. ----------
  241. x
  242. object to check for being a csr matrix
  243. Returns
  244. -------
  245. bool
  246. True if `x` is a csr matrix, False otherwise
  247. Examples
  248. --------
  249. >>> from scipy.sparse import csr_array, csr_matrix, coo_matrix, isspmatrix_csr
  250. >>> isspmatrix_csr(csr_matrix([[5]]))
  251. True
  252. >>> isspmatrix_csr(csr_array([[5]]))
  253. False
  254. >>> isspmatrix_csr(coo_matrix([[5]]))
  255. False
  256. """
  257. return isinstance(x, csr_matrix)
  258. # This namespace class separates array from matrix with isinstance
  259. class csr_array(_csr_base, sparray):
  260. """
  261. Compressed Sparse Row array.
  262. This can be instantiated in several ways:
  263. csr_array(D)
  264. where D is a 2-D ndarray
  265. csr_array(S)
  266. with another sparse array or matrix S (equivalent to S.tocsr())
  267. csr_array((M, N), [dtype])
  268. to construct an empty array with shape (M, N)
  269. dtype is optional, defaulting to dtype='d'.
  270. csr_array((data, (row_ind, col_ind)), [shape=(M, N)])
  271. where ``data``, ``row_ind`` and ``col_ind`` satisfy the
  272. relationship ``a[row_ind[k], col_ind[k]] = data[k]``.
  273. csr_array((data, indices, indptr), [shape=(M, N)])
  274. is the standard CSR representation where the column indices for
  275. row i are stored in ``indices[indptr[i]:indptr[i+1]]`` and their
  276. corresponding values are stored in ``data[indptr[i]:indptr[i+1]]``.
  277. If the shape parameter is not supplied, the array dimensions
  278. are inferred from the index arrays.
  279. Attributes
  280. ----------
  281. dtype : dtype
  282. Data type of the array
  283. shape : 2-tuple
  284. Shape of the array
  285. ndim : int
  286. Number of dimensions (this is always 2)
  287. nnz
  288. size
  289. data
  290. CSR format data array of the array
  291. indices
  292. CSR format index array of the array
  293. indptr
  294. CSR format index pointer array of the array
  295. has_sorted_indices
  296. has_canonical_format
  297. T
  298. Notes
  299. -----
  300. Sparse arrays can be used in arithmetic operations: they support
  301. addition, subtraction, multiplication, division, and matrix power.
  302. Advantages of the CSR format
  303. - efficient arithmetic operations CSR + CSR, CSR * CSR, etc.
  304. - efficient row slicing
  305. - fast matrix vector products
  306. Disadvantages of the CSR format
  307. - slow column slicing operations (consider CSC)
  308. - changes to the sparsity structure are expensive (consider LIL or DOK)
  309. Canonical Format
  310. - Within each row, indices are sorted by column.
  311. - There are no duplicate entries.
  312. Examples
  313. --------
  314. >>> import numpy as np
  315. >>> from scipy.sparse import csr_array
  316. >>> csr_array((3, 4), dtype=np.int8).toarray()
  317. array([[0, 0, 0, 0],
  318. [0, 0, 0, 0],
  319. [0, 0, 0, 0]], dtype=int8)
  320. >>> row = np.array([0, 0, 1, 2, 2, 2])
  321. >>> col = np.array([0, 2, 2, 0, 1, 2])
  322. >>> data = np.array([1, 2, 3, 4, 5, 6])
  323. >>> csr_array((data, (row, col)), shape=(3, 3)).toarray()
  324. array([[1, 0, 2],
  325. [0, 0, 3],
  326. [4, 5, 6]])
  327. >>> indptr = np.array([0, 2, 3, 6])
  328. >>> indices = np.array([0, 2, 2, 0, 1, 2])
  329. >>> data = np.array([1, 2, 3, 4, 5, 6])
  330. >>> csr_array((data, indices, indptr), shape=(3, 3)).toarray()
  331. array([[1, 0, 2],
  332. [0, 0, 3],
  333. [4, 5, 6]])
  334. Duplicate entries are summed together:
  335. >>> row = np.array([0, 1, 2, 0])
  336. >>> col = np.array([0, 1, 1, 0])
  337. >>> data = np.array([1, 2, 4, 8])
  338. >>> csr_array((data, (row, col)), shape=(3, 3)).toarray()
  339. array([[9, 0, 0],
  340. [0, 2, 0],
  341. [0, 4, 0]])
  342. As an example of how to construct a CSR array incrementally,
  343. the following snippet builds a term-document array from texts:
  344. >>> docs = [["hello", "world", "hello"], ["goodbye", "cruel", "world"]]
  345. >>> indptr = [0]
  346. >>> indices = []
  347. >>> data = []
  348. >>> vocabulary = {}
  349. >>> for d in docs:
  350. ... for term in d:
  351. ... index = vocabulary.setdefault(term, len(vocabulary))
  352. ... indices.append(index)
  353. ... data.append(1)
  354. ... indptr.append(len(indices))
  355. ...
  356. >>> csr_array((data, indices, indptr), dtype=int).toarray()
  357. array([[2, 1, 0, 0],
  358. [0, 1, 1, 1]])
  359. """
  360. class csr_matrix(spmatrix, _csr_base):
  361. """
  362. Compressed Sparse Row matrix.
  363. This can be instantiated in several ways:
  364. csr_matrix(D)
  365. where D is a 2-D ndarray
  366. csr_matrix(S)
  367. with another sparse array or matrix S (equivalent to S.tocsr())
  368. csr_matrix((M, N), [dtype])
  369. to construct an empty matrix with shape (M, N)
  370. dtype is optional, defaulting to dtype='d'.
  371. csr_matrix((data, (row_ind, col_ind)), [shape=(M, N)])
  372. where ``data``, ``row_ind`` and ``col_ind`` satisfy the
  373. relationship ``a[row_ind[k], col_ind[k]] = data[k]``.
  374. csr_matrix((data, indices, indptr), [shape=(M, N)])
  375. is the standard CSR representation where the column indices for
  376. row i are stored in ``indices[indptr[i]:indptr[i+1]]`` and their
  377. corresponding values are stored in ``data[indptr[i]:indptr[i+1]]``.
  378. If the shape parameter is not supplied, the matrix dimensions
  379. are inferred from the index arrays.
  380. Attributes
  381. ----------
  382. dtype : dtype
  383. Data type of the matrix
  384. shape : 2-tuple
  385. Shape of the matrix
  386. ndim : int
  387. Number of dimensions (this is always 2)
  388. nnz
  389. size
  390. data
  391. CSR format data array of the matrix
  392. indices
  393. CSR format index array of the matrix
  394. indptr
  395. CSR format index pointer array of the matrix
  396. has_sorted_indices
  397. has_canonical_format
  398. T
  399. Notes
  400. -----
  401. Sparse matrices can be used in arithmetic operations: they support
  402. addition, subtraction, multiplication, division, and matrix power.
  403. Advantages of the CSR format
  404. - efficient arithmetic operations CSR + CSR, CSR * CSR, etc.
  405. - efficient row slicing
  406. - fast matrix vector products
  407. Disadvantages of the CSR format
  408. - slow column slicing operations (consider CSC)
  409. - changes to the sparsity structure are expensive (consider LIL or DOK)
  410. Canonical Format
  411. - Within each row, indices are sorted by column.
  412. - There are no duplicate entries.
  413. Examples
  414. --------
  415. >>> import numpy as np
  416. >>> from scipy.sparse import csr_matrix
  417. >>> csr_matrix((3, 4), dtype=np.int8).toarray()
  418. array([[0, 0, 0, 0],
  419. [0, 0, 0, 0],
  420. [0, 0, 0, 0]], dtype=int8)
  421. >>> row = np.array([0, 0, 1, 2, 2, 2])
  422. >>> col = np.array([0, 2, 2, 0, 1, 2])
  423. >>> data = np.array([1, 2, 3, 4, 5, 6])
  424. >>> csr_matrix((data, (row, col)), shape=(3, 3)).toarray()
  425. array([[1, 0, 2],
  426. [0, 0, 3],
  427. [4, 5, 6]])
  428. >>> indptr = np.array([0, 2, 3, 6])
  429. >>> indices = np.array([0, 2, 2, 0, 1, 2])
  430. >>> data = np.array([1, 2, 3, 4, 5, 6])
  431. >>> csr_matrix((data, indices, indptr), shape=(3, 3)).toarray()
  432. array([[1, 0, 2],
  433. [0, 0, 3],
  434. [4, 5, 6]])
  435. Duplicate entries are summed together:
  436. >>> row = np.array([0, 1, 2, 0])
  437. >>> col = np.array([0, 1, 1, 0])
  438. >>> data = np.array([1, 2, 4, 8])
  439. >>> csr_matrix((data, (row, col)), shape=(3, 3)).toarray()
  440. array([[9, 0, 0],
  441. [0, 2, 0],
  442. [0, 4, 0]])
  443. As an example of how to construct a CSR matrix incrementally,
  444. the following snippet builds a term-document matrix from texts:
  445. >>> docs = [["hello", "world", "hello"], ["goodbye", "cruel", "world"]]
  446. >>> indptr = [0]
  447. >>> indices = []
  448. >>> data = []
  449. >>> vocabulary = {}
  450. >>> for d in docs:
  451. ... for term in d:
  452. ... index = vocabulary.setdefault(term, len(vocabulary))
  453. ... indices.append(index)
  454. ... data.append(1)
  455. ... indptr.append(len(indices))
  456. ...
  457. >>> csr_matrix((data, indices, indptr), dtype=int).toarray()
  458. array([[2, 1, 0, 0],
  459. [0, 1, 1, 1]])
  460. """