| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568 |
- """Compressed Sparse Row matrix format"""
- __docformat__ = "restructuredtext en"
- __all__ = ['csr_array', 'csr_matrix', 'isspmatrix_csr']
- import numpy as np
- from ._matrix import spmatrix
- from ._base import _spbase, sparray
- from ._sparsetools import (csr_tocsc, csr_tobsr, csr_count_blocks,
- get_csr_submatrix, csr_sample_values)
- from ._sputils import upcast
- from ._compressed import _cs_matrix
- class _csr_base(_cs_matrix):
- _format = 'csr'
- _allow_nd = (1, 2)
- def transpose(self, axes=None, copy=False):
- if axes is not None and axes != (1, 0):
- raise ValueError("Sparse arrays/matrices do not support "
- "an 'axes' parameter because swapping "
- "dimensions is the only logical permutation.")
- if self.ndim == 1:
- return self.copy() if copy else self
- M, N = self.shape
- return self._csc_container((self.data, self.indices,
- self.indptr), shape=(N, M), copy=copy)
- transpose.__doc__ = _spbase.transpose.__doc__
- def tolil(self, copy=False):
- if self.ndim != 2:
- raise ValueError("Cannot convert a 1d sparse array to lil format")
- lil = self._lil_container(self.shape, dtype=self.dtype)
- self.sum_duplicates()
- ptr,ind,dat = self.indptr,self.indices,self.data
- rows, data = lil.rows, lil.data
- for n in range(self.shape[0]):
- start = ptr[n]
- end = ptr[n+1]
- rows[n] = ind[start:end].tolist()
- data[n] = dat[start:end].tolist()
- return lil
- tolil.__doc__ = _spbase.tolil.__doc__
- def tocsr(self, copy=False):
- if copy:
- return self.copy()
- else:
- return self
- tocsr.__doc__ = _spbase.tocsr.__doc__
- def tocoo(self, copy=False):
- A = super().tocoo(copy=copy)
- # CSR-to-COO conversion always preserves [non-]canonicity
- # (indices sorting, presense of duplicate elements).
- # Handled here instead of _cs_matrix because CSC-to-COO generally does not.
- A.has_canonical_format = self.has_canonical_format
- return A
- tocoo.__doc__ = _spbase.tocoo.__doc__
- def tocsc(self, copy=False):
- if self.ndim != 2:
- raise ValueError("Cannot convert a 1d sparse array to csc format")
- M, N = self.shape
- idx_dtype = self._get_index_dtype((self.indptr, self.indices),
- maxval=max(self.nnz, M))
- indptr = np.empty(N + 1, dtype=idx_dtype)
- indices = np.empty(self.nnz, dtype=idx_dtype)
- data = np.empty(self.nnz, dtype=upcast(self.dtype))
- csr_tocsc(M, N,
- self.indptr.astype(idx_dtype, copy=False),
- self.indices.astype(idx_dtype, copy=False),
- self.data,
- indptr,
- indices,
- data)
- A = self._csc_container((data, indices, indptr), shape=self.shape)
- A.has_sorted_indices = True
- return A
- tocsc.__doc__ = _spbase.tocsc.__doc__
- def tobsr(self, blocksize=None, copy=True):
- if self.ndim != 2:
- raise ValueError("Cannot convert a 1d sparse array to bsr format")
- if blocksize is None:
- from ._spfuncs import estimate_blocksize
- return self.tobsr(blocksize=estimate_blocksize(self))
- elif blocksize == (1,1):
- arg1 = (self.data.reshape(-1,1,1),self.indices,self.indptr)
- return self._bsr_container(arg1, shape=self.shape, copy=copy)
- else:
- R,C = blocksize
- M,N = self.shape
- if R < 1 or C < 1 or M % R != 0 or N % C != 0:
- raise ValueError(f'invalid blocksize {blocksize}')
- blks = csr_count_blocks(M,N,R,C,self.indptr,self.indices)
- idx_dtype = self._get_index_dtype((self.indptr, self.indices),
- maxval=max(N//C, blks))
- indptr = np.empty(M//R+1, dtype=idx_dtype)
- indices = np.empty(blks, dtype=idx_dtype)
- data = np.zeros((blks,R,C), dtype=self.dtype)
- csr_tobsr(M, N, R, C,
- self.indptr.astype(idx_dtype, copy=False),
- self.indices.astype(idx_dtype, copy=False),
- self.data,
- indptr, indices, data.ravel())
- return self._bsr_container(
- (data, indices, indptr), shape=self.shape
- )
- tobsr.__doc__ = _spbase.tobsr.__doc__
- # these functions are used by the parent class (_cs_matrix)
- # to remove redundancy between csc_matrix and csr_array
- @staticmethod
- def _swap(x):
- """swap the members of x if this is a column-oriented matrix
- """
- return x
- def __iter__(self):
- if self.ndim == 1:
- zero = self.dtype.type(0)
- u = 0
- for v, d in zip(self.indices, self.data):
- for _ in range(v - u):
- yield zero
- yield d
- u = v + 1
- for _ in range(self.shape[0] - u):
- yield zero
- return
- indptr = np.zeros(2, dtype=self.indptr.dtype)
- # return 1d (sparray) or 2drow (spmatrix)
- shape = self.shape[1:] if isinstance(self, sparray) else (1, self.shape[1])
- i0 = 0
- for i1 in self.indptr[1:]:
- indptr[1] = i1 - i0
- indices = self.indices[i0:i1]
- data = self.data[i0:i1]
- yield self.__class__((data, indices, indptr), shape=shape, copy=True)
- i0 = i1
- def _getrow(self, i):
- """Returns a copy of row i of the matrix, as a (1 x n)
- CSR matrix (row vector).
- """
- if self.ndim == 1:
- if i not in (0, -1):
- raise IndexError(f'index ({i}) out of range')
- return self.reshape((1, self.shape[0]), copy=True)
- M, N = self.shape
- i = int(i)
- if i < 0:
- i += M
- if i < 0 or i >= M:
- raise IndexError(f'index ({i}) out of range')
- indptr, indices, data = get_csr_submatrix(
- M, N, self.indptr, self.indices, self.data, i, i + 1, 0, N)
- return self.__class__((data, indices, indptr), shape=(1, N),
- dtype=self.dtype, copy=False)
- def _getcol(self, i):
- """Returns a copy of column i. A (m x 1) sparse array (column vector).
- """
- if self.ndim == 1:
- raise ValueError("getcol not provided for 1d arrays. Use indexing A[j]")
- M, N = self.shape
- i = int(i)
- if i < 0:
- i += N
- if i < 0 or i >= N:
- raise IndexError(f'index ({i}) out of range')
- indptr, indices, data = get_csr_submatrix(
- M, N, self.indptr, self.indices, self.data, 0, M, i, i + 1)
- return self.__class__((data, indices, indptr), shape=(M, 1),
- dtype=self.dtype, copy=False)
- def _get_int(self, idx):
- spot = np.flatnonzero(self.indices == idx)
- if spot.size:
- return self.data[spot[0]]
- return self.data.dtype.type(0)
- def _get_slice(self, idx):
- if idx == slice(None):
- return self.copy()
- if idx.step in (1, None):
- ret = self._get_submatrix(0, idx, copy=True)
- return ret.reshape(ret.shape[-1])
- return self._minor_slice(idx)
- def _get_array(self, idx):
- idx_dtype = self._get_index_dtype(self.indices)
- idx = np.asarray(idx, dtype=idx_dtype)
- if idx.size == 0:
- return self.__class__([], dtype=self.dtype)
- M, N = 1, self.shape[0]
- row = np.zeros_like(idx, dtype=idx_dtype)
- col = np.asarray(idx, dtype=idx_dtype)
- val = np.empty(row.size, dtype=self.dtype)
- csr_sample_values(M, N, self.indptr, self.indices, self.data,
- row.size, row, col, val)
- new_shape = col.shape if col.shape[0] > 1 else (col.shape[0],)
- return self.__class__(val.reshape(new_shape))
- def _get_intXarray(self, row, col):
- return self._getrow(row)._minor_index_fancy(col)
- def _get_intXslice(self, row, col):
- if col.step in (1, None):
- return self._get_submatrix(row, col, copy=True)
- # TODO: uncomment this once it's faster:
- # return self._getrow(row)._minor_slice(col)
- M, N = self.shape
- start, stop, stride = col.indices(N)
- ii, jj = self.indptr[row:row+2]
- row_indices = self.indices[ii:jj]
- row_data = self.data[ii:jj]
- if stride > 0:
- ind = (row_indices >= start) & (row_indices < stop)
- else:
- ind = (row_indices <= start) & (row_indices > stop)
- if abs(stride) > 1:
- ind &= (row_indices - start) % stride == 0
- row_indices = (row_indices[ind] - start) // stride
- row_data = row_data[ind]
- row_indptr = np.array([0, len(row_indices)])
- if stride < 0:
- row_data = row_data[::-1]
- row_indices = abs(row_indices[::-1])
- shape = (1, max(0, int(np.ceil(float(stop - start) / stride))))
- return self.__class__((row_data, row_indices, row_indptr), shape=shape,
- dtype=self.dtype, copy=False)
- def _get_sliceXint(self, row, col):
- if row.step in (1, None):
- return self._get_submatrix(row, col, copy=True)
- return self._major_slice(row)._get_submatrix(minor=col)
- def _get_sliceXarray(self, row, col):
- return self._major_slice(row)._minor_index_fancy(col)
- def _get_arrayXint(self, row, col):
- res = self._major_index_fancy(row)._get_submatrix(minor=col)
- if row.ndim > 1:
- return res.reshape(row.shape)
- return res
- def _get_arrayXslice(self, row, col):
- if col.step not in (1, None):
- col = np.arange(*col.indices(self.shape[1]))
- return self._get_arrayXarray(row, col)
- return self._major_index_fancy(row)._get_submatrix(minor=col)
- def _set_int(self, idx, x):
- self._set_many(0, idx, x)
- def _set_array(self, idx, x):
- x = np.broadcast_to(x, idx.shape)
- self._set_many(np.zeros_like(idx), idx, x)
- def isspmatrix_csr(x):
- """Is `x` of csr_matrix type?
- Parameters
- ----------
- x
- object to check for being a csr matrix
- Returns
- -------
- bool
- True if `x` is a csr matrix, False otherwise
- Examples
- --------
- >>> from scipy.sparse import csr_array, csr_matrix, coo_matrix, isspmatrix_csr
- >>> isspmatrix_csr(csr_matrix([[5]]))
- True
- >>> isspmatrix_csr(csr_array([[5]]))
- False
- >>> isspmatrix_csr(coo_matrix([[5]]))
- False
- """
- return isinstance(x, csr_matrix)
- # This namespace class separates array from matrix with isinstance
- class csr_array(_csr_base, sparray):
- """
- Compressed Sparse Row array.
- This can be instantiated in several ways:
- csr_array(D)
- where D is a 2-D ndarray
- csr_array(S)
- with another sparse array or matrix S (equivalent to S.tocsr())
- csr_array((M, N), [dtype])
- to construct an empty array with shape (M, N)
- dtype is optional, defaulting to dtype='d'.
- csr_array((data, (row_ind, col_ind)), [shape=(M, N)])
- where ``data``, ``row_ind`` and ``col_ind`` satisfy the
- relationship ``a[row_ind[k], col_ind[k]] = data[k]``.
- csr_array((data, indices, indptr), [shape=(M, N)])
- is the standard CSR representation where the column indices for
- row i are stored in ``indices[indptr[i]:indptr[i+1]]`` and their
- corresponding values are stored in ``data[indptr[i]:indptr[i+1]]``.
- If the shape parameter is not supplied, the array dimensions
- are inferred from the index arrays.
- Attributes
- ----------
- dtype : dtype
- Data type of the array
- shape : 2-tuple
- Shape of the array
- ndim : int
- Number of dimensions (this is always 2)
- nnz
- size
- data
- CSR format data array of the array
- indices
- CSR format index array of the array
- indptr
- CSR format index pointer array of the array
- has_sorted_indices
- has_canonical_format
- T
- Notes
- -----
- Sparse arrays can be used in arithmetic operations: they support
- addition, subtraction, multiplication, division, and matrix power.
- Advantages of the CSR format
- - efficient arithmetic operations CSR + CSR, CSR * CSR, etc.
- - efficient row slicing
- - fast matrix vector products
- Disadvantages of the CSR format
- - slow column slicing operations (consider CSC)
- - changes to the sparsity structure are expensive (consider LIL or DOK)
- Canonical Format
- - Within each row, indices are sorted by column.
- - There are no duplicate entries.
- Examples
- --------
- >>> import numpy as np
- >>> from scipy.sparse import csr_array
- >>> csr_array((3, 4), dtype=np.int8).toarray()
- array([[0, 0, 0, 0],
- [0, 0, 0, 0],
- [0, 0, 0, 0]], dtype=int8)
- >>> row = np.array([0, 0, 1, 2, 2, 2])
- >>> col = np.array([0, 2, 2, 0, 1, 2])
- >>> data = np.array([1, 2, 3, 4, 5, 6])
- >>> csr_array((data, (row, col)), shape=(3, 3)).toarray()
- array([[1, 0, 2],
- [0, 0, 3],
- [4, 5, 6]])
- >>> indptr = np.array([0, 2, 3, 6])
- >>> indices = np.array([0, 2, 2, 0, 1, 2])
- >>> data = np.array([1, 2, 3, 4, 5, 6])
- >>> csr_array((data, indices, indptr), shape=(3, 3)).toarray()
- array([[1, 0, 2],
- [0, 0, 3],
- [4, 5, 6]])
- Duplicate entries are summed together:
- >>> row = np.array([0, 1, 2, 0])
- >>> col = np.array([0, 1, 1, 0])
- >>> data = np.array([1, 2, 4, 8])
- >>> csr_array((data, (row, col)), shape=(3, 3)).toarray()
- array([[9, 0, 0],
- [0, 2, 0],
- [0, 4, 0]])
- As an example of how to construct a CSR array incrementally,
- the following snippet builds a term-document array from texts:
- >>> docs = [["hello", "world", "hello"], ["goodbye", "cruel", "world"]]
- >>> indptr = [0]
- >>> indices = []
- >>> data = []
- >>> vocabulary = {}
- >>> for d in docs:
- ... for term in d:
- ... index = vocabulary.setdefault(term, len(vocabulary))
- ... indices.append(index)
- ... data.append(1)
- ... indptr.append(len(indices))
- ...
- >>> csr_array((data, indices, indptr), dtype=int).toarray()
- array([[2, 1, 0, 0],
- [0, 1, 1, 1]])
- """
- class csr_matrix(spmatrix, _csr_base):
- """
- Compressed Sparse Row matrix.
- This can be instantiated in several ways:
- csr_matrix(D)
- where D is a 2-D ndarray
- csr_matrix(S)
- with another sparse array or matrix S (equivalent to S.tocsr())
- csr_matrix((M, N), [dtype])
- to construct an empty matrix with shape (M, N)
- dtype is optional, defaulting to dtype='d'.
- csr_matrix((data, (row_ind, col_ind)), [shape=(M, N)])
- where ``data``, ``row_ind`` and ``col_ind`` satisfy the
- relationship ``a[row_ind[k], col_ind[k]] = data[k]``.
- csr_matrix((data, indices, indptr), [shape=(M, N)])
- is the standard CSR representation where the column indices for
- row i are stored in ``indices[indptr[i]:indptr[i+1]]`` and their
- corresponding values are stored in ``data[indptr[i]:indptr[i+1]]``.
- If the shape parameter is not supplied, the matrix dimensions
- are inferred from the index arrays.
- Attributes
- ----------
- dtype : dtype
- Data type of the matrix
- shape : 2-tuple
- Shape of the matrix
- ndim : int
- Number of dimensions (this is always 2)
- nnz
- size
- data
- CSR format data array of the matrix
- indices
- CSR format index array of the matrix
- indptr
- CSR format index pointer array of the matrix
- has_sorted_indices
- has_canonical_format
- T
- Notes
- -----
- Sparse matrices can be used in arithmetic operations: they support
- addition, subtraction, multiplication, division, and matrix power.
- Advantages of the CSR format
- - efficient arithmetic operations CSR + CSR, CSR * CSR, etc.
- - efficient row slicing
- - fast matrix vector products
- Disadvantages of the CSR format
- - slow column slicing operations (consider CSC)
- - changes to the sparsity structure are expensive (consider LIL or DOK)
- Canonical Format
- - Within each row, indices are sorted by column.
- - There are no duplicate entries.
- Examples
- --------
- >>> import numpy as np
- >>> from scipy.sparse import csr_matrix
- >>> csr_matrix((3, 4), dtype=np.int8).toarray()
- array([[0, 0, 0, 0],
- [0, 0, 0, 0],
- [0, 0, 0, 0]], dtype=int8)
- >>> row = np.array([0, 0, 1, 2, 2, 2])
- >>> col = np.array([0, 2, 2, 0, 1, 2])
- >>> data = np.array([1, 2, 3, 4, 5, 6])
- >>> csr_matrix((data, (row, col)), shape=(3, 3)).toarray()
- array([[1, 0, 2],
- [0, 0, 3],
- [4, 5, 6]])
- >>> indptr = np.array([0, 2, 3, 6])
- >>> indices = np.array([0, 2, 2, 0, 1, 2])
- >>> data = np.array([1, 2, 3, 4, 5, 6])
- >>> csr_matrix((data, indices, indptr), shape=(3, 3)).toarray()
- array([[1, 0, 2],
- [0, 0, 3],
- [4, 5, 6]])
- Duplicate entries are summed together:
- >>> row = np.array([0, 1, 2, 0])
- >>> col = np.array([0, 1, 1, 0])
- >>> data = np.array([1, 2, 4, 8])
- >>> csr_matrix((data, (row, col)), shape=(3, 3)).toarray()
- array([[9, 0, 0],
- [0, 2, 0],
- [0, 4, 0]])
- As an example of how to construct a CSR matrix incrementally,
- the following snippet builds a term-document matrix from texts:
- >>> docs = [["hello", "world", "hello"], ["goodbye", "cruel", "world"]]
- >>> indptr = [0]
- >>> indices = []
- >>> data = []
- >>> vocabulary = {}
- >>> for d in docs:
- ... for term in d:
- ... index = vocabulary.setdefault(term, len(vocabulary))
- ... indices.append(index)
- ... data.append(1)
- ... indptr.append(len(indices))
- ...
- >>> csr_matrix((data, indices, indptr), dtype=int).toarray()
- array([[2, 1, 0, 0],
- [0, 1, 1, 1]])
- """
|