from cuml.common.kernel_utils import cuda_kernel_factory
import math
# CUDA source for a kernel that subsets the columns (genes) of a CSR matrix
# in place, without changing the number of stored entries.
#
# The text starts at the kernel's parameter list; the return type and kernel
# name are presumably prepended by ``cuda_kernel_factory`` -- TODO confirm.
# ``{0}`` is the placeholder for the element type of ``data`` (presumably
# filled in from the dtype tuple passed to the factory -- TODO confirm).
#
# Kernel parameters:
#   indptr  - CSR row pointers; entries of row r live in
#             [indptr[r], indptr[r+1]).
#   indices - CSR column indices, overwritten with new_idx[old_column].
#   data    - CSR values, set to 0 wherever mask[old_column] is true.
#   nrows   - number of rows; threads with row >= nrows return immediately.
#   mask    - per-column flag, true for columns whose values are zeroed.
#   new_idx - per-column lookup giving the replacement column index.
#
# Each thread processes exactly one row (row = blockDim.x * blockIdx.x +
# threadIdx.x) and walks that row's stored entries sequentially. Zeroed
# entries stay in the arrays; the kernel itself never removes them.
_inplace_subset_gene = r"""
(int *indptr, int * indices, {0} *data,
int nrows, bool * mask, int* new_idx) {
int row = blockDim.x * blockIdx.x + threadIdx.x;
if(row >= nrows)
return;
int start_idx = indptr[row];
int stop_idx = indptr[row+1];
for(int i = start_idx; i < stop_idx; i++){
int gene_indx = indices[i];
indices[i] =new_idx[gene_indx];
if(mask[gene_indx]){
data[i] =0;
}
}
}
"""
def _mul_csr(dtype):
    """Build the in-place column-subsetting kernel for one data dtype.

    ``dtype`` is passed to ``cuda_kernel_factory`` as the only entry of the
    dtype tuple, together with the ``_inplace_subset_gene`` kernel source and
    the kernel name ``"_inplace_subset_gene"``. Whatever the factory returns
    is handed back unchanged.
    """
    kernel_dtypes = (dtype,)
    kernel_name = "_inplace_subset_gene"
    return cuda_kernel_factory(_inplace_subset_gene, kernel_dtypes, kernel_name)
# Remove every gene (column) whose ``n_cells_by_counts`` is below 10 from the
# CSR matrix ``adata.X`` without going through a CSC conversion: a kernel
# remaps the column indices and zeroes the unwanted entries in place, then
# ``eliminate_zeros`` drops them.
# NOTE(review): ``adata``, ``cp`` and ``sparse`` are not defined in the visible
# part of this file -- presumably an AnnData object, ``cupy`` and
# ``cupyx.scipy.sparse``; confirm they are in scope before running.
X = adata.X.copy()  # work on a copy: the kernel rewrites indices/data in place

# True for the genes that are removed.
mask = cp.array(adata.var["n_cells_by_counts"] < 10)

# Column index each kept gene gets in the subset matrix (number of kept genes
# up to and including it, minus one). Removed genes inherit the index of the
# preceding kept gene, or -1 when no kept gene precedes them; their values are
# zeroed by the kernel, so those entries disappear in eliminate_zeros() below.
new_idx = cp.cumsum(~mask).astype(cp.int32) - 1

# One thread per row; the grid is rounded up so every row is covered.
threads_per_block = 32
mul_kernel = _mul_csr(X.dtype)
mul_kernel(
    (math.ceil(X.shape[0] / threads_per_block),),
    (threads_per_block,),
    (X.indptr, X.indices, X.data, X.shape[0], mask, new_idx),
)

# int(...) turns the 0-d array returned by mask.sum() into a plain Python int,
# so the shape tuple consists of host integers only.
n_kept_genes = X.shape[1] - int(mask.sum())
out = sparse.csr_matrix(
    (X.data, X.indices, X.indptr),
    shape=(X.shape[0], n_kept_genes),
)
# Physically removes the zeroed entries (and any explicit zeros that were
# already stored in X).
out.eliminate_zeros()
out.shape  # notebook-style display of the result shape; no effect in a script
Description
Currently, if you subset a `CSR` matrix over the minor axis, the matrix gets internally converted to `CSC` for the subsetting and is then converted back to `CSR`. This is very memory- and time-intensive, and I ran into a lot of out-of-memory errors because of it.
In my very limited testing, a function based around `eliminate_zeros` might work best. This approach would also work for a sorting subset.
Additional Information