Elementwise maximum of sparse Scipy matrix & vector with broadcasting

A low-level approach

As always, you can think about how a proper sparse matrix format for this operation is built up. For CSR matrices the main components are shape, data, indices and indptr. With these parts of the scipy.sparse csr object it is quite straightforward, but maybe a bit time consuming, to implement an efficient algorithm in a compiled language (C, C++, Cython, Python/Numba). In this implementation I used Numba, but porting it to C++ should be easily possible (syntax changes), perhaps also avoiding the slicing.
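For reference, this is what those CSR components look like for a tiny matrix (just an illustration, not part of the algorithm):

import numpy as np
from scipy import sparse

m = sparse.csr_matrix(np.array([[0, 2, 0, 0],
                                [3, 0, 0, 1],
                                [0, 0, 4, 0]]))
print(m.shape)    # (3, 4)
print(m.data)     # [2 3 1 4]  non-zero values, stored row by row
print(m.indices)  # [1 0 3 2]  column index of each stored value
print(m.indptr)   # [0 1 3 4]  row i owns data[indptr[i]:indptr[i+1]]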

Implementation (first try)

import numpy as np
import numba as nb
from scipy import sparse

# get all needed components of the csr object and create a resulting csr object at the end
def sparse_elementwise_maximum_wrap(mat,vec):
    mat_csr=mat.tocsr()
    vec_csr=vec.tocsr()

    shape_mat=mat_csr.shape
    indices_mat=mat_csr.indices
    indptr_mat=mat_csr.indptr
    data_mat=mat_csr.data
    indices_vec=vec_csr.indices
    data_vec=vec_csr.data

    res=sparse_elementwise_maximum_nb(indices_mat,indptr_mat,data_mat,shape_mat,indices_vec,data_vec)
    res=sparse.csr_matrix(res, shape=shape_mat)
    return res

@nb.njit(cache=True)
def sparse_elementwise_maximum_nb(indices_mat,indptr_mat,data_mat,shape_mat,vec_row_ind,vec_row_data):
    data_res=[]
    indices_res=[]
    indptr_mat_res=[]

    indptr_mat_=0
    indptr_mat_res.append(indptr_mat_)

    for row_idx in range(shape_mat[0]):
        mat_row_ind=indices_mat[indptr_mat[row_idx]:indptr_mat[row_idx+1]]
        mat_row_data=data_mat[indptr_mat[row_idx]:indptr_mat[row_idx+1]]

        mat_ptr=0
        vec_ptr=0
        while mat_ptr<mat_row_ind.shape[0] and vec_ptr<vec_row_ind.shape[0]:
            ind_mat=mat_row_ind[mat_ptr]
            ind_vec=vec_row_ind[vec_ptr]

            #value for both matrix and vector is present
            if ind_mat==ind_vec:
                data_res.append(max(mat_row_data[mat_ptr],vec_row_data[vec_ptr]))
                indices_res.append(ind_mat)
                mat_ptr+=1
                vec_ptr+=1
                indptr_mat_+=1

            #only value for the matrix is present vector is assumed 0
            elif ind_mat<ind_vec:
                if mat_row_data[mat_ptr] >0:
                    data_res.append(mat_row_data[mat_ptr])
                    indices_res.append(ind_mat)
                    indptr_mat_+=1
                mat_ptr+=1

            #only value for the vector is present matrix is assumed 0
            else:
                if vec_row_data[vec_ptr] >0:
                    data_res.append(vec_row_data[vec_ptr])
                    indices_res.append(ind_vec)
                    indptr_mat_+=1
                vec_ptr+=1

        for i in range(mat_ptr,mat_row_ind.shape[0]):
            if mat_row_data[i] >0:
                data_res.append(mat_row_data[i])
                indices_res.append(mat_row_ind[i])
                indptr_mat_+=1
        for i in range(vec_ptr,vec_row_ind.shape[0]):
            if vec_row_data[i] >0:
                data_res.append(vec_row_data[i])
                indices_res.append(vec_row_ind[i])
                indptr_mat_+=1
        indptr_mat_res.append(indptr_mat_)

    return np.array(data_res),np.array(indices_res),np.array(indptr_mat_res)
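A quick sanity check against the dense computation (a minimal sketch with random test inputs, not part of the answer's benchmark):

from scipy import sparse

mat = sparse.random(10, 8, density=0.3, format='csr')
vec = sparse.random(1, 8, density=0.3, format='csr')

res = sparse_elementwise_maximum_wrap(mat, vec)
ref = np.maximum(mat.toarray(), vec.toarray())  # dense broadcasting as reference
assert np.allclose(res.toarray(), ref)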

Implementation (optimized)

In this approach the Python lists are replaced by dynamically resized arrays. I increase the size of the output buffers in 60 MB steps (5 million elements per step in this example). On creation of the csr object no copy of the data is made, only references are stored, so if you want to avoid the memory overhead of the over-allocated buffers you have to copy the arrays at the end (see the sketch after the implementation).

@nb.njit(cache=True)
def sparse_elementwise_maximum_nb(indices_mat,indptr_mat,data_mat,shape_mat,vec_row_ind,vec_row_data):
    mem_step=5_000_000
    #preallocate memory for 5M non-zero elements (60 MB in this example)
    data_res=np.empty(mem_step,dtype=data_mat.dtype)
    indices_res=np.empty(mem_step,dtype=np.int32)
    data_res_p=0

    indptr_mat_res=np.empty((shape_mat[0]+1),dtype=np.int32)
    indptr_mat_res[0]=0
    indptr_mat_res_p=1
    indptr_mat_=0

    for row_idx in range(shape_mat[0]):
        mat_row_ind=indices_mat[indptr_mat[row_idx]:indptr_mat[row_idx+1]]
        mat_row_data=data_mat[indptr_mat[row_idx]:indptr_mat[row_idx+1]]

        #check if resizing is necessary
        if data_res.shape[0]<data_res_p+shape_mat[1]:
            #add at least memory for another mem_step elements
            size_to_add=mem_step
            if shape_mat[1] >size_to_add:
                size_to_add=shape_mat[1]

            data_res_2   =np.empty(data_res.shape[0]   +size_to_add,data_res.dtype)
            indices_res_2=np.empty(indices_res.shape[0]+size_to_add,indices_res.dtype)
            for i in range(data_res_p):
                data_res_2[i]=data_res[i]
                indices_res_2[i]=indices_res[i]
            data_res=data_res_2
            indices_res=indices_res_2

        mat_ptr=0
        vec_ptr=0
        while mat_ptr<mat_row_ind.shape[0] and vec_ptr<vec_row_ind.shape[0]:
            ind_mat=mat_row_ind[mat_ptr]
            ind_vec=vec_row_ind[vec_ptr]

            #value for both matrix and vector is present
            if ind_mat==ind_vec:
                data_res[data_res_p]=max(mat_row_data[mat_ptr],vec_row_data[vec_ptr])
                indices_res[data_res_p]=ind_mat
                data_res_p+=1
                mat_ptr+=1
                vec_ptr+=1
                indptr_mat_+=1

            #only value for the matrix is present vector is assumed 0
            elif ind_mat<ind_vec:
                if mat_row_data[mat_ptr] >0:
                    data_res[data_res_p]=mat_row_data[mat_ptr]
                    indices_res[data_res_p]=ind_mat
                    data_res_p+=1
                    indptr_mat_+=1
                mat_ptr+=1

            #only value for the vector is present matrix is assumed 0
            else:
                if vec_row_data[vec_ptr] >0:
                    data_res[data_res_p]=vec_row_data[vec_ptr]
                    indices_res[data_res_p]=ind_vec
                    data_res_p+=1
                    indptr_mat_+=1
                vec_ptr+=1

        for i in range(mat_ptr,mat_row_ind.shape[0]):
            if mat_row_data[i] >0:
                data_res[data_res_p]=mat_row_data[i]
                indices_res[data_res_p]=mat_row_ind[i]
                data_res_p+=1
                indptr_mat_+=1
        for i in range(vec_ptr,vec_row_ind.shape[0]):
            if vec_row_data[i] >0:
                data_res[data_res_p]=vec_row_data[i]
                indices_res[data_res_p]=vec_row_ind[i]
                data_res_p+=1
                indptr_mat_+=1
        indptr_mat_res[indptr_mat_res_p]=indptr_mat_
        indptr_mat_res_p+=1

    return data_res[:data_res_p],indices_res[:data_res_p],indptr_mat_res
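As mentioned above, the returned slices still reference the over-allocated buffers. If that overhead matters, one option is to copy the arrays before building the csr object (a minimal sketch; the wrapper name is just for illustration):

def sparse_elementwise_maximum_wrap_copy(mat, vec):
    # same as the wrapper above, but copies the result arrays so the
    # over-allocated buffers can be freed once the csr object is built
    mat_csr = mat.tocsr()
    vec_csr = vec.tocsr()
    data, indices, indptr = sparse_elementwise_maximum_nb(
        mat_csr.indices, mat_csr.indptr, mat_csr.data, mat_csr.shape,
        vec_csr.indices, vec_csr.data)
    return sparse.csr_matrix((np.copy(data), np.copy(indices), indptr), shape=mat_csr.shape)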

Maximum memory allocated at the beginning

The performance and usability of this approach depend heavily on the inputs. Here the maximum possible amount of memory is allocated up front, which can easily cause out-of-memory errors.
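As a rough illustration of the bound used below, shape_mat[0]*nnz(vec) + nnz(mat), with the sizes from the timing example further down:

rows, nnz_mat, nnz_vec = 20_000, 800_000, 40   # 20000x4000 matrix and 1x4000 vector, both density 0.01
max_non_zero = rows*nnz_vec + nnz_mat          # 1_600_000 entries
print(max_non_zero*(8 + 4)/1e6, "MB")          # float64 data + int32 indices -> about 19 MB here

For a denser vector the rows*nnz_vec term dominates and the bound grows quickly.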

@nb.njit(cache=True)
def sparse_elementwise_maximum_nb(indices_mat,indptr_mat,data_mat,shape_mat,vec_row_ind,vec_row_data,shrink_to_fit):
    max_non_zero=shape_mat[0]*vec_row_data.shape[0]+data_mat.shape[0]
    data_res=np.empty(max_non_zero,dtype=data_mat.dtype)
    indices_res=np.empty(max_non_zero,dtype=np.int32)
    data_res_p=0

    indptr_mat_res=np.empty((shape_mat[0]+1),dtype=np.int32)
    indptr_mat_res[0]=0
    indptr_mat_res_p=1
    indptr_mat_=0

    for row_idx in range(shape_mat[0]):
        mat_row_ind=indices_mat[indptr_mat[row_idx]:indptr_mat[row_idx+1]]
        mat_row_data=data_mat[indptr_mat[row_idx]:indptr_mat[row_idx+1]]

        mat_ptr=0
        vec_ptr=0
        while mat_ptr<mat_row_ind.shape[0] and vec_ptr<vec_row_ind.shape[0]:
            ind_mat=mat_row_ind[mat_ptr]
            ind_vec=vec_row_ind[vec_ptr]

            #value for both matrix and vector is present
            if ind_mat==ind_vec:
                data_res[data_res_p]=max(mat_row_data[mat_ptr],vec_row_data[vec_ptr])
                indices_res[data_res_p]=ind_mat
                data_res_p+=1
                mat_ptr+=1
                vec_ptr+=1
                indptr_mat_+=1

            #only value for the matrix is present vector is assumed 0
            elif ind_mat<ind_vec:
                if mat_row_data[mat_ptr] >0:
                    data_res[data_res_p]=mat_row_data[mat_ptr]
                    indices_res[data_res_p]=ind_mat
                    data_res_p+=1
                    indptr_mat_+=1
                mat_ptr+=1

            #only value for the vector is present matrix is assumed 0
            else:
                if vec_row_data[vec_ptr] >0:
                    data_res[data_res_p]=vec_row_data[vec_ptr]
                    indices_res[data_res_p]=ind_vec
                    data_res_p+=1
                    indptr_mat_+=1
                vec_ptr+=1

        for i in range(mat_ptr,mat_row_ind.shape[0]):
            if mat_row_data[i] >0:
                data_res[data_res_p]=mat_row_data[i]
                indices_res[data_res_p]=mat_row_ind[i]
                data_res_p+=1
                indptr_mat_+=1
        for i in range(vec_ptr,vec_row_ind.shape[0]):
            if vec_row_data[i] >0:
                data_res[data_res_p]=vec_row_data[i]
                indices_res[data_res_p]=vec_row_ind[i]
                data_res_p+=1
                indptr_mat_+=1
        indptr_mat_res[indptr_mat_res_p]=indptr_mat_
        indptr_mat_res_p+=1

    if shrink_to_fit:
        data_res=np.copy(data_res[:data_res_p])
        indices_res=np.copy(indices_res[:data_res_p])
    else:
        data_res=data_res[:data_res_p]
        indices_res=indices_res[:data_res_p]

    return data_res,indices_res,indptr_mat_res

# get all needed components of the csr object and create a resulting csr object at the end
def sparse_elementwise_maximum_wrap(mat,vec,shrink_to_fit=True):
    mat_csr=mat.tocsr()
    vec_csr=vec.tocsr()

    shape_mat=mat_csr.shape
    indices_mat=mat_csr.indices
    indptr_mat=mat_csr.indptr
    data_mat=mat_csr.data
    indices_vec=vec_csr.indices
    data_vec=vec_csr.data

    res=sparse_elementwise_maximum_nb(indices_mat,indptr_mat,data_mat,shape_mat,indices_vec,data_vec,shrink_to_fit)
    res=sparse.csr_matrix(res, shape=shape_mat)
    return res

Timings

Numba has a compilation overhead on the first call (or some overhead to load the function from the cache). Don't include the first call if you want to measure the runtime and not compilation plus runtime.
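A minimal warm-up pattern (assuming mat, vec and sparse_elementwise_maximum_wrap as defined in this answer):

#call once so Numba compiles (or loads the cached function) outside the measurement
sparse_elementwise_maximum_wrap(mat, vec)
%timeit res=sparse_elementwise_maximum_wrap(mat, vec)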

import numpy as np
from scipy import sparse

mat = sparse.csr_matrix(  sparse.random(20000, 4000, density=.01, data_rvs=lambda s: np.random.randint(0, 5000, size=s))  )
vec = sparse.csr_matrix(  sparse.random(1, 4000, density=.01, data_rvs=lambda s: np.random.randint(0, 5000, size=s))  )

#original implementation (from the question)
%timeit output=sparse_elementwise_maximum(mat, vec)
#for csc input
37.9 s ± 224 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
#for csr input
10.7 s ± 90.8 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)

#Daniel F's sparse_maximum (vstack approach, see the answer below)
%timeit sparse_maximum(mat, vec)
164 ms ± 1.74 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)

#low level implementation (first try)
%timeit res=sparse_elementwise_maximum_wrap(mat,vec)
89.7 ms ± 2.51 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)

#low level implementation (optimized, csr)
%timeit res=sparse_elementwise_maximum_wrap(mat,vec)
16.5 ms ± 122 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)

#low level implementation (preallocation, without copying at the end)
%timeit res=sparse_elementwise_maximum_wrap(mat,vec,shrink_to_fit=False)
14.9 ms ± 110 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)

#low level implementation (preallocation, with copying at the end)
%timeit res=sparse_elementwise_maximum_wrap(mat,vec,shrink_to_fit=True)
21.7 ms ± 399 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)

#For comparison, copying the result takes
%%timeit
np.copy(res.data)
np.copy(res.indices)
np.copy(res.indptr)
7.8 ms ± 47.8 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)

scipy.sparse matrices don't broadcast. At all. So unless you can figure out some way to operate on the indices and indptr directly (I haven't), you're stuck stacking. The best I can figure out is to just vstack your vec until it's the same shape as mat. It seems to give a good speedup, although it doesn't explain the segfault weirdness with csr.

#using `mat` and `vec` from the speed test
def sparse_maximum(mat, vec):
    vec1 = sparse.vstack([vec for _ in range(mat.shape[0])])
    return mat.maximum(vec1)

# Time it
import timeit
num_timing_loops = 3
starttime = timeit.default_timer()
for _ in range(num_timing_loops):
    sparse_maximum(mat, vec)
print('time per call is:', (timeit.default_timer() - starttime)/num_timing_loops, 'seconds')
# I was getting 11-12 seconds on your original code
time per call is: 0.514533479333295 seconds

Proof that it works on original matrices:

vec = sparse.vstack([vec for _ in range(4)])

print(mat.maximum(vec).todense())
[[  0   5 100]
 [  3   5 100]
 [  6   7 100]
 [  9  10 100]]