For data simulation purposes I'm looking for an efficient way to do a weighted sum of sparse matrices.
Basically I have a data cube of Nx x Ny x Nz double values, where Nx and Ny are on the order of 4000 and Nz is several million. All Nx x Ny sub-matrices are very sparse (a block of data with on the order of 40 entries).
I now want to reduce the data cube in the Z direction by adding up all the matrices, weighting each one. The process is illustrated in the figure. For my simulations all matrices stay fixed and only the weights change, generating different Nx x Ny datasets.
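To make the operation precise, this is the reduction I mean, written out for a hypothetical dense cube (the names reduceDense, cube, weights and image are placeholders; the real data never exists in dense form):

#include <cstddef>
#include <vector>

// Conceptual definition only: image(x, y) = sum over z of weights[z] * cube(x, y, z).
// A dense cube of this size would be far too large to store; this just pins down the math.
std::vector<std::vector<double> > reduceDense(
    const std::vector<std::vector<std::vector<double> > > &cube,   // [Nx][Ny][Nz]
    const std::vector<double> &weights)                            // [Nz]
{
    std::vector<std::vector<double> > image(cube.size(),
        std::vector<double>(cube[0].size(), 0.));
    for (std::size_t x = 0; x < cube.size(); ++x)
        for (std::size_t y = 0; y < cube[x].size(); ++y)
            for (std::size_t z = 0; z < weights.size(); ++z)
                image[x][y] += weights[z] * cube[x][y][z];
    return image;
}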
This is what I tried: a naive implementation of a sparse matrix in C++ and a simple sum.
sparsearray3d.h
#ifndef SPARSEARRAY3D_H
#define SPARSEARRAY3D_H

#include <vector>

struct data
{
    unsigned short int x;
    unsigned short int y;
    int z;
    double value;
};

class sparsearray3d
{
public:
    sparsearray3d();
    void createRandomData(int Nx, int Ny, int Nz, int bwidthX, int bwidthY);
    void sumData();

    int Nx, Ny, Nz;
    std::vector<data> dd;
    std::vector<std::vector<double> > image;
    std::vector<double> weights;
};

#endif // SPARSEARRAY3D_H
sparsearray3d.cpp
#include "sparsearray3d.h"
#include <stdlib.h> /* srand, rand */
#include <stdio.h> /* printf, scanf, puts, NULL */
sparsearray3d::sparsearray3d()
{
this->Nx = 0;
this->Ny = 0;
this->Nz = 0;
}
void sparsearray3d::createRandomData(int Nx, int Ny, int Nz, int bwidthX = 5, int bwidthY = 5)
{
// create random data
this->weights.resize(Nz);
this->image.resize( Nx , std::vector<double>( Ny , 0. ) );
this->Nx = Nx;
this->Ny = Ny;
this->Nz = Nz;
for(int i=0; i<Nz; ++i)
{
int x0 = rand() % (Nx-bwidthX);
int y0 = rand() % (Ny-bwidthY);
this->weights.push_back((double) rand() / (RAND_MAX));
for(int j=0; j<bwidthX; ++j)
{
for(int k=0; k<bwidthY; ++k)
{
this->dd.push_back({x0+j,y0+k,i,((double) rand() / (RAND_MAX))});
}
}
}
printf("Vector size: %4.2f GB \n", this->dd.size()*sizeof(data) * 1E-9);
}
void sparsearray3d::sumData()
{
    #pragma omp parallel for
    for (std::vector<data>::iterator it = this->dd.begin(); it < this->dd.end(); ++it)
    {
        this->image[it->y][it->x] += it->value * this->weights[it->z];
    }
}
main.cpp
#include <iostream>
#include "sparsearray3d.h"
#include <sys/time.h>

using namespace std;

int main()
{
    struct timeval start, end;
    sparsearray3d sa;

    gettimeofday(&start, NULL);
    sa.createRandomData(4096, 4096, 2000000, 4, 16);
    gettimeofday(&end, NULL);
    double delta = ((end.tv_sec - start.tv_sec) * 1000000u +
                    end.tv_usec - start.tv_usec) / 1.e6;
    cout << "random array generation: " << delta << endl;

    gettimeofday(&start, NULL);
    sa.sumData();
    gettimeofday(&end, NULL);
    delta = ((end.tv_sec - start.tv_sec) * 1000000u +
             end.tv_usec - start.tv_usec) / 1.e6;
    cout << "array addition: " << delta << endl;

    return 0;
}
This already does a good job; the example above runs here in ~0.6 s. The first thing I'm wondering about is why the #pragma omp parallel for gives a speed-up of only about a factor of 2, even though 4 CPUs are used.
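One thing I'm unsure about is the concurrent writes: in sumData() different threads can update the same image element at once. Would a variant with per-thread copies of the image, merged at the end, be expected to scale better, or is the loop simply memory-bound? A rough sketch of what I mean (sumDataThreadLocal is not part of my code above, just an untested illustration):

#include <omp.h>
#include "sparsearray3d.h"

// Untested sketch: one private image per thread, merged after the parallel loop,
// so no two threads write to the same element while accumulating.
// Each per-thread copy costs Nx * Ny * 8 bytes of extra memory.
void sumDataThreadLocal(sparsearray3d &sa)
{
    int nthreads = omp_get_max_threads();
    std::vector<std::vector<std::vector<double> > > local(
        nthreads, std::vector<std::vector<double> >(sa.Nx, std::vector<double>(sa.Ny, 0.)));

    #pragma omp parallel
    {
        int tid = omp_get_thread_num();
        #pragma omp for
        for (long long n = 0; n < (long long) sa.dd.size(); ++n)
        {
            const data &d = sa.dd[n];
            local[tid][d.y][d.x] += d.value * sa.weights[d.z];
        }
    }

    // merge the per-thread images into the shared one
    for (int t = 0; t < nthreads; ++t)
        for (int i = 0; i < sa.Nx; ++i)
            for (int j = 0; j < sa.Ny; ++j)
                sa.image[i][j] += local[t][i][j];
}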
The problem seems pretty well suited to massive parallelization. Could CUDA / OpenCL help here? However, I read somewhere that matrix addition is not very efficient with CUDA / OpenCL. (I don't have an NVIDIA card available, though.) Alternatively, I read a little about graphs and their relation to matrices. Could this problem be treated with some graph algorithms?
EDIT: I tried to give Eigen a shot; however, I failed to create a large number of matrices. The following code requires much more memory than my code does (and fails for N ~ 20,000,000, as I run out of memory). Not sure I'm doing it right, but that's how I understood it from the Eigen documentation.
#include <vector>
#include <eigen3/Eigen/Sparse>

int main()
{
    int N = 100000;
    std::vector<Eigen::SparseMatrix<double> > data;
    data.resize(N);
    for (int i = 0; i < N; ++i)
    {
        data[i].resize(4096, 4096);
        data[i].reserve(4 * 16);
    }
    return 0;
}
Also, summing up the sparse matrices in the following way was much slower than my code:
Eigen::SparseMatrix<double> sum(4096, 4096);
sum.reserve(4096 * 4096);
for (int i = 0; i < N; ++i)
    sum += data[i];
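A reformulation I'm also wondering about, but have not tested: keep the whole stack in a single (Nx*Ny) x Nz Eigen::SparseMatrix built from triplets, so that the weighted Z-reduction becomes a single sparse matrix-vector product with the weight vector. The sketch below uses made-up sizes and leaves the triplet filling out; would something along these lines be expected to beat the hand-written loop?

#include <vector>
#include <eigen3/Eigen/Sparse>

int main()
{
    const int Nx = 4096, Ny = 4096, Nz = 1000;   // Nz kept small for the sketch

    // Column z of 'cube' holds the flattened Nx x Ny matrix number z:
    // one Triplet(x + Nx * y, z, value) per non-zero entry.
    std::vector<Eigen::Triplet<double> > triplets;
    // ... fill 'triplets' from the actual data ...

    Eigen::SparseMatrix<double> cube(Nx * Ny, Nz);
    cube.setFromTriplets(triplets.begin(), triplets.end());

    Eigen::VectorXd weights = Eigen::VectorXd::Random(Nz);

    // The whole Z-reduction collapses into one sparse matrix-vector product;
    // 'image' is the flattened Nx x Ny result.
    Eigen::VectorXd image = cube * weights;

    return 0;
}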