I'm trying to do a simple matrix-vector product with OpenCL, using the ViennaCL library.
Here's my main:
#include "viennacl/scalar.hpp"
#include "viennacl/vector.hpp"
#include "viennacl/matrix.hpp"
#include "viennacl/linalg/prod.hpp"
#include "viennacl/matrix_proxy.hpp"
#include "viennacl/linalg/lu.hpp"
int main()
{
viennacl::ocl::set_context_device_type(0, viennacl::ocl::gpu_tag());
std::vector<viennacl::ocl::device> devices = viennacl::ocl::current_context().devices();
viennacl::ocl::current_context().switch_device(devices[0]);
int Nx=10;
int Ny=10;
//GPU vectors
viennacl::matrix<float> vcl_A(Nx,Ny);
viennacl::vector<float> vcl_b(Ny);
viennacl::vector<float> vcl_c(Nx);
//CPU vectors
std::vector<float> stl_A(Nx*Ny);
std::vector<float> stl_b(Ny);
std::vector<float> stl_c(Nx);
//filling CPU vectors
for (unsigned int i = 0; i < Nx; ++i)
for (unsigned int j = 0; j < Ny; ++j)
stl_A[i*Ny + j] = (float) (rand()%100);
for (unsigned int i = 0; i < stl_b.size(); ++i)
stl_b[i] = (float) (rand()%100);
//copying input data to GPU
viennacl::fast_copy(&(stl_A[0]),
&(stl_A[0]) + stl_A.size(),
vcl_A);
viennacl::fast_copy(stl_b, vcl_b);
//launching product c = A*b
vcl_c = viennacl::linalg::prod(vcl_A, vcl_b);
//copying output data back to CPU
viennacl::copy(vcl_c, stl_c);
viennacl::backend::finish();
}
Afterwards, my stl_c vector has its first coefficient correctly computed, but every 9 other coefs are 0
.
When I change the dimensions to upper values, I get more than one right coef at the begining of my vector, but I still get a bunch of zeros for every other coefs.
I'm guessing some of my copies are done the wrong way, but maybe my prod operation is in cause (local/globalsize issue, but I assume ViennaCL takes care of it all)
Any Idea of what i'm doing wrong? Any help or advice will be really appreciated.
(I'm running the code on VS 2012, my GPU is an NVIDIA Geforce gtx 670)