I'm currently working on a server with 2 cuda capable GPU's: Quadro 400 and Tesla C2075. I made a simple vector addition test program. My problem is that while Tesla C2075 GPU is supposed to be more powerful than Quadro 400, it takes it more time to do the job. I found that cudaMemcpy takes up most of the execution time and it works slower on a more powerful gpu. Here's the source:
void get_matrix(float* arr1,float* arr2,int N1,int N2)
{
int Nx,Ny;
int n_blocks,n_threads;
int dev=0; // 1
float time;
size_t size;
clock_t start,end;
cudaSetDevice(dev);
cudaDeviceProp deviceProp;
start = clock();
cudaGetDeviceProperties(&deviceProp, dev);
Nx=N1;
Ny=N2;
n_threads=256;
n_blocks=(Nx*Ny+n_threads-1)/n_threads;
size=Nx*Ny*sizeof(float);
cudaMalloc((void**)&d_A,size);
cudaMalloc((void**)&d_B,size);
cudaMemcpy(d_A, arr1, size, cudaMemcpyHostToDevice);
cudaMemcpy(d_B, arr2, size, cudaMemcpyHostToDevice);
vector_add<<<n_blocks,n_threads>>>(d_A,d_B,size);
cudaMemcpy(arr1, d_A, size, cudaMemcpyDeviceToHost);
printf("Running device %s \n",deviceProp.name);
end = clock();
time=float(end-start)/float(CLOCKS_PER_SEC);
printf("time = %e\n",time);
}
int main()
{
int const nx = 20000,ny = nx;
static float a[nx*ny],b[nx*ny];
for(int i=0;i<nx;i++)
{
for(int j=0;j<ny;j++)
{
a[j+ny*i]=j+10*i;
b[j+ny*i]=-(j+10*i);
}
}
get_matrix(a,b,nx,ny);
return 0;
}
The output is:
Running device Quadro 400
time = 1.100000e-01
Running device Tesla C2075
time = 1.050000e+00
And my questions are:
- Should I modify the code depending on what GPU I am going to use?
- Is there any connection between the number of blocks, threads per block specified in the code and the number of multiprocessors, cores per multiprocessor available on a GPU?
I'm running Linux Open Suse 11.2. The source code is compiled using the nvcc compiler (version 4.2).
Thanks for your help!