I've implemented various algorithms using Cuda, such as matrix multiplication, Cholesky decomposition and inversion (by forward substitution) of a lower triangular matrix.
For some of these algorithms I have a for loop in the kernel that repeats part of the kernel code lots of times. It all works well for (flattened: represented by 1D arrays) matrices (of floats) up to about 200x200, with the for loop calling part of the kernel code 200 times. Increasing the matrix size to say 1000x1000 (with the for loop calling part of the kernel code 1000 times) leaves the GPU to take as much computing time as can be expected based on trials with smaller matrix sizes. But no kernel code (including parts outside the for loop) seems to have been run (the output matrix has none of its elements changed since initialization). If I increase the matrix size to around 500 I'm sometimes able to get the kernel to run if I set the limiter in the for loop to some low value (such has 3).
Have I hit some hardware limit here or is there a trick I can use to make these for loops work for large matrices?
This is an example of complete code that you can copy into a .cu file. The kernel attempts to copy the contents of matrix A (W*H) to matrix B (W*H). The output shows the first element of both matrices, for W*H < 200x200 this works just fine, for W*H = 1000x1000 no copying seems to occur because the elements of B remain zero, as if nothing happened since initialization. I'm compiling and running this code on a linux based server. For large matrices error checking gives me: "GPUassert: unspecified launch failure" at line 67 which is the cudamempcy line that copies matrix B from device to host.
#include <cuda.h>
#include <cuda_runtime.h>
#include <cuda_runtime_api.h>
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <iostream>
#include <time.h>
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, char *file, int line, bool abort=true)
{
if (code != cudaSuccess)
{
fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
}
__global__ void MatrixCopy(float *A, float *B, int W)
{
int i = blockIdx.x*blockDim.x + threadIdx.x;
int j = blockIdx.y*blockDim.y + threadIdx.y;
B[j*W + i]=A[j*W + i];
}
int main(void)
{
clock_t start1=clock();
int W=1000;
int H=1000;
float *A, *B;
float *devA, *devB;
A=(float*)malloc(W*H*sizeof(float));
B=(float*)malloc(W*H*sizeof(float));
for(int i=0; i<=W*H; i++)
{
A[i]=rand() % 3;
A[i]=A[i]+1;
B[i]=0;
}
gpuErrchk( cudaMalloc( (void**)&devA, W*H*sizeof(float) ) );
gpuErrchk( cudaMalloc( (void**)&devB, W*H*sizeof(float) ) );
gpuErrchk( cudaMemcpy( devA, A, W*H*sizeof(float), cudaMemcpyHostToDevice ) );
gpuErrchk( cudaMemcpy( devB, B, W*H*sizeof(float), cudaMemcpyHostToDevice ) );
dim3 threads(32,32);
int bloW=(int)ceil((double)W/32);
int bloH=(int)ceil((double)H/32);
dim3 blocks(bloW, bloH);
clock_t finish1=clock();
clock_t start2=clock();
MatrixCopy<<<blocks,threads>>>(devA, devB, W);
gpuErrchk( cudaPeekAtLastError() );
gpuErrchk( cudaMemcpy( B, devB, W*H*sizeof(float), cudaMemcpyDeviceToHost ) );
clock_t finish2=clock();
printf("\nGPU calculation time (ms): %d\nInitialization time (ms): %d\n\n", (int)ceil(double(((finish2-start2)*1000/(CLOCKS_PER_SEC)))), (int)ceil(double(((finish1-start1)*1000/(CLOCKS_PER_SEC)))));
printf("\n%f\n", A[0]);
printf("\n%f\n\n", B[0]);
gpuErrchk( cudaFree(devA) );
gpuErrchk( cudaFree(devB) );
free(A);
free(B);
#ifdef _WIN32
system ("PAUSE");
#endif
return 0;
}