I'm having an issue with a matrix addition kernel: it times out on matrices larger than 255x255. For reference, I'm compiling with -arch=sm_21 and running on a MacBook Pro, and I've already tried different block and thread dimensions.
The matrix struct:
typedef struct {
    int n, m; /* Define an n-rows by m-columns matrix */
    double *data;
} c_matrix;
The code that initializes a matrix, and the wrapper that performs the addition:
extern "C"
c_matrix *new_c_matrix(int i, int j) {
    c_matrix *m = (c_matrix *)malloc(sizeof(*m));
    if(m == NULL)
        return NULL;
    m->data = (double *)malloc(sizeof(double) * i * j);
    if(m->data == NULL) {
        free(m);
        return NULL;
    }
    m->n = i;
    m->m = j;
    return m;
}
extern "C"
void c_matrix_add(const c_matrix *m1, const c_matrix *m2, c_matrix *m) {
    /* Only 4 comparisons are needed: equality is transitive, so checking
     * m1 against m2 and m1 against m also covers m2 against m. */
    if(m1->m != m2->m || m1->n != m2->n || m1->m != m->m
            || m1->n != m->n)
        exit(EXIT_FAILURE);

    /* Allocate the device buffers and copy the operands over. */
    double *d_a, *d_b, *d_c;
    handle_error( cudaMalloc(&d_a, m1->m * m1->n * sizeof(double)) );
    handle_error( cudaMalloc(&d_b, m1->m * m1->n * sizeof(double)) );
    handle_error( cudaMalloc(&d_c, m1->m * m1->n * sizeof(double)) );
    handle_error( cudaMemcpy(d_a, m1->data, m1->m * m1->n * sizeof(double), cudaMemcpyHostToDevice) );
    handle_error( cudaMemcpy(d_b, m2->data, m2->m * m2->n * sizeof(double), cudaMemcpyHostToDevice) );

    /* 16x16 thread blocks, with enough blocks to cover every element. */
    dim3 dimBlock(16, 16);
    dim3 dimGrid((m1->m + dimBlock.x - 1) / dimBlock.x, (m1->n + dimBlock.y - 1) / dimBlock.y);
    cu_matrix_add<<< dimGrid, dimBlock >>>(d_a, d_b, d_c, m1->m * m1->n);

    /* Copy the result back; this is the call that times out. */
    handle_error( cudaMemcpy(m->data, d_c, m->m * m->n * sizeof(double), cudaMemcpyDeviceToHost) );
    // cudaFree( d_c );
    cudaFree( d_b );
    cudaFree( d_a );
}
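handle_error is just my usual thin wrapper around the CUDA runtime's error codes; a minimal equivalent of what I'm using (the exact message formatting differs) is:

#include <stdio.h>
#include <stdlib.h>

static void handle_error(cudaError_t err) {
    if(err != cudaSuccess) {
        /* Print the runtime's description of the error and bail out. */
        fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }
}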
And the kernel itself:
__global__ void cu_matrix_add(const double *d_a, const double *d_b, double *d_c, int element_count) {
    unsigned short tid = blockIdx.x * blockDim.x + threadIdx.x;
    /* Grid-stride loop: each thread steps through the array in
     * increments of the total number of threads in x. */
    while( tid < element_count ) {
        d_c[tid] = d_a[tid] + d_b[tid];
        tid += blockDim.x * gridDim.x;
    }
}
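For concreteness: with a 256x256 input and the 16x16 blocks above, dimGrid works out to ((256 + 15)/16, (256 + 15)/16) = (16, 16), so the kernel is launched as 256 blocks of 256 threads with element_count = 65536.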
It times out on the final handle_error( cudaMemcpy(m->data, d_c, m->m * m->n * sizeof(double), cudaMemcpyDeviceToHost) ) call, i.e. when I attempt to copy the result matrix back from the device to the host.
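For reference, the host code that triggers this sits in the same .cu file and is essentially the following (simplified from my actual test program; the fill values are arbitrary). A 256x256 input already times out, while 255x255 completes fine:

int main(void) {
    c_matrix *a = new_c_matrix(256, 256);
    c_matrix *b = new_c_matrix(256, 256);
    c_matrix *c = new_c_matrix(256, 256);
    if(a == NULL || b == NULL || c == NULL)
        return EXIT_FAILURE;

    /* Fill the operands with arbitrary values. */
    for(int i = 0; i < 256 * 256; i++) {
        a->data[i] = 1.0;
        b->data[i] = 2.0;
    }

    c_matrix_add(a, b, c);  /* this is where the timeout happens */

    printf("c->data[0] = %f\n", c->data[0]);

    free(a->data); free(a);
    free(b->data); free(b);
    free(c->data); free(c);
    return 0;
}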