I recently bought a GTX 550 Ti Boost card. Programs that used to work on my old GF 440 card now fail. Here is an example: the following program works fine with smaller launch configurations, but goes wrong with larger ones.
#include "stdio.h"
__global__ void kernel(float * d_in, float * d_out){
int x = threadIdx.x + blockIdx.x * blockDim.x;
int y = threadIdx.y + blockIdx.y * blockDim.y;
int idx = x + y * blockDim.x * gridDim.x;
d_out[idx] = d_in[idx];
}
int main(){
const dim3 gridSize(10,10);
const dim3 blockSize(80,80);
const int size = 800*800;
float * h_in = new float[size];
float * h_out = new float[size];
float * d_in;
float * d_out;
cudaMalloc((void**)&d_in, sizeof(float)*size);
cudaMalloc((void**)&d_out, sizeof(float)*size);
for(int i = 0; i < size; i++)
h_in[i] = (float)i;
cudaMemcpy(d_in, h_in, sizeof(float)*size, cudaMemcpyHostToDevice);
kernel<<<gridSize,blockSize>>>(d_in, d_out);
cudaMemcpy(h_out, d_out, sizeof(float)*size, cudaMemcpyDeviceToHost);
for(int i = 0; i < size; i++)
printf("%f\n",h_out[i]);
cudaFree(d_in);
cudaFree(d_out);
return 0;
}
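In case it helps with diagnosing this, here is a minimal error-checking sketch I could drop in right after the kernel launch (it is not part of the program above; checkCuda is just a helper name I made up around the standard cudaGetLastError / cudaDeviceSynchronize calls):

#include <cstdio>
#include <cuda_runtime.h>

// Hypothetical helper: print the error string if a CUDA call failed.
static cudaError_t checkCuda(const char * where, cudaError_t err){
    if(err != cudaSuccess)
        std::printf("%s: %s\n", where, cudaGetErrorString(err));
    return err;
}

// Intended usage right after the launch:
//   kernel<<<gridSize,blockSize>>>(d_in, d_out);
//   checkCuda("launch", cudaGetLastError());       // reports launch/configuration errors
//   checkCuda("sync", cudaDeviceSynchronize());    // reports errors from the kernel execution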
I expected it to print each index as a float, but instead it outputs what look like random floats:
0.131061
2.520029
9.304665
0.000189
0.242134
0.525557
0.560013
size 100*100
However, when I switch to size 100*100:
const dim3 gridSize(10,10);
const dim3 blockSize(10,10);
const int size = 100*100;
it works fine (last 5 outputs):
9995.000000
9996.000000
9997.000000
9998.000000
9999.000000
size 500*500
But for the larger size 500*500:
const dim3 gridSize(10,10);
const dim3 blockSize(50,50);
const int size = 500*500;
it outputs the wrong indices (last 5 outputs):
512139.000000
512140.000000
512141.000000
512142.000000
512143.000000
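In case it matters, here is a small standalone sketch I could compile separately to print my device's block limits (via cudaGetDeviceProperties) next to the thread counts of the three block shapes I tried; the field names come from the CUDA runtime API, the rest is my own scaffolding:

#include <cstdio>
#include <cuda_runtime.h>

int main(){
    cudaDeviceProp prop;
    cudaGetDeviceProperties(&prop, 0);  // query device 0

    std::printf("device: %s\n", prop.name);
    std::printf("maxThreadsPerBlock: %d\n", prop.maxThreadsPerBlock);
    std::printf("maxThreadsDim: %d x %d x %d\n",
                prop.maxThreadsDim[0], prop.maxThreadsDim[1], prop.maxThreadsDim[2]);

    // The three block shapes used above and their total threads per block.
    const int shapes[3][2] = { {10,10}, {50,50}, {80,80} };
    for(int i = 0; i < 3; i++)
        std::printf("blockSize(%d,%d) -> %d threads per block\n",
                    shapes[i][0], shapes[i][1], shapes[i][0]*shapes[i][1]);
    return 0;
}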
I installed CUDA 5.5. Thanks!