This is my sequential code:
float foo(float* in1, float* in2, float in3, unsigned int size) {
float tmp = 0.f;
for (int i = 0; i<size; i++)
if(in2[i]>0)tmp += (in1[i]/in3 - (in2[i] /in3)*(in2[i] /in3));
return tmp;
}
This is my effort to port it to CUDA:
__global__ void kernel_foo(float* tmp, const float* in1, const float*
in2, float in3, unsigned int size) {
unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < size) {
if(in2[i]>0){
atomicAdd(tmp, in1[i]/in3 - (in2[i] /in3)*(in2[i] /in3));
}
}
}
void launch_kernel_foo(float* tmp, const float* in1, const float* in2,
float in3, unsigned int size) {
kernel_foo<<<(size+255)/256,256>>>(tmp, in1, in2, in3, size);
}
but it does't work to generate correct results. Could anyone tell me where is the mistake?