all:
I am learning how shared memory accelerates the GPU programming process. I am using the codes below to calculate the squared value of each element plus the squared value of the average of its left and right neighbors. The code runs, however, the result is not as expected.
The first 10 result printed out is 0,1,2,3,4,5,6,7,8,9, while I am expecting the result as 25,2,8, 18,32,50,72,98,128,162;
The code is as follows, with the reference to here;
Would you please tell me which part goes wrong? Your help is very much appreciated.
#include <stdio.h>
#include <stdlib.h>
#include <iostream>
#include <cuda.h>
const int N=1024;
__global__ void compute_it(float *data)
{
int tid = threadIdx.x;
__shared__ float myblock[N];
float tmp;
// load the thread's data element into shared memory
myblock[tid] = data[tid];
// ensure that all threads have loaded their values into
// shared memory; otherwise, one thread might be computing
// on unitialized data.
__syncthreads();
// compute the average of this thread's left and right neighbors
tmp = (myblock[tid>0?tid-1:(N-1)] + myblock[tid<(N-1)?tid+1:0]) * 0.5f;
// square the previousr result and add my value, squared
tmp = tmp*tmp + myblock[tid]*myblock[tid];
// write the result back to global memory
data[tid] = myblock[tid];
__syncthreads();
}
int main (){
char key;
float *a;
float *dev_a;
a = (float*)malloc(N*sizeof(float));
cudaMalloc((void**)&dev_a,N*sizeof(float));
for (int i=0; i<N; i++){
a [i] = i;
}
cudaMemcpy(dev_a, a, N*sizeof(float), cudaMemcpyHostToDevice);
compute_it<<<N,1>>>(dev_a);
cudaMemcpy(a, dev_a, N*sizeof(float), cudaMemcpyDeviceToHost);
for (int i=0; i<10; i++){
std::cout<<a [i]<<",";
}
std::cin>>key;
free (a);
free (dev_a);