I am trying to learn the usage of shared memory to increase the performance. Here I am trying to copy data from global memory to shared memory. But when I have a single block (256 threads) it gives the right result while with more than one block it gives a random result.
#include <cuda.h>
#include <stdio.h>
__global__ void staticReverse(int *d, int n)
{
__shared__ int s[400];
int t = blockIdx.x * blockDim.x + threadIdx.x;
d[t] = d[t]*d[t];
s[t] =d[t];
__syncthreads();
d[t] = s[t];
}
__global__ void dynamicReverse(int *d, int n)
{
extern __shared__ int s[];
int t = threadIdx.x;
s[t] = d[t]*d[t];
__syncthreads();
d[t] = s[t];
}
int main(void)
{
const int n = 400;
int a[n], d[n];
for (int i = 0; i < n; i++)
{
a[i] = i;
}
int *d_d;
cudaMalloc(&d_d, n * sizeof(int));
// run version with static shared memory
int block_size = 256;
int n_blocks = n/block_size + (n%block_size == 0 ? 0:1);
cudaMemcpy(d_d, a, n*sizeof(int), cudaMemcpyHostToDevice);
staticReverse<<<n_blocks,block_size>>>(d_d, n);
cudaMemcpy(d, d_d, n*sizeof(int), cudaMemcpyDeviceToHost);
for (int i = 0; i < n; i++)
{
printf("%d\n",d[i]);
}
}
What does the third argument in the
dynamicReverse<<<n_blocks,block_size,n*sizeof(int)>>>(d_d, n);
kernel launch do? Does it allocat shared memory for the entire block or per thread?
If I require more than 64kb of shared memory per multiprocessor with compute capability 5.0, what do I need to do?