I'm using this code for reduction:
http://www.math.nsysu.edu.tw/~lam/MPI/code/cuda/reduction.cu
that is based on Mark Harris talk as in here
http://www.math.nsysu.edu.tw/~lam/MPI/lecture/reduction.pdf
But for
#define blocksize 1024
#define gridsize 1024*8
#define size blocksize*gridsize
Kernel reduce6 works and reduce7 fails. Is it because reduce7 depends on the amount of shared memory, so that the "size" defined above exceeds what it can handle?
Code snippet is here:
// Launch geometry: threads per block, blocks per grid, and total element count.
#define THR_PER_BLC 1024
#define BLC_PER_GRD 16
// BUG FIX: macro body must be parenthesized.  Unparenthesized, an expression
// such as `x / GRID_SIZE` expands to `x / THR_PER_BLC * BLC_PER_GRD`, which
// groups as `(x / THR_PER_BLC) * BLC_PER_GRD` — the wrong value.
#define GRID_SIZE (THR_PER_BLC * BLC_PER_GRD)
// Grid-stride tree reduction ("reduce7" from Mark Harris's reduction talk).
//
// Expected launch: reduce7<T><<<blocks, T>>>(g_idata, g_odata, n) where
// T == blockDim.x is a power of two (1..1024).  Each block writes its partial
// sum to g_odata[blockIdx.x]; a second kernel pass or a host-side sum over
// gridDim.x values finishes the reduction.
//
// Shared memory: THR_PER_BLC ints, statically allocated.  The array is
// deliberately oversized relative to nThreads so the unrolled warp-level tail
// never indexes past the end, even for small block sizes.
//
// NOTE(review): the `volatile`-based warp-synchronous tail assumes implicit
// warp lockstep.  That holds on this target (Pascal, SM 6.1) but is unsafe on
// Volta+ (independent thread scheduling), where it should be rewritten with
// __syncwarp() or __shfl_down_sync().
template<unsigned int nThreads>
__global__ void reduce7(int *g_idata, int *g_odata, unsigned int n) {
    __shared__ volatile int sdata[THR_PER_BLC];
    unsigned int tid = threadIdx.x;
    unsigned int i = blockIdx.x * (nThreads * 2) + threadIdx.x;
    // Elements consumed by the whole grid per loop iteration (each thread
    // reads two elements per pass).
    unsigned int gridSize = nThreads * 2 * gridDim.x;

    sdata[tid] = 0;
    // Grid-stride accumulation.
    // BUG FIX: the original read g_idata[i + nThreads] unconditionally, which
    // runs past the end of the input whenever n is not a multiple of
    // 2 * nThreads * gridDim.x.  That out-of-bounds read is the classic way
    // this kernel "fails" for some size/grid combinations.  Guard the second
    // load so any n is handled correctly.
    while (i < n) {
        int v = g_idata[i];
        if (i + nThreads < n) v += g_idata[i + nThreads];
        sdata[tid] += v;
        i += gridSize;
    }
    __syncthreads();

    // Tree reduction in shared memory.  The nThreads comparisons are against
    // a template parameter, so dead branches are removed at compile time.
    // BUG FIX: the original had no nThreads >= 1024 step, so a 1024-thread
    // instantiation would silently ignore the upper half of sdata.
    if (nThreads >= 1024) {
        if (tid < 512) { sdata[tid] += sdata[tid + 512]; }
        __syncthreads();
    }
    if (nThreads >= 512) {
        if (tid < 256) { sdata[tid] += sdata[tid + 256]; }
        __syncthreads();
    }
    if (nThreads >= 256) {
        if (tid < 128) { sdata[tid] += sdata[tid + 128]; }
        __syncthreads();
    }
    if (nThreads >= 128) {
        if (tid < 64) { sdata[tid] += sdata[tid + 64]; }
        __syncthreads();
    }
    // Final warp: no __syncthreads() needed pre-Volta because all remaining
    // active threads are in one warp and sdata is volatile.
    if (tid < 32) {
        if (nThreads >= 64) sdata[tid] += sdata[tid + 32];
        if (nThreads >= 32) sdata[tid] += sdata[tid + 16];
        if (nThreads >= 16) sdata[tid] += sdata[tid + 8];
        if (nThreads >= 8)  sdata[tid] += sdata[tid + 4];
        if (nThreads >= 4)  sdata[tid] += sdata[tid + 2];
        if (nThreads >= 2)  sdata[tid] += sdata[tid + 1];
        // Publish this block's partial sum.
        if (tid == 0) g_odata[blockIdx.x] = sdata[0];
    }
}
And this kernel is called like this from main:
// The block size must be known at compile time (it is a template parameter),
// hence the switch dispatching to the matching instantiation.
threads = THR_PER_BLC / 2;
int gsize = BLC_PER_GRD / 8;
switch (threads) {
case 512:
    reduce7<512> << < gsize, threads >> > (g_idata, g_odata, GRID_SIZE);
    break;
case 256:
    reduce7<256> << < gsize, threads >> > (g_idata, g_odata, GRID_SIZE);
    break;
case 128:
    reduce7<128> << < gsize, threads >> > (g_idata, g_odata, GRID_SIZE);
    break;
case 64:
    reduce7<64> << < gsize, threads >> > (g_idata, g_odata, GRID_SIZE);
    break;
case 32:
    reduce7<32> << < gsize, threads >> > (g_idata, g_odata, GRID_SIZE);
    break;
case 16:
    reduce7<16> << < gsize, threads >> > (g_idata, g_odata, GRID_SIZE);
    break;
case 8:
    reduce7<8> << < gsize, threads >> > (g_idata, g_odata, GRID_SIZE);
    break;
case 4:
    reduce7<4> << < gsize, threads >> > (g_idata, g_odata, GRID_SIZE);
    break;
case 2:
    reduce7<2> << < gsize, threads >> > (g_idata, g_odata, GRID_SIZE);
    break;
case 1:
    reduce7<1> << < gsize, threads >> > (g_idata, g_odata, GRID_SIZE);
    break;
default:
    // Non-power-of-two block size: no instantiation exists, nothing launched.
    fprintf(stderr, "reduce7: unsupported block size %d\n", threads);
    break;
}
// BUG FIX: kernel failures are asynchronous and *silent* unless checked —
// which is exactly why reduce7 appeared to "work" for some sizes and "fail"
// for others with no diagnostic.  Check the launch status, then synchronize
// and check for in-kernel faults (e.g. out-of-bounds global reads).
// Also replaces the deprecated cudaThreadSynchronize() with its successor.
cudaError_t err = cudaGetLastError();      // launch-configuration errors
if (err == cudaSuccess)
    err = cudaDeviceSynchronize();         // asynchronous execution errors
if (err != cudaSuccess)
    fprintf(stderr, "reduce7 failed: %s\n", cudaGetErrorString(err));
Does this basically mean that reduce7 can't be called with too big a GRID_SIZE?
These are my tests:
#################################################################
6 Unroll the complete loop
Kernal elapsed time = 0.030(ms)
Elapsed time = 0.057(ms)
Sum = 8192, with BLC_PER_GRD 16 THR_PER_BLC 512
#################################################################
7 Final
Kernal elapsed time = 0.015(ms), band =
Elapsed time = 0.040(ms)
Sum = 8192, with BLC_PER_GRD 16 THR_PER_BLC 512
#################################################################
#################################################################
6 Unroll the complete loop
Kernal elapsed time = 0.031(ms)
Elapsed time = 0.057(ms)
Sum = 8192, with BLC_PER_GRD 8 THR_PER_BLC 1024
#################################################################
7 Final
Kernal elapsed time = 0.015(ms), band =
Elapsed time = 0.040(ms)
Sum = 8192, with BLC_PER_GRD 8 THR_PER_BLC 1024
#################################################################
#################################################################
6 Unroll the complete loop
Kernal elapsed time = 0.569(ms)
Elapsed time = 12.889(ms)
Sum = 8388608, with BLC_PER_GRD 8192 THR_PER_BLC 1024
#################################################################
And my gpu:
a@M:/usr/local/cuda/samples/bin/x86_64/linux/release$ ./dev*Drv
./deviceQueryDrv Starting...
CUDA Device Query (Driver API) statically linked version
Detected 1 CUDA Capable device(s)
Device 0: "GeForce GTX 1060 6GB"
CUDA Driver Version: 9.2
CUDA Capability Major/Minor version number: 6.1
Total amount of global memory: 6078 MBytes (6373572608 bytes)
(10) Multiprocessors, (128) CUDA Cores/MP: 1280 CUDA Cores
GPU Max Clock rate: 1709 MHz (1.71 GHz)
Memory Clock rate: 4004 Mhz
Memory Bus Width: 192-bit
L2 Cache Size: 1572864 bytes
Max Texture Dimension Sizes 1D=(131072) 2D=(131072, 65536) 3D=(16384, 16384, 16384)
Maximum Layered 1D Texture Size, (num) layers 1D=(32768), 2048 layers
Maximum Layered 2D Texture Size, (num) layers 2D=(32768, 32768), 2048 layers
Total amount of constant memory: 65536 bytes
Total amount of shared memory per block: 49152 bytes
Total number of registers available per block: 65536
Warp size: 32
Maximum number of threads per multiprocessor: 2048
Maximum number of threads per block: 1024
Max dimension size of a thread block (x,y,z): (1024, 1024, 64)
Max dimension size of a grid size (x,y,z): (2147483647, 65535, 65535)
Texture alignment: 512 bytes
Maximum memory pitch: 2147483647 bytes
Concurrent copy and kernel execution: Yes with 2 copy engine(s)
Run time limit on kernels: No
Integrated GPU sharing Host Memory: No
Support host page-locked memory mapping: Yes
Concurrent kernel execution: Yes
Alignment requirement for Surfaces: Yes
Device has ECC support: Disabled
Device supports Unified Addressing (UVA): Yes
Device supports Compute Preemption: Yes
Supports Cooperative Kernel Launch: Yes
Supports MultiDevice Co-op Kernel Launch: Yes
Device PCI Domain ID / Bus ID / location ID: 0 / 3 / 0
Compute Mode:
< Default (multiple host threads can use ::cudaSetDevice() with device simultaneously) >
Hmm, so let's say that we set 128 threads per block and derive the grid size from the shared-memory limit:
// Shared memory available per block on this GPU (bytes, from deviceQuery).
#define MAX_SHM 49152
// Number of int elements that fit in one block's shared memory.
// BUG FIX: macro bodies must be parenthesized — unparenthesized,
// `x / GRID_SIZE` would expand to `x / MAX_SHM / sizeof(int)` and
// `BLC_PER_GRD` itself would group its division incorrectly in larger
// expressions.  Note sizeof() makes these expand to size_t values.
#define GRID_SIZE (MAX_SHM / sizeof(int))
#define THR_PER_BLC 128
#define BLC_PER_GRD (GRID_SIZE / THR_PER_BLC)
Then reduce7 works. So does that mean reduce7 depends strictly on the maximum shared memory?
Edit:
It seems I was confused by this line: `while (i < n) {`, where n is GRID_SIZE. For now I don't know what `i` means; I need some time to digest it. But it's good to know that one block can only hold a specific number of threads, which in this case we had to match to the SM limits.