The first two optimization priorities for any CUDA programmer are:
- Use lots of threads
- Use memory efficiently
For your problem, you'll have no trouble with the first one - it readily decomposes into a set of problems that are independent and can be assigned to a lot of parallel threads. The second priority is where you want to focus, then. With respect to global memory, this means we should strive for coalesced access, whenever possible. We should pay particular attention to the reads.
I'll need to make some assumptions. I'll assume that your organization of dimensions is ROW, COLUMN, DEPTH and that your data is stored in the usual C-style, i.e. row-major storage. With those assumptions, then, the request (summing over the entire first (or second) axis) is effectively summing over the entire row or summing over the entire column. If you do a bit of searching here on the cuda
tag, you'll find worked examples for both (here is one such example). Although they don't necessarily all cover the 3D case, they should provide a pretty good roadmap. What you'll discover is that these two cases should be handled differently, with an eye towards coalesced global memory access, i.e. the optimization priority already mentioned. The row-direction is also the coalescing direction, so if we need to sum rows, then we'll need to use a classical parallel reduction technique, so that we can read rows, and sum the elements together. If we need to sum columns, the efficient kernel is much easier to write; each thread can be responsible for a column, and can just keep a running sum in a for-loop.
In your case, you appear to be summing columns (but see note below). What follows is a worked example, comparing your approach to the faster running-column-sum method, with coalesced access (adjacent threads reading adjacent elements in memory):
$ cat t1263.cu
#include <stdlib.h>
#include <stdio.h>
#include <math.h>
const int my_N = 10000;
const int my_D = 3;
const int my_blockSize = 768;
const int my_loopSize = my_N*my_N*my_D;
const int my_numBlocks = (my_loopSize + my_blockSize -1)/my_blockSize;
const int bsize = 512;
const float TOL = 0.1f;
#define HANDLE_ERROR(x) x
#define cudaCheckErrors(msg) \
do { \
cudaError_t __err = cudaGetLastError(); \
if (__err != cudaSuccess) { \
fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
msg, cudaGetErrorString(__err), \
__FILE__, __LINE__); \
fprintf(stderr, "*** FAILED - ABORTING\n"); \
exit(1); \
} \
} while (0)
#include <time.h>
#include <sys/time.h>
#define USECPSEC 1000000ULL
long long dtime_usec(unsigned long long start){
timeval tv;
gettimeofday(&tv, 0);
return ((tv.tv_sec*USECPSEC)+tv.tv_usec)-start;
}
namespace kernel {
__global__ void sumNND(float* devPtrIn, float* devPtrOut, const int N, const int D) {
int index = blockIdx.x * blockDim.x + threadIdx.x;
int stride = blockDim.x * gridDim.x;
for (int id = index; id < N * N * D; id += stride) {
const unsigned int d = id % D;
const unsigned int i = (id - d) / D;
const unsigned int n = i / N;
const unsigned int m = i % N;
atomicAdd(&devPtrOut[d + D * n], devPtrIn[d + D * n + N * m]);
}
}
}
void sumNND(const int numBlocks, const int blockSize, float* devPtrIn, float* devPtrOut, const int N, const int D) {
HANDLE_ERROR(cudaMemset(devPtrOut, 0, N * D * sizeof(float)));
kernel::sumNND<<<numBlocks, blockSize>>>(devPtrIn, devPtrOut, N, D);
HANDLE_ERROR(cudaDeviceSynchronize());
}
// kernel assumes 1 block assigned per row, use block-striding methodology
// assumes block size is a power of 2
__global__ void sum_rows_NND(const float * __restrict__ devPtrIn, float * __restrict__ devPtrOut, const int N, const int D) {
__shared__ float sdata[bsize];
sdata[threadIdx.x] = 0;
for (int i = threadIdx.x; i < N; i += blockDim.x) // block-stride
sdata[threadIdx.x] += devPtrIn[(blockIdx.x * N) + i];
__syncthreads();
for (int i = blockDim.x>>1; i > 0; i>>=1){
if (threadIdx.x < i) sdata[threadIdx.x] += sdata[threadIdx.x+i];
__syncthreads();}
if (!threadIdx.x) devPtrOut[blockIdx.x] = sdata[0];
}
// kernel assumes one thread assigned per column sum
// launch N threads
__global__ void sum_cols_NND(const float * __restrict__ devPtrIn, float * __restrict__ devPtrOut, const int N, const int D) {
int idx = threadIdx.x+blockDim.x*blockIdx.x;
int ido = idx;
if (idx < N){
for (int j = 0; j < D; j++){
float temp = 0;
for (int i = 0; i < N; i++) temp += devPtrIn[idx + (i*N)];
devPtrOut[ido] = temp;
ido += N;
idx += N*N;}}
}
int main(){
float *h_data, *d_data, *h_res1, *h_res2, *d_res;
h_data = new float[my_loopSize];
cudaMalloc(&d_data, my_loopSize*sizeof(d_data[0]));
h_res1 = new float[my_N*my_D];
h_res2 = new float[my_N*my_D];
cudaMalloc(&d_res, my_N*my_D*sizeof(d_res[0]));
for (int i = 0; i < my_loopSize; i++) h_data[i] = rand()/(float)RAND_MAX;
cudaCheckErrors("CUDA failure");
cudaMemcpy(d_data, h_data, my_loopSize*sizeof(d_data[0]), cudaMemcpyHostToDevice);
// test original approach
cudaMemset(d_res, 0, my_N*my_D*sizeof(d_res[0]));
unsigned long long dt1 = dtime_usec(0);
kernel::sumNND<<<my_numBlocks, my_blockSize>>>(d_data, d_res, my_N, my_D);
cudaDeviceSynchronize();
dt1 = dtime_usec(dt1);
cudaMemcpy(h_res1, d_res, my_N*my_D*sizeof(d_res[0]), cudaMemcpyDeviceToHost);
//test columnwise reduction
unsigned long long dt2 = dtime_usec(0);
//sum_rows_NND<<<my_N*my_D, bsize>>>(d_data, d_res, my_N, my_D);
sum_cols_NND<<<(my_N + bsize -1)/bsize, bsize>>>(d_data, d_res, my_N, my_D);
cudaDeviceSynchronize();
dt2 = dtime_usec(dt2);
cudaMemcpy(h_res2, d_res, my_N*my_D*sizeof(d_res[0]), cudaMemcpyDeviceToHost);
// validate results
for (int i = 0; i < my_N; i++)
if (fabsf(h_res1[i] - h_res2[i]) > TOL) {printf("mismatch at %d, was %f, should be %f\n", i, h_res2[i], h_res1[i]); return -1;}
cudaCheckErrors("program error");
printf("results match, kernel 1 time: %fs, kernel 2 time: %fs\n", dt1/(float)USECPSEC, dt2/(float)USECPSEC);
// time row reduction kernel
unsigned long long dt3 = dtime_usec(0);
sum_rows_NND<<<my_N*my_D, bsize>>>(d_data, d_res, my_N, my_D);
cudaDeviceSynchronize();
dt3 = dtime_usec(dt3);
printf("row reduction kernel time: %fs\n", dt3/(float)USECPSEC);
cudaCheckErrors("program error");
}
$ nvcc -arch=sm_52 -o t1263 t1263.cu
$ ./t1263
results match, kernel 1 time: 0.459971s, kernel 2 time: 0.013678s
row reduction kernel time: 0.013724s
$
Notes:
The optimized kernel is around 30x faster than your naive atomics kernel. I suspect that a big chunk of this is not actually the use of atomics, but the uncoalesced access. global atomics on newer GPUs can be pretty fast.
The first "page" (NxN) of elements column sum match between my kernel and your kernel (i.e. the first N results match). After the first page (first N results), our results differ. I'm pretty sure my indexing is correct, but after spending a while trying to unravel your indexing, I gave up. I suspect you have a bug in your kernel indexing, if you are trying to sum columns, and all the aforementioned assumptions are true.
I also included a timing measurement of the row-summing kernel, which looks quite different, but produces almost the same timing. This is to be expected, since optimal kernels for these types of problems will be limited by memory bandwidth, which is the same in both cases. Optimal kernels will load all the data, once, in a coalesced fashion. After that, the row-sum vs. column-sum mechanics have relatively little effect on the kernel time.
With a small modification to the initialization of the data, I think it's fairly easy to prove that your kernel is not creating the correct indexing and therefore not producing the correct row-sums after the first "page" (i.e. after the first N
results). After a little more study of your indexing, I have some idea of what is going wrong. One example problem is that for N
not divisible by D
, your kernel d
variable will not reset to zero after the first "page", but this is not the only issue.
Pursuant to item 4, here's a version of the code that has modified data initialization, and a full test of all N
*D
results. The data initialization is such that the first column of the first page will be all zero, the next column all 1, the next column all 2, etc. On the second page, we increment everything by 1, so the first column will be all 1, the second column will be all 2, etc. Therefore it should be easy to agree on what the column sums should be. For the first page, the column sums should be 0, 10000, 20000, etc. For the second page they should be 10000, 20000, 30000, etc. On the first column of the second page, my code produces 10000, your code produces 1. With your changed indexing in the comments, I produce 0 for the first column of the first page, and your code produces 9999. 1 and 9999 cannot possibly be valid column sums according to the data initialization I described:
$ cat t1263.cu
#include <stdlib.h>
#include <stdio.h>
#include <math.h>
const int my_N = 10000;
const int my_D = 3;
const int my_blockSize = 768;
const int my_loopSize = my_N*my_N*my_D;
const int my_numBlocks = (my_loopSize + my_blockSize -1)/my_blockSize;
const int bsize = 512;
const float TOL = 0.1f;
#define HANDLE_ERROR(x) x
#define cudaCheckErrors(msg) \
do { \
cudaError_t __err = cudaGetLastError(); \
if (__err != cudaSuccess) { \
fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
msg, cudaGetErrorString(__err), \
__FILE__, __LINE__); \
fprintf(stderr, "*** FAILED - ABORTING\n"); \
exit(1); \
} \
} while (0)
#include <time.h>
#include <sys/time.h>
#define USECPSEC 1000000ULL
long long dtime_usec(unsigned long long start){
timeval tv;
gettimeofday(&tv, 0);
return ((tv.tv_sec*USECPSEC)+tv.tv_usec)-start;
}
namespace kernel {
__global__ void sumNND(float* devPtrIn, float* devPtrOut, const int N, const int D) {
int index = blockIdx.x * blockDim.x + threadIdx.x;
int stride = blockDim.x * gridDim.x;
for (int id = index; id < N * N * D; id += stride) {
const unsigned int d = id % D; // 0 1 2 0 1 2 0 1 2
const unsigned int i = (id - d) / D; // 0 0 0 1 1 1 2 2 2
const unsigned int n = i / N; // 0 0 0 0 0 0 0 0 0
const unsigned int m = i % N; // 0 0 0 1 1 1 2 2 2
atomicAdd(&devPtrOut[d + D * n], // 0 1 2 0 1 2 0 1 2
devPtrIn[d + D * n + N * m]); // 0 1 2 0+N 1+N 2+N 0+2N 1+2N 2+2N
}
}
}
void sumNND(const int numBlocks, const int blockSize, float* devPtrIn, float* devPtrOut, const int N, const int D) {
HANDLE_ERROR(cudaMemset(devPtrOut, 0, N * D * sizeof(float)));
kernel::sumNND<<<numBlocks, blockSize>>>(devPtrIn, devPtrOut, N, D);
HANDLE_ERROR(cudaDeviceSynchronize());
}
// kernel assumes 1 block assigned per row, use block-striding methodology
// assumes block size is a power of 2
__global__ void sum_rows_NND(const float * __restrict__ devPtrIn, float * __restrict__ devPtrOut, const int N, const int D) {
__shared__ float sdata[bsize];
sdata[threadIdx.x] = 0;
for (int i = threadIdx.x; i < N; i += blockDim.x) // block-stride
sdata[threadIdx.x] += devPtrIn[(blockIdx.x * N) + i];
__syncthreads();
for (int i = blockDim.x>>1; i > 0; i>>=1){
if (threadIdx.x < i) sdata[threadIdx.x] += sdata[threadIdx.x+i];
__syncthreads();}
if (!threadIdx.x) devPtrOut[blockIdx.x] = sdata[0];
}
// kernel assumes one thread assigned per column sum
// launch N threads
__global__ void sum_cols_NND(const float * __restrict__ devPtrIn, float * __restrict__ devPtrOut, const int N, const int D) {
int idx = threadIdx.x+blockDim.x*blockIdx.x;
int ido = idx;
if (idx < N){
for (int j = 0; j < D; j++){
float temp = 0;
for (int i = 0; i < N; i++) temp += devPtrIn[idx + (i*N)];
devPtrOut[ido] = temp;
ido += N;
idx += N*N;}}
}
int main(){
float *h_data, *d_data, *h_res1, *h_res2, *d_res;
h_data = new float[my_loopSize];
cudaMalloc(&d_data, my_loopSize*sizeof(d_data[0]));
h_res1 = new float[my_N*my_D];
h_res2 = new float[my_N*my_D];
cudaMalloc(&d_res, my_N*my_D*sizeof(d_res[0]));
for (int i = 0; i < my_loopSize; i++) h_data[i] = i%my_N + i/(my_N*my_N); //rand()/(float)RAND_MAX;
cudaCheckErrors("CUDA failure");
cudaMemcpy(d_data, h_data, my_loopSize*sizeof(d_data[0]), cudaMemcpyHostToDevice);
// test original approach
cudaMemset(d_res, 0, my_N*my_D*sizeof(d_res[0]));
unsigned long long dt1 = dtime_usec(0);
kernel::sumNND<<<my_numBlocks, my_blockSize>>>(d_data, d_res, my_N, my_D);
cudaDeviceSynchronize();
dt1 = dtime_usec(dt1);
cudaMemcpy(h_res1, d_res, my_N*my_D*sizeof(d_res[0]), cudaMemcpyDeviceToHost);
//test columnwise reduction
unsigned long long dt2 = dtime_usec(0);
//sum_rows_NND<<<my_N*my_D, bsize>>>(d_data, d_res, my_N, my_D);
sum_cols_NND<<<(my_N + bsize -1)/bsize, bsize>>>(d_data, d_res, my_N, my_D);
cudaDeviceSynchronize();
dt2 = dtime_usec(dt2);
cudaMemcpy(h_res2, d_res, my_N*my_D*sizeof(d_res[0]), cudaMemcpyDeviceToHost);
// validate results
for (int i = 0; i < my_N*my_D; i++)
if (fabsf(h_res1[i] - h_res2[i]) > TOL) {printf("mismatch at %d, was %f, should be %f\n", i, h_res2[i], h_res1[i]); return -1;}
cudaCheckErrors("program error");
printf("results match, kernel 1 time: %fs, kernel 2 time: %fs\n", dt1/(float)USECPSEC, dt2/(float)USECPSEC);
// time row reduction kernel
unsigned long long dt3 = dtime_usec(0);
sum_rows_NND<<<my_N*my_D, bsize>>>(d_data, d_res, my_N, my_D);
cudaDeviceSynchronize();
dt3 = dtime_usec(dt3);
printf("row reduction kernel time: %fs\n", dt3/(float)USECPSEC);
cudaCheckErrors("program error");
}
$ nvcc -arch=sm_52 -o t1263 t1263.cu
$ ./t1263
mismatch at 10000, was 10000.000000, should be 1.000000
$