My code reads data from global memory and performs a calculation whose result must, at the end, be normalized by the sum of all the data in that particular block. I am using techniques from Mark Harris's "Optimizing Parallel Reduction in CUDA" to do the sum, and that works significantly faster than summing via a loop (in my original code, a gain of almost 5x). However, once the sum is done, __syncthreads() is stalling the warps quite a bit and has become the bottleneck of my code.
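For reference, the loop-based sum the reduction replaced looked roughly like this (a reconstruction, not my exact code): thread 0 serially accumulates the block's values after a barrier.
// Rough sketch of the serial-loop baseline (a reconstruction, not the exact original code).
__syncthreads();
if (tid == 0) {
float s = 0.0f;
for (int j = 0; j < blockDim.x; j++)
s += power_sums[j]; // one thread walks the whole shared array
power_sums[0] = s;
}
__syncthreads();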
My SM and memory usage are at ~39% and 8% respectively, which I'm assuming means my code is latency-bound. Theoretical warps per scheduler is 8 and active warps is 7.71, but eligible warps is only 0.24, and "No Eligible [%]" is 77%. I'm assuming the Stall Wait and Stall Barrier metrics, at 13.59 and 13.04 cycles per instruction respectively, are responsible for the low warp eligibility. I've included the sampling-data values for the stalled lines in the code comments below (I'm particularly concerned about Stall 1, as it accounts for 29%).
My actual code is a little more complicated, mainly due to a larger number of variables to be read / calculated and different equations, but my kernel is pretty much what I have included below.
#include <stdio.h>
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort = true) {
if (code != cudaSuccess) {
fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
}
#define length_1 256
#define length_2 100
#define length_3 250
__device__ float * d;
__global__ void kernel() {
int tid = threadIdx.x;
__shared__ float d_for_current_block[512];
d_for_current_block[tid] = d[blockIdx.x * blockDim.x + threadIdx.x]; // Move to shared memory because originally d isn't coalesced.
for (int k = 0; k < length_2; k++) {
__shared__ float power_sums[512];
__shared__ float powers[512];
float power = (2.0f * d_for_current_block[tid] - 1.0f) / 1.0f; // Stall 5; sampling data [all] = 1,111 (float literals avoid implicit double-precision math)
powers[tid] = power;
power_sums[tid] = power * power;
// Sum powers of all threads in current block.
__syncthreads();
if (tid < 128) // Stall 4; sampling data [all] = 1,159
power_sums[tid] += power_sums[tid + 128];
__syncthreads();
if (tid < 64) // Stall 3; sampling data [all] = 2,213
power_sums[tid] += power_sums[tid + 64];
__syncthreads();
if (tid < 32) { // Stall 2; sampling data [all] = 3,100
// __syncwarp() after each step keeps the warp-synchronous reduction safe on Volta and newer.
power_sums[tid] += power_sums[tid + 32]; __syncwarp();
power_sums[tid] += power_sums[tid + 16]; __syncwarp();
power_sums[tid] += power_sums[tid + 8]; __syncwarp();
power_sums[tid] += power_sums[tid + 4]; __syncwarp();
power_sums[tid] += power_sums[tid + 2]; __syncwarp();
power_sums[tid] += power_sums[tid + 1]; __syncwarp();
}
__syncthreads();
powers[tid] = powers[tid] / power_sums[0]; // Stall 1; sampling data [all] = 17,016
__syncthreads(); // Keep the next iteration from overwriting power_sums[0] before every warp has read it.
}
}
int main(){
float * h, * d_ptr;
h = (float *) malloc(length_1 * length_2 * length_3 * sizeof(float));
for (int k = 0; k < length_3; k++)
for (int j = 0; j < length_1; j++)
for (int i = 0; i < length_2; i++)
h[k * length_1 * length_2 + j * length_2 + i] = 100 * (float) rand() / (float)(RAND_MAX); // Index the full array, not just h[i].
gpuErrchk( cudaMalloc((void**) &d_ptr, length_1 * length_2 * length_3 * sizeof(float)) );
gpuErrchk( cudaMemcpy(d_ptr, h, length_1 * length_2 * length_3 * sizeof(float), cudaMemcpyHostToDevice) );
gpuErrchk( cudaMemcpyToSymbol(d, &d_ptr, sizeof(float*)) );
kernel<<<length_3, length_1>>>();
gpuErrchk( cudaPeekAtLastError() );
gpuErrchk( cudaDeviceSynchronize() );
gpuErrchk( cudaFree(d_ptr) );
free( h );
}
My question is: is there a way to reduce the latency that the barrier wait is creating? I understand where it comes from, but I was wondering whether restructuring the code, or something else I'm not seeing, could decrease it. I've played around with changing the block size and it doesn't make much of a difference, which makes sense: in any case, all the threads need to finish before any warp can proceed.
From my understanding, one way to hide latency is to give warps other computation to do while they wait, but I don't have any other calculation to perform besides getting the normalization value (power_sums[0]) and using it to normalize all my powers. Is there something else I can do, or is this pretty much what I can expect for this particular problem?
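For what it's worth, here is a sketch of one restructuring I've been considering: a two-level reduction with __shfl_down_sync, which cuts down the number of __syncthreads() per iteration (this assumes blockDim.x == 256 and compute capability 3.0 or later; it's a sketch, not my actual code):
// Sketch: shuffle-based block reduction; warp_sums and the loop bounds
// assume a 256-thread block (8 warps of 32 lanes each).
__shared__ float warp_sums[8];
float v = power * power;
for (int offset = 16; offset > 0; offset >>= 1)
v += __shfl_down_sync(0xffffffff, v, offset); // reduce within each warp via registers
if ((tid & 31) == 0)
warp_sums[tid >> 5] = v; // lane 0 of each warp writes its partial sum
__syncthreads();
if (tid < 8) { // the first 8 lanes combine the 8 partial sums
float w = warp_sums[tid];
for (int offset = 4; offset > 0; offset >>= 1)
w += __shfl_down_sync(0xff, w, offset);
if (tid == 0)
warp_sums[0] = w; // block-wide sum
}
__syncthreads();
float normalized = power / warp_sums[0];
// If this runs inside a loop, a trailing __syncthreads() is still needed before warp_sums is overwritten.
I don't know yet whether this would actually help here: the final read of warp_sums[0] still has to wait for every warp, so the barrier latency may simply move rather than shrink.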
EDIT:
Updated code that addresses Robert Crovella's comments below. New numbers: SM usage = 39%; memory usage = 67%; active warps = 7.95; eligible warps per scheduler = 0.25; No Eligible [%] = 76.04%.
In my original full code, SM usage and memory usage are 49% and 5% respectively. Other numbers are similar.
#include <stdio.h>
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort = true) {
if (code != cudaSuccess) {
fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
}
#define length_1 256
#define length_2 100
#define length_3 250
__device__ float * d;
__global__ void kernel(float * d_all_sums) {
int tid = threadIdx.x;
float final = 0.0;
for (int i = 0; i < length_2; i++) {
__shared__ float power_sums[512];
__shared__ float powers[512];
int index = blockIdx.x * blockDim.x * length_2 + threadIdx.x * length_2 + i;
float po = d[index]; // Stall 2; sampling data [all] = 4,114
float power = (2.0f * po - 1.0f) / 2.0f; // Stall 5; sampling data [all] = 1,111 (float literals avoid implicit double-precision math)
powers[tid] = power;
power_sums[tid] = power * power;
// Sum powers of all threads in current block.
__syncthreads();
if (tid < 128) // Stall 5; sampling data [all] = 1,172
power_sums[tid] += power_sums[tid + 128];
__syncthreads();
if (tid < 64) // Stall 4; sampling data [all] = 2,037
power_sums[tid] += power_sums[tid + 64];
__syncthreads();
if (tid < 32) { // Stall 3; sampling data [all] = 2,843
// __syncwarp() after each step keeps the warp-synchronous reduction safe on Volta and newer.
power_sums[tid] += power_sums[tid + 32]; __syncwarp();
power_sums[tid] += power_sums[tid + 16]; __syncwarp();
power_sums[tid] += power_sums[tid + 8]; __syncwarp();
power_sums[tid] += power_sums[tid + 4]; __syncwarp();
power_sums[tid] += power_sums[tid + 2]; __syncwarp();
power_sums[tid] += power_sums[tid + 1]; __syncwarp();
}
__syncthreads();
final += powers[tid] / power_sums[0]; // Stall 1; sampling data [all] = 15,775
__syncthreads(); // Keep the next iteration from overwriting power_sums[0] before every warp has read it.
}
if (tid == 0)
d_all_sums[blockIdx.x] = final;
}
int main(){
float * h, * h_all_sums;
float * d_ptr, * d_all_sums;
h = (float *) malloc(length_1 * length_2 * length_3 * sizeof(float));
h_all_sums = (float *) malloc(length_3 * sizeof(float));
memset(h_all_sums, 0, length_3 * sizeof(float)); // memset takes an int fill value and a byte count.
for (int k = 0; k < length_3; k++)
for (int j = 0; j < length_1; j++)
for (int i = 0; i < length_2; i++)
h[k * length_1 * length_2 + j * length_2 + i] = 100 * (float) rand() / (float)(RAND_MAX);
gpuErrchk( cudaMalloc((void**) &d_ptr, length_1 * length_2 * length_3 * sizeof(float)) );
gpuErrchk( cudaMemcpy(d_ptr, h, length_1 * length_2 * length_3 * sizeof(float), cudaMemcpyHostToDevice) );
gpuErrchk( cudaMemcpyToSymbol(d, &d_ptr, sizeof(float*)) );
gpuErrchk( cudaMalloc( (void**) &d_all_sums, length_3 * sizeof(float) ) );
kernel<<<length_3, length_1>>>(d_all_sums);
gpuErrchk( cudaMemcpy(h_all_sums, d_all_sums, length_3 * sizeof(float), cudaMemcpyDeviceToHost) );
FILE *f = fopen("test_output.txt", "w");
if (f != NULL) {
for (int k = 0; k < length_3; k++)
fprintf(f, "k = %d; power = %f.\n", k, h_all_sums[k]);
fclose( f ); // Only close if fopen succeeded.
}
gpuErrchk( cudaPeekAtLastError() );
gpuErrchk( cudaDeviceSynchronize() );
gpuErrchk( cudaFree(d_ptr) );
gpuErrchk( cudaFree(d_all_sums) );
free( h );
free( h_all_sums );
}