As just a copy kernel, the best possible performance is approximately bounded by the usable memory bandwidth. A rough estimate of this can be obtained by running the bandwidthTest CUDA sample code and referring to the reported device-to-device transfer number (which will vary depending on the GPU).
Your kernel is already fairly well written in that both the load and the store operations should coalesce nicely. This is evident by inspection of the code, but you can prove it to yourself by running nvprof with --metrics gld_efficiency and --metrics gst_efficiency. Both numbers should be close to 100% (they are, according to my testing).
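For example, something like the following (assuming the test executable built below is named t822):

$ nvprof --metrics gld_efficiency,gst_efficiency ./t822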
So in my case, when I run your kernel on a Quadro 5000 GPU, taking the transfer size and dividing by the kernel execution time, I get a number that is about 60% of the available bandwidth. There's not much else going on in your kernel, so we're left to focus on these two lines in the loop:
int colId = j / numRows;
int rowId = j % numRows;
It turns out that integer division and modulo are both fairly expensive on the GPU; they are implemented by instruction sequences generated by the compiler -- there is no native divide or modulo machine instruction. So if we can figure out a way to get rid of these in the main loop, we may be able to get closer to our goal of 100% of the bandwidth reported by bandwidthTest (device-to-device).
Since the increment in your loop is fixed (stride), I believe we can (mostly) pre-compute the increments that need to be added to colId and rowId on each iteration of the loop, and use addition and subtraction, rather than division, inside the loop. The modified kernel would look like this:
__global__ void my_extractColumn_kernel(real_t *tgt, real_t *src, int *indices, int numRows, int len) {
  int stride = gridDim.x * blockDim.x;
  int div = stride/numRows;            // whole columns advanced per grid-stride step
  int rem = stride%numRows;            // leftover rows advanced per grid-stride step
  int tid = blockDim.x * blockIdx.x + threadIdx.x;
  int colId = tid / numRows;
  int rowId = tid % numRows;
  for (int j = tid; j < len; j += stride) {
    tgt[j] = src[indices[colId]*numRows + rowId];
    colId += div;
    rowId += rem;
    if (rowId >= numRows) {rowId-=numRows; colId++;}   // carry into the next column
  }
}
By pre-computing the increments per loop iteration, we avoid the "expensive" division-type operations in the main loop. So what about performance? This kernel gets closer to the 100% bandwidth goal. Here's a full test code:
#include <stdio.h>

#define NUMR 1000
#define NUMC 20000
#define DSIZE (NUMR*NUMC)
#define EXTC 10000
#define EXSZ (NUMR*EXTC)

#define nTPB 256
#define nBLK 64

typedef float real_t;

__global__ void extractColumn_kernel(real_t *tgt, real_t *src, int *indices, int numRows, int len) {
  int stride = gridDim.x * blockDim.x;
  int tid = blockDim.x * blockIdx.x + threadIdx.x;
  for (int j = tid; j < len; j += stride) {
    int colId = j / numRows;
    int rowId = j % numRows;
    tgt[j] = src[indices[colId]*numRows + rowId];
  }
}

__global__ void my_extractColumn_kernel(real_t *tgt, real_t *src, int *indices, int numRows, int len) {
  int stride = gridDim.x * blockDim.x;
  int div = stride/numRows;
  int rem = stride%numRows;
  int tid = blockDim.x * blockIdx.x + threadIdx.x;
  int colId = tid / numRows;
  int rowId = tid % numRows;
  for (int j = tid; j < len; j += stride) {
    tgt[j] = src[indices[colId]*numRows + rowId];
    colId += div;
    rowId += rem;
    if (rowId >= numRows) {rowId-=numRows; colId++;}
  }
}

__global__ void copy_kernel(real_t *tgt, real_t *src, int len){
  int tid = threadIdx.x+blockDim.x*blockIdx.x;
  while (tid < len){
    tgt[tid] = src[tid];
    tid+=blockDim.x*gridDim.x;
  }
}

int main(){
  real_t *h_a, *d_a, *h_b, *d_b, *h_bi;
  h_a = (real_t *) malloc(DSIZE*sizeof(real_t));
  cudaMalloc(&d_a, DSIZE*sizeof(real_t));
  h_b = (real_t *) malloc(EXSZ*sizeof(real_t));
  cudaMalloc(&d_b, EXSZ*sizeof(real_t));
  h_bi = (real_t *) malloc(EXSZ*sizeof(real_t));
  int *h_ind, *d_ind;
  h_ind = (int *) malloc(EXTC*sizeof(int));
  cudaMalloc(&d_ind, EXTC*sizeof(int));

  for (int i = 0; i < EXTC; i++) h_ind[i] = i;
  for (int i = 0; i < DSIZE; i++) h_a[i] = i;

  cudaMemcpy(d_a, h_a, DSIZE*sizeof(real_t), cudaMemcpyHostToDevice);
  cudaMemcpy(d_ind, h_ind, EXTC*sizeof(int), cudaMemcpyHostToDevice);

  extractColumn_kernel<<<nBLK, nTPB>>>(d_b, d_a, d_ind, NUMR, NUMR*EXTC);
  cudaMemcpy(h_b, d_b, EXSZ*sizeof(real_t), cudaMemcpyDeviceToHost);

  copy_kernel<<<nBLK, nTPB>>>(d_b, d_a, NUMR*EXTC);
  cudaDeviceSynchronize();

  my_extractColumn_kernel<<<nBLK, nTPB>>>(d_b, d_a, d_ind, NUMR, NUMR*EXTC);
  cudaMemcpy(h_bi, d_b, EXSZ*sizeof(real_t), cudaMemcpyDeviceToHost);

  for (int i = 0; i < EXSZ; i++)
    if (h_bi[i] != h_b[i]) {printf("mismatch at %d, was: %f, should be: %f\n", i, h_bi[i], h_b[i]); return 1;}
  printf("Success!\n");
  return 0;
}
I've included a test of your kernel, my kernel, and, in between, a "copy kernel" which just does a pure copy of the same amount of data. This helps confirm our idea of the upper bound on the available bandwidth (see below).
Now, the performance data. bandwidthTest on this GPU tells us:
$ /usr/local/cuda/samples/bin/x86_64/linux/release/bandwidthTest
[CUDA Bandwidth Test] - Starting...
Running on...
Device 0: Quadro 5000
Quick Mode
Host to Device Bandwidth, 1 Device(s)
PINNED Memory Transfers
Transfer Size (Bytes) Bandwidth(MB/s)
33554432 5855.8
Device to Host Bandwidth, 1 Device(s)
PINNED Memory Transfers
Transfer Size (Bytes) Bandwidth(MB/s)
33554432 6334.8
Device to Device Bandwidth, 1 Device(s)
PINNED Memory Transfers
Transfer Size (Bytes) Bandwidth(MB/s)
33554432 101535.4
Result = PASS
NOTE: The CUDA Samples are not meant for performance measurements. Results may vary when GPU Boost is enabled.
$
So we've got about 100GB/s of bandwidth available. Now, running nvprof --print-gpu-trace, we see:
$ nvprof --print-gpu-trace ./t822
==17985== NVPROF is profiling process 17985, command: ./t822
Success!
==17985== Profiling application: ./t822
==17985== Profiling result:
Start Duration Grid Size Block Size Regs* SSMem* DSMem* Size Throughput Device Context Stream Name
781.98ms 29.400ms - - - - - 80.000MB 2.7211GB/s Quadro 5000 (0) 1 7 [CUDA memcpy HtoD]
811.40ms 9.0560us - - - - - 40.000KB 4.4170GB/s Quadro 5000 (0) 1 7 [CUDA memcpy HtoD]
811.44ms 1.3377ms (64 1 1) (256 1 1) 15 0B 0B - - Quadro 5000 (0) 1 7 extractColumn_kernel(float*, float*, int*, int, int) [188]
812.78ms 21.614ms - - - - - 40.000MB 1.8507GB/s Quadro 5000 (0) 1 7 [CUDA memcpy DtoH]
834.94ms 816.10us (64 1 1) (256 1 1) 9 0B 0B - - Quadro 5000 (0) 1 7 copy_kernel(float*, float*, int) [194]
835.77ms 911.39us (64 1 1) (256 1 1) 18 0B 0B - - Quadro 5000 (0) 1 7 my_extractColumn_kernel(float*, float*, int*, int, int) [202]
836.69ms 20.661ms - - - - - 40.000MB 1.9360GB/s Quadro 5000 (0) 1 7 [CUDA memcpy DtoH]
Regs: Number of registers used per CUDA thread. This number includes registers used internally by the CUDA driver and/or tools and can be more than what the compiler shows.
SSMem: Static shared memory allocated per CUDA block.
DSMem: Dynamic shared memory allocated per CUDA block.
$
The transfer size here is 1000 rows * 10000 columns * 4 bytes/element * 2 transfers per element (one read, one write) = 80,000,000 bytes. Your original kernel transfers this data in 1.34ms, for an average bandwidth of about 60GB/s. The "pure copy" kernel transfers the same data in 0.816ms, for an average bandwidth of 98GB/s - pretty close to our goal of 100GB/s. My modified "column-copy" kernel takes 0.911ms, so it's delivering about 88GB/s.
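Spelling out the arithmetic from the trace above (80,000,000 bytes moved in each case):

80,000,000 bytes / 1.3377 ms ≈ 60 GB/s   (extractColumn_kernel)
80,000,000 bytes / 0.8161 ms ≈ 98 GB/s   (copy_kernel)
80,000,000 bytes / 0.9114 ms ≈ 88 GB/s   (my_extractColumn_kernel)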
Looking for more performance? Since my computed variables rem and div are the same for every thread, you could precompute these quantities in host code and pass them as kernel parameters (a sketch of this follows). I'm not sure it will make much difference, but you can give it a try; you now have a roadmap for evaluating the performance effect, if any.
Notes:
Note that I believe the logic for updating the indices in my modified kernel is sound, but I haven't exhaustively tested it; it passes the simple test case presented here.
I've omitted proper CUDA error checking. However, if you're having trouble, you should use it and/or run your code with cuda-memcheck. (A sketch of a simple error-checking macro follows these notes.)
As I mentioned, your kernel is already pretty well written from a coalescing standpoint, so it already achieves about 60% of the best case. If you were looking for a 2x, 5x, or 10x speedup here, you won't find it, and it's not reasonable to expect one.
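For reference, a sketch of a simple error-checking macro of the kind I'd use (the name is just illustrative; it needs <stdio.h> and <stdlib.h>):

#define cudaCheck(call) do { \
    cudaError_t err = (call); \
    if (err != cudaSuccess) { \
      fprintf(stderr, "CUDA error: %s at %s:%d\n", cudaGetErrorString(err), __FILE__, __LINE__); \
      exit(1); \
    } \
  } while (0)

// usage: wrap each runtime API call, e.g.
// cudaCheck(cudaMemcpy(d_a, h_a, DSIZE*sizeof(real_t), cudaMemcpyHostToDevice));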
EDIT: Further improvement
A likely reason that we don't get even closer to the pure-copy kernel in this case is this indirection:

tgt[j] = src[indices[colId]*numRows + rowId];
             ^^^^^^^^^^^^^^
The read from indices (an array in global memory) represents an "extra" data access that our pure-copy kernel doesn't have to do. There may be clever ways to optimize the handling of this access as well. Since it is repetitive (unlike the read of src and the write to tgt), it suggests that some use of caching or specialized memory may help.
If we inspect the nature of the access carefully, we can observe that, for reasonably large matrices, the access to indices will most of the time be uniform across the threads in a warp. That is, within a given warp, all threads will typically have the same value of colId and will therefore request the same element of indices (only the occasional warp, at a column boundary, spans two colId values). This access pattern suggests a possible optimization using CUDA __constant__ memory. The changes needed are not extensive; we basically need to move the indices data into a __constant__ array.
Here's a modified code:
$ cat t822.cu
#include <stdio.h>

#define NUMR 1000
#define NUMC 20000
#define DSIZE (NUMR*NUMC)
#define EXTC 10000
#define EXSZ (NUMR*EXTC)

#define nTPB 256
#define nBLK 64

typedef float real_t;

__constant__ int my_indices[EXTC];

__global__ void extractColumn_kernel(real_t *tgt, real_t *src, int *indices, int numRows, int len) {
  int stride = gridDim.x * blockDim.x;
  int tid = blockDim.x * blockIdx.x + threadIdx.x;
  for (int j = tid; j < len; j += stride) {
    int colId = j / numRows;
    int rowId = j % numRows;
    tgt[j] = src[indices[colId]*numRows + rowId];
  }
}

__global__ void my_extractColumn_kernel(real_t *tgt, real_t *src, int numRows, int len, int div, int rem) {
  int stride = gridDim.x * blockDim.x;
  int tid = blockDim.x * blockIdx.x + threadIdx.x;
  int colId = tid / numRows;
  int rowId = tid % numRows;
  for (int j = tid; j < len; j += stride) {
    tgt[j] = src[my_indices[colId]*numRows + rowId];
    colId += div;
    rowId += rem;
    if (rowId >= numRows) {rowId-=numRows; colId++;}
  }
}

__global__ void copy_kernel(real_t *tgt, real_t *src, int len){
  int tid = threadIdx.x+blockDim.x*blockIdx.x;
  while (tid < len){
    tgt[tid] = src[tid];
    tid+=blockDim.x*gridDim.x;
  }
}

int main(){
  real_t *h_a, *d_a, *h_b, *d_b, *h_bi;
  h_a = (real_t *) malloc(DSIZE*sizeof(real_t));
  cudaMalloc(&d_a, DSIZE*sizeof(real_t));
  h_b = (real_t *) malloc(EXSZ*sizeof(real_t));
  cudaMalloc(&d_b, EXSZ*sizeof(real_t));
  h_bi = (real_t *) malloc(EXSZ*sizeof(real_t));
  int *h_ind, *d_ind;
  h_ind = (int *) malloc(EXTC*sizeof(int));
  cudaMalloc(&d_ind, EXTC*sizeof(int));

  for (int i = 0; i < EXTC; i++) h_ind[i] = i;
  for (int i = 0; i < DSIZE; i++) h_a[i] = i;

  cudaMemcpy(d_a, h_a, DSIZE*sizeof(real_t), cudaMemcpyHostToDevice);
  cudaMemcpy(d_ind, h_ind, EXTC*sizeof(int), cudaMemcpyHostToDevice);

  extractColumn_kernel<<<nBLK, nTPB>>>(d_b, d_a, d_ind, NUMR, NUMR*EXTC);
  cudaMemcpy(h_b, d_b, EXSZ*sizeof(real_t), cudaMemcpyDeviceToHost);

  copy_kernel<<<nBLK, nTPB>>>(d_b, d_a, NUMR*EXTC);
  cudaDeviceSynchronize();

  cudaMemcpyToSymbol(my_indices, h_ind, EXTC*sizeof(int));
  int mydiv = (nBLK*nTPB)/NUMR;
  int myrem = (nBLK*nTPB)%NUMR;
  my_extractColumn_kernel<<<nBLK, nTPB>>>(d_b, d_a, NUMR, NUMR*EXTC, mydiv, myrem);
  cudaMemcpy(h_bi, d_b, EXSZ*sizeof(real_t), cudaMemcpyDeviceToHost);

  for (int i = 0; i < EXSZ; i++)
    if (h_bi[i] != h_b[i]) {printf("mismatch at %d, was: %f, should be: %f\n", i, h_bi[i], h_b[i]); return 1;}
  printf("Success!\n");
  return 0;
}
$
And the performance results:
$ nvprof --print-gpu-trace ./t822
==18998== NVPROF is profiling process 18998, command: ./t822
Success!
==18998== Profiling application: ./t822
==18998== Profiling result:
Start Duration Grid Size Block Size Regs* SSMem* DSMem* Size Throughput Device Context Stream Name
773.01ms 28.300ms - - - - - 80.000MB 2.8269GB/s Quadro 5000 (0) 1 7 [CUDA memcpy HtoD]
801.33ms 9.0240us - - - - - 40.000KB 4.4326GB/s Quadro 5000 (0) 1 7 [CUDA memcpy HtoD]
801.38ms 1.3001ms (64 1 1) (256 1 1) 15 0B 0B - - Quadro 5000 (0) 1 7 extractColumn_kernel(float*, float*, int*, int, int) [188]
802.68ms 20.773ms - - - - - 40.000MB 1.9256GB/s Quadro 5000 (0) 1 7 [CUDA memcpy DtoH]
823.98ms 811.75us (64 1 1) (256 1 1) 9 0B 0B - - Quadro 5000 (0) 1 7 copy_kernel(float*, float*, int) [194]
824.82ms 8.9920us - - - - - 40.000KB 4.4484GB/s Quadro 5000 (0) 1 7 [CUDA memcpy HtoD]
824.83ms 824.65us (64 1 1) (256 1 1) 13 0B 0B - - Quadro 5000 (0) 1 7 my_extractColumn_kernel(float*, float*, int, int, int, int) [204]
825.66ms 21.023ms - - - - - 40.000MB 1.9027GB/s Quadro 5000 (0) 1 7 [CUDA memcpy DtoH]
Regs: Number of registers used per CUDA thread. This number includes registers used internally by the CUDA driver and/or tools and can be more than what the compiler shows.
SSMem: Static shared memory allocated per CUDA block.
DSMem: Dynamic shared memory allocated per CUDA block.
We see that we are now very nearly at the goal of 100% utilization of the available bandwidth (824 us for the column-copy kernel vs. 811 us for the pure copy kernel). __constant__ memory is limited to 64KB in total, so with 4-byte int indices this approach is only usable when the number of indices (effectively the number of columns that need to be copied) is less than about 16,000 (64KB / 4 bytes = 16,384 entries); the EXTC = 10000 case here fits comfortably.
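If the column count is only known at run time, a hypothetical way to respect that limit (not part of the code above) is to declare the __constant__ array at its maximum size and fall back to the original global-memory-indices kernel when the table doesn't fit. A sketch, assuming a run-time column count numCols and appropriately recomputed mydiv/myrem:

__constant__ int my_indices[16384];   // 64KB / sizeof(int): largest table that fits

// host side (numCols is a hypothetical run-time value):
if (numCols <= 16384) {
  cudaMemcpyToSymbol(my_indices, h_ind, numCols*sizeof(int));
  my_extractColumn_kernel<<<nBLK, nTPB>>>(d_b, d_a, NUMR, NUMR*numCols, mydiv, myrem);
} else {
  // index table too large for __constant__ memory: use the original kernel
  extractColumn_kernel<<<nBLK, nTPB>>>(d_b, d_a, d_ind, NUMR, NUMR*numCols);
}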