The code below shows an example of Instruction Level Parallelism (ILP). The __global__ function in the example simply performs an assignment between two arrays. For the case ILP=1, we have as many threads as the number of array elements N, so that each thread performs a single assignment. Conversely, for the case ILP=2, we have N/2 threads, each one processing 2 elements. In general, for the case ILP=k, we have N/k threads, each one processing k elements.
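To make the indexing explicit, here is a small host-side sketch (plain C, purely illustrative and not part of the benchmark) that enumerates which elements each thread of each block touches under this scheme; it uses the same index formula as the kernel further down and shows that, for every j, consecutive threads access consecutive elements, so warp accesses stay coalesced.

#include <stdio.h>

// Illustrative only: enumerate the element indices touched by each thread
// for the indexing scheme used in ILPKernel (tid + j * blockDim.x).
int main() {
    const int N         = 16;   // toy number of elements
    const int ILP       = 2;    // elements per thread
    const int BLOCKSIZE = 4;    // threads per block
    const int numBlocks = N / (BLOCKSIZE * ILP);

    for (int b = 0; b < numBlocks; b++)
        for (int t = 0; t < BLOCKSIZE; t++) {
            int tid = t + b * BLOCKSIZE * ILP;   // same formula as in the kernel
            printf("block %d, thread %d ->", b, t);
            for (int j = 0; j < ILP; j++) printf(" %d", tid + j * BLOCKSIZE);
            printf("\n");
        }
    return 0;
}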
Besides the code, I am also reporting below the timings, performed on an NVIDIA GT920M (Kepler architecture), for different values of N and ILP. As can be seen:
- for large values of N, a memory bandwidth close to the maximum one for the GT920M card, namely 14.4 GB/s, is reached;
- for any fixed N, changing the value of ILP does not change performance.
Concerning the second point, I have also tested the same code on Maxwell and observed the same behavior (no change in performance against ILP). For a change in performance against ILP, please see the answer to "The efficiency and performance of ILP for the NVIDIA Kepler architecture", which also reports tests on the Fermi architecture.
The memory speed has been calculated by the following formula:

(2.f * 4.f * N * numITER) / (1e9 * timeTotal * 1e-3)

where 4.f * N * numITER is the number of bytes either read or written (4 bytes per int, N elements, numITER kernel launches), 2.f * 4.f * N * numITER is the total number of bytes read and written, and timeTotal * 1e-3 is the time in seconds (timeTotal is in ms).
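As a sanity check on the bookkeeping, here is a minimal sketch of the bandwidth computation isolated in a small helper; the helper name effectiveBandwidthGBs is mine and the timing value passed in main is illustrative, not a measurement (it roughly corresponds to the ~12.9 GB/s figure reported below for the largest N). Note that dividing by 1e6 * timeTotal (as in the code) is the same as dividing by 1e9 * timeTotal * 1e-3 (as in the formula above).

#include <stdio.h>

// Effective bandwidth in GB/s for numITER launches of a kernel that reads and
// writes N ints (4 bytes each); timeTotalMs is the accumulated time in ms.
// Helper name is illustrative; the formula matches the one used in main() below.
float effectiveBandwidthGBs(int N, int numITER, float timeTotalMs) {
    return (2.f * 4.f * N * numITER) / (1e6 * timeTotalMs);
}

int main() {
    // Illustrative (not measured) total time of 1040 ms for N = 16777216, numITER = 100
    printf("%f GB/s\n", effectiveBandwidthGBs(16777216, 100, 1040.f));
    return 0;
}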
THE CODE
// --- GT920m - 14.4 GB/s
// http://gpuboss.com/gpus/GeForce-GTX-280M-vs-GeForce-920M
#include<stdio.h>
#include<iostream>
#include "Utilities.cuh"
#include "TimingGPU.cuh"
#define BLOCKSIZE 32
#define DEBUG
/****************************************/
/* INSTRUCTION LEVEL PARALLELISM KERNEL */
/****************************************/
__global__ void ILPKernel(const int * __restrict__ d_a, int * __restrict__ d_b, const int ILP, const int N) {

    // Each thread copies ILP elements, strided by blockDim.x so that warp accesses
    // stay coalesced; N is assumed to be a multiple of blockDim.x * ILP.
    const int tid = threadIdx.x + blockIdx.x * blockDim.x * ILP;

    if (tid >= N) return;

    for (int j = 0; j < ILP; j++) d_b[tid + j * blockDim.x] = d_a[tid + j * blockDim.x];
}
/********/
/* MAIN */
/********/
int main() {
    //const int N = 8192;
    const int N = 524288 * 32;
    //const int N = 1048576;
    //const int N = 262144;
    //const int N = 2048;
    const int numITER = 100;
    const int ILP     = 16;

    TimingGPU timerGPU;

    int *h_a = (int *)malloc(N * sizeof(int));
    int *h_b = (int *)malloc(N * sizeof(int));

    for (int i = 0; i < N; i++) {
        h_a[i] = 2;
        h_b[i] = 1;
    }

    int *d_a; gpuErrchk(cudaMalloc(&d_a, N * sizeof(int)));
    int *d_b; gpuErrchk(cudaMalloc(&d_b, N * sizeof(int)));

    gpuErrchk(cudaMemcpy(d_a, h_a, N * sizeof(int), cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(d_b, h_b, N * sizeof(int), cudaMemcpyHostToDevice));
/**************/
/* ILP KERNEL */
/**************/
    float timeTotal = 0.f;
    for (int k = 0; k < numITER; k++) {
        timerGPU.StartCounter();
        ILPKernel<<<iDivUp(N / ILP, BLOCKSIZE), BLOCKSIZE>>>(d_a, d_b, ILP, N);
#ifdef DEBUG
        gpuErrchk(cudaPeekAtLastError());
        gpuErrchk(cudaDeviceSynchronize());
#endif
        timeTotal = timeTotal + timerGPU.GetCounter();
    }

    printf("Bandwidth = %f GB / s; Num blocks = %d\n", (2.f * 4.f * N * numITER) / (1e6 * timeTotal), iDivUp(N / ILP, BLOCKSIZE));
    gpuErrchk(cudaMemcpy(h_b, d_b, N * sizeof(int), cudaMemcpyDeviceToHost));

    for (int i = 0; i < N; i++) if (h_a[i] != h_b[i]) { printf("Error at i = %i for kernel0! Host = %i; Device = %i\n", i, h_a[i], h_b[i]); return 1; }

    return 0;
}
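The code uses my Utilities.cuh and TimingGPU.cuh files. For readers who do not have them, here is a minimal sketch of the only facilities used above (gpuErrchk, iDivUp and the TimingGPU timer); this is an assumption about how those helpers could look, not the actual content of the files.

#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>

// Minimal stand-ins (assumed, not verbatim) for Utilities.cuh / TimingGPU.cuh.

// gpuErrchk: abort on CUDA API errors.
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line) {
    if (code != cudaSuccess) {
        fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        exit(code);
    }
}

// iDivUp: number of blocks of size b needed to cover a elements.
inline int iDivUp(int a, int b) { return (a + b - 1) / b; }

// TimingGPU: event-based timer returning elapsed time in milliseconds.
class TimingGPU {
    cudaEvent_t start_, stop_;
public:
    TimingGPU()  { cudaEventCreate(&start_); cudaEventCreate(&stop_); }
    ~TimingGPU() { cudaEventDestroy(start_); cudaEventDestroy(stop_); }
    void  StartCounter() { cudaEventRecord(start_, 0); }
    float GetCounter()   { cudaEventRecord(stop_, 0); cudaEventSynchronize(stop_);
                           float ms; cudaEventElapsedTime(&ms, start_, stop_); return ms; }
};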
PERFORMANCE
GT 920M
N = 512 - ILP = 1 - BLOCKSIZE = 512 (1 block - each block processes 512 elements) - Bandwidth = 0.092 GB / s
N = 1024 - ILP = 1 - BLOCKSIZE = 512 (2 blocks - each block processes 512 elements) - Bandwidth = 0.15 GB / s
N = 2048 - ILP = 1 - BLOCKSIZE = 512 (4 blocks - each block processes 512 elements) - Bandwidth = 0.37 GB / s
N = 2048 - ILP = 2 - BLOCKSIZE = 256 (4 blocks - each block processes 512 elements) - Bandwidth = 0.36 GB / s
N = 2048 - ILP = 4 - BLOCKSIZE = 128 (4 blocks - each block processes 512 elements) - Bandwidth = 0.35 GB / s
N = 2048 - ILP = 8 - BLOCKSIZE = 64 (4 blocks - each block processes 512 elements) - Bandwidth = 0.26 GB / s
N = 2048 - ILP = 16 - BLOCKSIZE = 32 (4 blocks - each block processes 512 elements) - Bandwidth = 0.31 GB / s
N = 4096 - ILP = 1 - BLOCKSIZE = 512 (8 blocks - each block processes 512 elements) - Bandwidth = 0.53 GB / s
N = 4096 - ILP = 2 - BLOCKSIZE = 256 (8 blocks - each block processes 512 elements) - Bandwidth = 0.61 GB / s
N = 4096 - ILP = 4 - BLOCKSIZE = 128 (8 blocks - each block processes 512 elements) - Bandwidth = 0.74 GB / s
N = 4096 - ILP = 8 - BLOCKSIZE = 64 (8 blocks - each block processes 512 elements) - Bandwidth = 0.74 GB / s
N = 4096 - ILP = 16 - BLOCKSIZE = 32 (8 blocks - each block processes 512 elements) - Bandwidth = 0.56 GB / s
N = 8192 - ILP = 1 - BLOCKSIZE = 512 (16 blocks - each block processes 512 elements) - Bandwidth = 1.4 GB / s
N = 8192 - ILP = 2 - BLOCKSIZE = 256 (16 blocks - each block processes 512 elements) - Bandwidth = 1.1 GB / s
N = 8192 - ILP = 4 - BLOCKSIZE = 128 (16 blocks - each block processes 512 elements) - Bandwidth = 1.5 GB / s
N = 8192 - ILP = 8 - BLOCKSIZE = 64 (16 blocks - each block processes 512 elements) - Bandwidth = 1.4 GB / s
N = 8192 - ILP = 16 - BLOCKSIZE = 32 (16 blocks - each block processes 512 elements) - Bandwidth = 1.3 GB / s
...
N = 16777216 - ILP = 1 - BLOCKSIZE = 512 (32768 blocks - each block processes 512 elements) - Bandwidth = 12.9 GB / s
N = 16777216 - ILP = 2 - BLOCKSIZE = 256 (32768 blocks - each block processes 512 elements) - Bandwidth = 12.8 GB / s
N = 16777216 - ILP = 4 - BLOCKSIZE = 128 (32768 blocks - each block processes 512 elements) - Bandwidth = 12.8 GB / s
N = 16777216 - ILP = 8 - BLOCKSIZE = 64 (32768 blocks - each block processes 512 elements) - Bandwidth = 12.7 GB / s
N = 16777216 - ILP = 16 - BLOCKSIZE = 32 (32768 blocks - each block processes 512 elements) - Bandwidth = 12.6 GB / s