I am trying to understand the usage and benefit of the “cudaOccupancyMaxActiveBlocksPerMultiprocessor
” method.
I am using a slightly modified version of the sample program present on NVIDIA developer forum. Basically, I am asking the user to provide the size of the array.
My GPU: NVIDIA GeForce GTX 1070
QUESTIONS:
- The occupancy values returned by the program are very random. Many times, the program returns different occupancy values for the same input array size, is there anything wrong in the program?
- As shown in the screenshot, if user passed the array size=512 then, the occupancy value is “13” whereas if I set N=512 directly in the program then the occupancy value is “47”. Why?
- Why does user provided array size=1024 has occupancy value =0?
SAMPLE CODE:
Source.cpp
#include "kernel_header.cuh"
#include <algorithm>
#include <iostream>
using namespace std;
int main(int argc, char* argv[])
{
int N;
int userSize = 0;
//ask size to user
cout << "\n\nType the size of 1D Array: " << endl;
cin >> userSize;
N = userSize>0? userSize : 1024; //<<<<<<<<<<<<<<<-------PROBLEM
int* array = (int*)calloc(N, sizeof(int));
for (int i = 0; i < N; i++)
{
array[i] = i + 1;
//cout << "i = " << i << " is " << array[i]<<endl;
}
launchMyKernel(array, N);
free(array);
return 0;
}
kernel_header.cuh
#ifndef KERNELHEADER
#define KERNELHEADER
void launchMyKernel(int* array, int arrayCount);
#endif
kernel.cu
#include "stdio.h"
#include "cuda_runtime.h"
__global__ void MyKernel(int* array, int arrayCount)
{
int idx = threadIdx.x + blockIdx.x * blockDim.x;
if (idx < arrayCount)
{
array[idx] *= array[idx];
}
}
void launchMyKernel(int* array, int arrayCount)
{
int blockSize; // The launch configurator returned block size
int minGridSize; // The minimum grid size needed to achieve the
// maximum occupancy for a full device launch
int gridSize; // The actual grid size needed, based on input size
cudaOccupancyMaxPotentialBlockSize(&minGridSize, &blockSize,MyKernel, 0, 0);
// Round up according to array size
gridSize = (arrayCount + blockSize - 1) / blockSize;
MyKernel << < gridSize, blockSize >> > (array, arrayCount);
cudaDeviceSynchronize();
// calculate theoretical occupancy
int maxActiveBlocks;
cudaOccupancyMaxActiveBlocksPerMultiprocessor(&maxActiveBlocks,
MyKernel, blockSize,
0);
int device;
cudaDeviceProp props;
cudaGetDevice(&device);
cudaGetDeviceProperties(&props, device);
float occupancy = (maxActiveBlocks * blockSize / props.warpSize) /
(float)(props.maxThreadsPerMultiProcessor /
props.warpSize);
printf("\n\nMax. Active blocks found: %d\nOur Kernel block size decided: %d\nWarp Size: %d\nNumber of threads per SM: %d\n\n\n\n", maxActiveBlocks
, blockSize,
props.warpSize,
props.maxThreadsPerMultiProcessor);
printf("Launched blocks of size %d. Theoretical occupancy: %f\n",
blockSize, occupancy);
}