CUDA kernels don't seem to support overloading either.
It should be possible to overload kernels. Here is one possible approach, using overloading (and no templating):
$ cat t1648.cu
// Includes, system
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
#include <helper_cuda.h>
//! Device-side helper holding the per-element work common to the kernels.
//! @param d      input buffer, addressed as a row-major 2D array
//! @param width  number of elements per row of d
//! @param x      column of the element to fetch
//! @param y      row of the element to fetch
//! @return the element of d located at row y, column x
__device__ float my_common(float *d, int width, unsigned int x, unsigned int y){
    // 200 lines of common code...
    // Linear offset of (x, y); evaluated in unsigned int arithmetic.
    const unsigned int offset = y * width + x;
    return d[offset];
}
////////////////////////////////////////////////////////////////////////////////
// Kernels
////////////////////////////////////////////////////////////////////////////////
//! Write to a cuArray using surface writes
//! Each thread handles the single element (x, y) derived from its 2D block
//! and thread indices; threads that land outside width x height do nothing.
//! @param gIData        input data in global memory (row-major, width per row)
//! @param width         number of elements per row
//! @param height        number of rows
//! @param outputSurface surface object that receives the values
////////////////////////////////////////////////////////////////////////////////
__global__ void WriteKernel(float *gIData, int width, int height,
                            cudaSurfaceObject_t outputSurface)
{
    // calculate surface coordinates
    unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
    unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;

    // Bounds guard: when the grid does not tile the image exactly, the extra
    // threads must not read past gIData or trigger the trap boundary mode.
    if (width <= 0 || height <= 0 ||
        x >= static_cast<unsigned int>(width) ||
        y >= static_cast<unsigned int>(height))
    {
        return;
    }

    // read from global memory and write to cuarray (via surface object);
    // the surface x coordinate is expressed in bytes, hence the element size.
    surf2Dwrite(my_common(gIData, width, x, y),
                outputSurface,
                x * static_cast<unsigned int>(sizeof(float)),
                y,
                cudaBoundaryModeTrap);
}
//! Write to a linear global-memory buffer (overload of the surface variant)
//! Each thread handles the single element (x, y) derived from its 2D block
//! and thread indices; threads that land outside width x height do nothing.
//! @param gIData input data in global memory (row-major, width per row)
//! @param width  number of elements per row
//! @param height number of rows
//! @param out    output buffer in global memory, same layout as gIData
__global__ void WriteKernel(float *gIData, int width, int height,
                            float *out)
{
    // calculate coordinates
    unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
    unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;

    // Bounds guard: when the grid does not tile the image exactly, the extra
    // threads must not read or write outside the buffers.
    if (width <= 0 || height <= 0 ||
        x >= static_cast<unsigned int>(width) ||
        y >= static_cast<unsigned int>(height))
    {
        return;
    }

    // read from global memory and write to global memory; the store offset is
    // widened to size_t so it cannot wrap for large images.
    out[static_cast<size_t>(y) * width + x] = my_common(gIData, width, x, y);
}
////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
// Host driver: allocates an input buffer, a surface-backed cudaArray and a
// linear output buffer, launches both WriteKernel overloads on the same input,
// then releases every resource. Every CUDA call is checked with
// checkCudaErrors.
int main(int argc, char **argv)
{
    printf("starting...\n");
    unsigned width = 256;
    unsigned height = 256;
    unsigned int size = width * height * sizeof(float);

    // Allocate device memory for the kernels' input
    float *dData = NULL;
    checkCudaErrors(cudaMalloc((void **) &dData, size));
    // Zero the input so the kernels do not read uninitialized memory.
    checkCudaErrors(cudaMemset(dData, 0, size));

    // Linear output buffer for the float* overload
    float *out = NULL;
    checkCudaErrors(cudaMalloc((void **) &out, size));

    // Allocate the array backing the surface
    cudaChannelFormatDesc channelDesc =
        cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);
    cudaArray *cuArray;
    checkCudaErrors(cudaMallocArray(&cuArray,
                                    &channelDesc,
                                    width,
                                    height,
                                    cudaArraySurfaceLoadStore));

    // 8x8 blocks; width and height are exact multiples of the block size,
    // so the truncating division covers the whole image.
    dim3 dimBlock(8, 8, 1);
    dim3 dimGrid(width / dimBlock.x, height / dimBlock.y, 1);

    // Create a surface object bound to the array
    cudaSurfaceObject_t outputSurface;
    cudaResourceDesc surfRes;
    memset(&surfRes, 0, sizeof(cudaResourceDesc));
    surfRes.resType = cudaResourceTypeArray;
    surfRes.res.array.array = cuArray;
    checkCudaErrors(cudaCreateSurfaceObject(&outputSurface, &surfRes));

    // Overload resolution picks the kernel from the type of the last argument.
    WriteKernel<<<dimGrid, dimBlock>>>(dData, width, height, outputSurface);
    checkCudaErrors(cudaGetLastError());   // launch-configuration errors
    WriteKernel<<<dimGrid, dimBlock>>>(dData, width, height, out);
    checkCudaErrors(cudaGetLastError());

    // Surface execution-time errors before tearing the resources down.
    checkCudaErrors(cudaDeviceSynchronize());

    checkCudaErrors(cudaDestroySurfaceObject(outputSurface));
    checkCudaErrors(cudaFree(dData));
    checkCudaErrors(cudaFree(out));
    checkCudaErrors(cudaFreeArray(cuArray));
    return 0;
}
$ nvcc -I/usr/local/cuda/samples/common/inc t1648.cu -o t1648
$
The above example was hacked together rapidly from the simpleSurfaceWrite CUDA sample code. It is not intended to be functional or run "correctly". It is designed to show how overloading can be used from a code structure standpoint to address the stated objective.