I want to use CUDA 5.0 linking to write re-usable CUDA objects. i've set up this simple test of but my kernel fails silently (runs without error or exception and outputs junk).
My simple test (below) allocates an array of integers to CUDA device memory. The CUDA kernel should populate the array with sequential entries (0,1,2,....,9). The device array is copied to CPU memory and output to the console.
Currently, this code outputs "0,0,0,0,0,0,0,0,0," instead of the desired "0,1,2,3,4,5,6,7,8,9,". It is compiled using VS2010 and CUDA 5.0 (with compute_35 and sm_35 set). Running on Win7-64-bit with a GeForce 580.
In Test.h:
class Test
{
public:
Test();
~Test();
void Run();
private:
int* cuArray;
};
In Test.cu:
#include <stdio.h>
#include <assert.h>
#include <cuda_runtime.h>
#include "Test.h"
#define ARRAY_LEN 10
__global__ void kernel(int *p)
{
int elemID = blockIdx.x * blockDim.x + threadIdx.x;
p[elemID] = elemID;
}
Test::Test()
{
cudaMalloc(&cuArray, ARRAY_LEN * sizeof(int));
}
Test::~Test()
{
cudaFree(cuArray);
}
void Test::Run()
{
kernel<<<1,ARRAY_LEN>>>(cuArray);
// Copy the array contents to CPU-accessible memory
int cpuArray[ARRAY_LEN];
cudaMemcpy(static_cast<void*>(cpuArray), static_cast<void*>(cuArray), ARRAY_LEN * sizeof(int), cudaMemcpyDeviceToHost);
// Write the array contents to console
for (int i = 0; i < ARRAY_LEN; ++i)
printf("%d,", cpuArray[i]);
printf("\n");
}
In main.cpp:
#include <iostream>
#include "Test.h"
int main()
{
Test t;
t.Run();
}
I've experimented with the DECLs (__device__ __host__
) as suggested by @harrism but to no effect.
Can anyone suggest how to make his work? (The code works when it isn't inside a class.)