In the following simple example, I'm allocating memory on the device using cudaMalloc3D
, and incrementing every voxel of my 3D data by one, which works fine, as long as I'm using symmetric 3D volumes.
Host Code looks like this:
int main(void)
{
typedef float PixelType;
// Set up test data
dim3 image_dimensions = dim3(32, 32, 32);
size_t num_elements = image_dimensions.x * image_dimensions.y * image_dimensions.z;
PixelType *image_data = new float[num_elements];
for(int i = 0; i < num_elements; ++i)
{
image_data[i] = float(i);
}
// Allocate 3D memory on the device
cudaExtent volumeSizeBytes = make_cudaExtent(sizeof(PixelType) * image_dimensions.x, image_dimensions.y, image_dimensions.z);
cudaPitchedPtr devicePitchedPointer;
cudaMalloc3D(&devicePitchedPointer, volumeSizeBytes);
cudaMemset3D(devicePitchedPointer, 1.0f, volumeSizeBytes);
// Copy image data from the host to the device
cudaMemcpy3DParms copy_params_host_to_device = {0};
copy_params_host_to_device.srcPtr = make_cudaPitchedPtr((void *)image_data, sizeof(PixelType) * image_dimensions.x, image_dimensions.y, image_dimensions.z);
copy_params_host_to_device.dstPtr = devicePitchedPointer;
copy_params_host_to_device.extent = volumeSizeBytes;
copy_params_host_to_device.kind = cudaMemcpyHostToDevice;
cudaMemcpy3D(©_params_host_to_device);
// Kernel Launch Configuration
dim3 threads_per_block = dim3(8, 8, 8);
dim3 blocks_per_grid = dim3((image_dimensions.x + threads_per_block.x - 1) / threads_per_block.x, (image_dimensions.y + threads_per_block.y - 1) / threads_per_block.y, (image_dimensions.z + threads_per_block.z - 1) / threads_per_block.z);
extract_patches_from_image_data<<<blocks_per_grid, threads_per_block>>>(devicePitchedPointer, image_dimensions);
cudaDeviceSynchronize();
// Copy image data back from the device to the host
cudaMemcpy3DParms copy_params_device_to_host = {0};
copy_params_device_to_host.srcPtr = devicePitchedPointer;
copy_params_device_to_host.dstPtr = make_cudaPitchedPtr((void *)image_data, sizeof(PixelType) * image_dimensions.x, image_dimensions.y, image_dimensions.z);
copy_params_device_to_host.extent = volumeSizeBytes;
copy_params_device_to_host.kind = cudaMemcpyDeviceToHost;
cudaMemcpy3D(©_params_device_to_host);
// Check image data
for(int i = 0; i < num_elements; ++i)
{
std::cout << "Element: " << i << " - " << image_data[i] << std::endl;
}
// Free Memory
cudaFree(devicePitchedPointer.ptr);
delete [] image_data;
}
The corresponding kernel for incrementing all values:
__global__ void extract_patches_from_image_data(cudaPitchedPtr devicePitchedPointer, dim3 image_dimensions)
{
// Index Calculation
int x = threadIdx.x + blockDim.x * blockIdx.x;
int y = threadIdx.y + blockDim.y * blockIdx.y;
int z = threadIdx.z + blockDim.z * blockIdx.z;
// Get attributes from device pitched pointer
char *devicePointer = (char *)devicePitchedPointer.ptr;
size_t pitch = devicePitchedPointer.pitch;
size_t slicePitch = pitch * image_dimensions.y;
// Loop over image data
if(z < image_dimensions.z)
{
char *current_slice_index = devicePointer + z * slicePitch;
if(y < image_dimensions.y)
{
// Get data array containing all elements from the current row
PixelType *current_row = (PixelType *)(current_slice_index + y * pitch);
if(x < image_dimensions.x)
{
current_row[x] = current_row[x] + 1.0f;
// Get values of all all neighbors
}
}
}
}
As long as I keep my image_dimensions
symmetric, e.g. (32, 32, 32), everything works fine. When I'm trying to use (32, 32, 33), it works fine until voxel 33759, the following values remain the same. My question now is how I should adapt my code in order to use non-symmetric data.