I have a kernel running very well on Intel HD graphics card. But, when I want to run the kernel on my GeForce 960 it gives the CL_OUT_OF_RESOURCES error.
I have tried for different local sizes and made sure to not go beyond the array indices, but still have no clue why this error is happening. Do you know why my code runs fine on Intel and doesn't work on NVIDIA?
One weird thing that is happening in my code is that I have a 13 itrations of similar operations. For performance purposes, I have repeated the same operations for 13 times and avoided writing a loop just to save some additional operations that loops have. The code works on NVIDIA when I reach to the 11th operation. But, when I include the 12th operation in the code it gives the above error and the 11th and 12th operations are similar! Any ideas why such thing is happening?
Here is the kernel:
float2 projectCube(float3 axis, float3 vertex){
float voxelSize = 0.5f;
float2 projection = (float2)(0.0f, 0.0f);
float temp;
//1
temp = axis.x;
if (projection.x > temp){ projection.x = temp; }
else if (projection.y < temp){ projection.y = temp; }
//2
temp = axis.x + axis.y;
if (projection.x > temp){ projection.x = temp; }
else if (projection.y < temp){ projection.y = temp; }
//3
temp = axis.y;
if (projection.x > temp){ projection.x = temp; }
else if (projection.y < temp){ projection.y = temp; }
//4
temp = axis.z;
if (projection.x > temp){ projection.x = temp; }
else if (projection.y < temp){ projection.y = temp; }
//5
temp = axis.x + axis.z;
if (projection.x > temp){ projection.x = temp; }
else if (projection.y < temp){ projection.y = temp; }
//6
temp = axis.y + axis.z;
if (projection.x > temp){ projection.x = temp; }
else if (projection.y < temp){ projection.y = temp; }
//7
temp = axis.x + axis.y + axis.z;
if (projection.x > temp){ projection.x = temp; }
else if (projection.y < temp){ projection.y = temp; }
float product = dot(axis, vertex);
projection.x = voxelSize * projection.x + product;
projection.y = voxelSize * projection.y + product;
return projection;
}
float2 projectTriangle(float3 axis, float3 v0, float3 v1, float3 v2){
float2 projection;
projection.x = dot(axis, v0);
projection.y = projection.x;
float temp = dot(axis, v1);
if(projection.x > temp){
projection.x = temp;
}
else if(projection.y < temp){
projection.y = temp;
}
temp = dot(axis, v2);
if (projection.x > temp){
projection.x = temp;
}
else if (projection.y < temp){
projection.y = temp;
}
return projection;
}
float tester(float3 axis, float3 voxel, float3 v0, float3 v1, float3 v2){
float2 voxelProjection = projectCube(axis, voxel);
float2 faceProjection = projectTriangle(axis, v0, v1, v2);
float minProjection = fmin(voxelProjection.x, faceProjection.x);
float maxProjection = fmax(voxelProjection.y, faceProjection.y);
float testResult = maxProjection - minProjection - voxelProjection.y + voxelProjection.x
- faceProjection.y + faceProjection.x;
return testResult;
}
__kernel void voxelizer(size_t global_size,
float h_voxelSize,
__global float* h_minBoundsGrid,
__global int *h_dimGrid,
__global float* coords,
__global int* density)
{
//printf("local size is: %d\n", get_num_groups(0));
int i = get_global_id(0) * 9;
if (i <= global_size * 9){
float voxelSize = h_voxelSize;
float3 minBoundsGrid;
minBoundsGrid.x = h_minBoundsGrid[0];
minBoundsGrid.y = h_minBoundsGrid[1];
minBoundsGrid.z = h_minBoundsGrid[2];
int3 dimGrid;
dimGrid.x = h_dimGrid[0];
dimGrid.y = h_dimGrid[1];
dimGrid.z = h_dimGrid[2];
if ( i %9 == 0){
/*Triangle vertices*/
float3 v0;
v0 = (float3)(coords[i], coords[i + 1], coords[i + 2]);
float3 v1;
v1 = (float3)(coords[i + 3], coords[i + 4], coords[i + 5]);
float3 v2;
v2 = (float3)(coords[i + 6], coords[i + 7], coords[i + 8]);
//printf("i = %d. v0: %f, %f, %f\n", i, v0.x, v0.y, v0.z);
//printf("i = %d. v1: %f, %f, %f\n", i, v1.x, v1.y, v1.z);
//printf("i = %d. v2: %f, %f, %f\n", i, v2.x, v2.y, v2.z);
/*Normal vectors of the each voxel*/
float3 e0;
e0 = (float3)(0.5f, 0.0f, 0.0f);
float3 e1;
e1 = (float3)(0.0f, 0.5f, 0.0f);
float3 e2;
e2 = (float3)(0.0f, 0.0f, 0.5f);
/*Edges of a traingle*/
float3 f0;
f0 = v1 - v0;
float3 f1;
f1 = v2 - v1;
float3 f2;
f2 = v0 - v2;
float3 minLocalGrid;
minLocalGrid.x = fmin(v0.x, fmin(v1.x, v2.x));
minLocalGrid.y = fmin(v0.y, fmin(v1.y, v2.y));
minLocalGrid.z = fmin(v0.z, fmin(v1.z, v2.z));
minLocalGrid.x = voxelSize * floor(minLocalGrid.x / voxelSize);
minLocalGrid.y = voxelSize * floor(minLocalGrid.y / voxelSize);
minLocalGrid.z = voxelSize * floor(minLocalGrid.z / voxelSize);
//printf("i = %d. minLocalGrid = %f, %f, %f.\n", i, minLocalGrid.x, minLocalGrid.y, minLocalGrid.z);
float3 maxLocalGrid;
maxLocalGrid.x = fmax(v0.x, fmax(v1.x, v2.x));
maxLocalGrid.y = fmax(v0.y, fmax(v1.y, v2.y));
maxLocalGrid.z = fmax(v0.z, fmax(v1.z, v2.z));
maxLocalGrid.x = voxelSize * ceil(maxLocalGrid.x / voxelSize);
maxLocalGrid.y = voxelSize * ceil(maxLocalGrid.y / voxelSize);
maxLocalGrid.z = voxelSize * ceil(maxLocalGrid.z / voxelSize);
if (maxLocalGrid.x == minLocalGrid.x){ maxLocalGrid.x += voxelSize; }
if (maxLocalGrid.y == minLocalGrid.y){ maxLocalGrid.y += voxelSize; }
if (maxLocalGrid.z == minLocalGrid.z){ maxLocalGrid.z += voxelSize; }
//printf("i = %d. maxLocalGrid = %f, %f, %f.\n", i, maxLocalGrid.x, maxLocalGrid.y, maxLocalGrid.z);
//printf("i = %d\n v0 = %f, %f, %f\n v1 = %f, %f, %f\n v2 = %f, %f, %f\n minLocalGrid = %f, %f, %f\n===============\n",
// i, v0.x, v0.y, v0.z, v1.x, v1.y, v1.z, v2.x, v2.y, v2.z, maxLocalGrid.x, maxLocalGrid.y, maxLocalGrid.z);
float j = minLocalGrid.z;
while(j < maxLocalGrid.z){
float k = minLocalGrid.y;
while(k < maxLocalGrid.y){
float l = minLocalGrid.x;
while (l < maxLocalGrid.x){
float3 firstVertexOfVoxel = (float3)(l, k, j);
//printf("l,k,j: %f, %f, %f\n", l, k, j);
float3 globalCoordOffset = (firstVertexOfVoxel - minBoundsGrid) / voxelSize;
int3 globalDimOffset = convert_int3_rtz(globalCoordOffset);
//printf("i = %d. globalCoordOffset: %f, %f, %f\n", i, globalCoordOffset.x, globalCoordOffset.y, globalCoordOffset.z);
//printf("i = %d. globalDimOffset: %d, %d, %d\n", i, globalDimOffset.x, globalDimOffset.y, globalDimOffset.z);
int voxelIndexGlobalGrid = globalDimOffset.x + dimGrid.x * (globalDimOffset.y +
dimGrid.y * globalDimOffset.z);
//printf("i = %d. voxelIndexGlobalGrid = %d\n", i, voxelIndexGlobalGrid);
if (density[voxelIndexGlobalGrid] != 1){
/*The famous 13-axes test*/
float3 axis;
float testResult = 0;
int overlapCount = 0;
//1
testResult = tester(e0, firstVertexOfVoxel, v0, v1, v2);
if (testResult <= 0){
overlapCount++;
}
//2
testResult = tester(e1, firstVertexOfVoxel, v0, v1, v2);
if (testResult <= 0){
overlapCount++;
}
//3
testResult = tester(e2, firstVertexOfVoxel, v0, v1, v2);
if (testResult <= 0){
overlapCount++;
}
//4
//axis = ;
testResult = tester(cross(-f2, f0), firstVertexOfVoxel, v0, v1, v2);
if (testResult <= 0){
overlapCount++;
}
//5
/*axis = cross(e0, f0);*/
testResult = tester(cross(e0, f0), firstVertexOfVoxel, v0, v1, v2);
if (testResult <= 0){
overlapCount++;
}
//6
//axis = cross(e0, f0);
testResult = tester(cross(e0, f1), firstVertexOfVoxel, v0, v1, v2);
if (testResult <= 0){
overlapCount++;
}
//7
//axis = cross(e0, f0);
testResult = tester(cross(e0, f2), firstVertexOfVoxel, v0, v1, v2);
if (testResult <= 0){
overlapCount++;
}
//8
//axis = cross(e1, f0);
testResult = tester(cross(e1, f0), firstVertexOfVoxel, v0, v1, v2);
if (testResult <= 0){
overlapCount++;
}
//9
//axis = cross(e1, f1);
testResult = tester(cross(e1, f1), firstVertexOfVoxel, v0, v1, v2);
if (testResult <= 0){
overlapCount++;
}
//10
//axis = cross(e1, f2);
testResult = tester(cross(e1, f2), firstVertexOfVoxel, v0, v1, v2);
if (testResult <= 0){
overlapCount++;
}
//11
//axis = cross(e2, f0);
testResult = tester(cross(e2, f0), firstVertexOfVoxel, v0, v1, v2);
if (testResult <= 0){
overlapCount++;
}
//12
//axis = cross(e2, f1);
testResult = tester(cross(e2, f1), firstVertexOfVoxel, v0, v1, v2);
if (testResult <= 0){
overlapCount++;
}
//13
//axis = cross(e2, f2);
testResult = tester(cross(e2, f2), firstVertexOfVoxel, v0, v1, v2);
if (testResult <= 0){
overlapCount++;
}
if (overlapCount == 13){
density[voxelIndexGlobalGrid] = 1;
}
}
l = l + voxelSize;
}// while for l
k = k + voxelSize;
}// while for k
j = j + voxelSize;
}//while for j
//printf("Here are the max of the %d-th face: %f, %f, %f\n", i / 9, maxLocalGrid.x, maxLocalGrid.y, maxLocalGrid.z);
//printf("Here are the coordinates of the %d-th face: %f, %f, %f\n", i / 9, e1.x, e1.y, e1.z);
//printf("Here are the coordinates of the %d-th face: %f, %f, %f\n", i / 9, e2.x, e2.y, e2.z);
//printf("\n==================KERNEL COMPUTED==================\n");
//barrier(CLK_LOCAL_MEM_FENCE);
}
}
}
And this is the c-code:
#define DEVICE_SELECTOR 1 //0 for Intel and 1 for Nvidia in my computer
#define _CRT_SECURE_NO_WARNINGS
#define KERNEL_FILE "..\\voxelizerKernel.cl"
#define WORK_DIM 1
#define VOXEL_SIZE 0.5f
#define HALF_VOXEL_SIZE VOXEL_SIZE/2.0f;
//C header files
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <algorithm>
//OpenCL header files
#ifdef MAC
#include <OpenCL/cl.h>
#else
#include <CL/cl.h>
#endif
cl_device_id create_device() {
cl_platform_id *platform;
cl_device_id dev;
cl_uint num_platform;
int err;
/* Identify a platform */
err = clGetPlatformIDs(0, NULL, &num_platform);
if (err < 0) {
printf("Error code: %d. Couldn't identify a platform\n", err);
exit(1);
}
platform = (cl_platform_id*)malloc(sizeof(cl_platform_id)*num_platform);
clGetPlatformIDs(num_platform, platform, NULL);
/* Access a device */
err = clGetDeviceIDs(platform[DEVICE_SELECTOR], CL_DEVICE_TYPE_GPU, 1, &dev, NULL);
if (err < 0) {
printf("Error code: %d. Couldn't access any devices\n", err);
exit(1);
}
return dev;
}
cl_program build_program(cl_context ctx, cl_device_id dev, const char* filename) {
cl_program program;
FILE *program_handle;
char *program_buffer, *program_log;
size_t program_size, log_size;
int err;
/* Read program file and place content into buffer */
program_handle = fopen(filename, "r");
if (program_handle == NULL) {
printf("Couldn't find the program file\n");
exit(1);
}
fseek(program_handle, 0, SEEK_END);
program_size = ftell(program_handle);
rewind(program_handle);
program_buffer = (char*)malloc(program_size + 1);
program_buffer[program_size] = '\0';
fread(program_buffer, sizeof(char), program_size, program_handle);
fclose(program_handle);
/* Create program from file */
program = clCreateProgramWithSource(ctx, 1,
(const char**)&program_buffer, &program_size, &err);
if (err < 0) {
printf("Error code: %d. Couldn't create the program\n", err);
exit(1);
}
free(program_buffer);
/* Build program */
err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
if (err < 0) {
/* Find size of log and print to std output */
clGetProgramBuildInfo(program, dev, CL_PROGRAM_BUILD_LOG,
0, NULL, &log_size);
program_log = (char*)malloc(log_size + 1);
program_log[log_size] = '\0';
clGetProgramBuildInfo(program, dev, CL_PROGRAM_BUILD_LOG,
log_size + 1, program_log, NULL);
printf("%s\n", program_log);
free(program_log);
exit(1);
}
return program;
}
void print_device_info(cl_device_id dev){
cl_ulong glob_mem_size, local_mem_size;
cl_uint clock_freq, num_core, work_item_dim, time_res;
size_t local_size, work_item_size[3];
char dev_vendor[40], dev_name[400], driver_version[40], device_version[40];
clGetDeviceInfo(dev, CL_DEVICE_VENDOR, sizeof(dev_vendor), &dev_vendor, NULL);
clGetDeviceInfo(dev, CL_DEVICE_NAME, sizeof(dev_name), &dev_name, NULL);
clGetDeviceInfo(dev, CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(glob_mem_size), &glob_mem_size, NULL);
clGetDeviceInfo(dev, CL_DEVICE_LOCAL_MEM_SIZE, sizeof(local_mem_size), &local_mem_size, NULL);
clGetDeviceInfo(dev, CL_DRIVER_VERSION, sizeof(driver_version), &driver_version, NULL);
clGetDeviceInfo(dev, CL_DEVICE_VERSION, sizeof(device_version), &device_version, NULL);
clGetDeviceInfo(dev, CL_DEVICE_MAX_CLOCK_FREQUENCY, sizeof(clock_freq), &clock_freq, NULL);
clGetDeviceInfo(dev, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(num_core), &num_core, NULL);
clGetDeviceInfo(dev, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(local_size), &local_size, NULL);
clGetDeviceInfo(dev, CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(work_item_size), &work_item_size, NULL);
clGetDeviceInfo(dev, CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, sizeof(work_item_dim), &work_item_dim, NULL);
clGetDeviceInfo(dev, CL_DEVICE_PROFILING_TIMER_RESOLUTION, sizeof(time_res), &time_res, NULL);
printf("==========================================================\n");
printf("Device Sepc without consideration of kernels:\n");
printf("CL_DEVICE_VENDOR: %s\n", dev_vendor);
printf("CL_DEVICE_NAME: %s\n", dev_name);
printf("CL_DEVICE_GLOBAL_MEM_SIZE: %I64u GB\n", glob_mem_size / 1073741824);
printf("CL_DEVICE_LOCAL_MEM_SIZE: %I64u KB\n", local_mem_size / 1024);
printf("CL_DRIVER_VERSION: %s\n", driver_version);
printf("CL_DEVICE_VERSION: %s\n", device_version);
printf("CL_DEVICE_MAX_CLOCK_FREQUENCY: %I32u MHz\n", clock_freq);
printf("CL_DEVICE_MAX_COMPUTE_UNITS: %I32u\n", num_core);
printf("CL_DEVICE_MAX_WORK_GROUP_SIZE %u\n", local_size);
printf("CL_DEVICE_MAX_WORK_ITEM_SIZES: {%I32u, %I32u, %I32u}\n", work_item_size[0], work_item_size[1], work_item_size[2]);
printf("CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS: %I32u\n", work_item_dim);
printf("CL_DEVICE_PROFILING_TIMER_RESOLUTION: %I32u ns\n", time_res);
printf("==========================================================\n");
}
int main()
{
/*OpenCL variables*/
cl_int i, j, err, num_groups;
size_t local_size, max_local_size, global_size, processed_global_size;
cl_context context;
cl_command_queue queue;
cl_program program;
cl_device_id device;
cl_kernel voxelization_kernel, reduction_kernel, reduction_complete_kernel;
cl_mem coords_buffer, density_buffer, dimGrid_buffer, h_minBoundsGrid_buffer, fullVxelsCount_buffer, group_sums_buffer;
void *density_mapped_memory;
cl_event prof_event;
cl_ulong time_start, time_end, total_time;
float h_voxelSize = VOXEL_SIZE;
float fullVxelsCount = 0;
/*Read mesh data*/
float coords[54] =
{ 0.300500,
1.300000,
0.000500,
1.200500,
1.600000,
0.000500,
1.600500,
0.600000,
0.000500,
0.300500,
1.300000,
0.000500,
0.500500,
1.900000,
0.000500,
1.200500,
1.600000,
0.000500,
0.300500,
1.300000,
0.000500,
1.600500,
0.600000,
0.000500,
0.100500,
0.700000,
0.000500,
0.100500,
0.700000,
0.000500,
1.600500,
0.600000,
0.000500,
0.000500,
0.200000,
0.000500,
0.000500,
0.200000,
0.000500,
1.600500,
0.600000,
0.000500,
1.600500,
0.100000,
0.000500,
1.200500,
1.600000,
0.000500,
1.600500,
1.300000,
0.000500,
1.600500,
0.600000,
0.000500 };
/*Get the voxel count*/
float boundsGrid[6] = {0,2,0,2,0,0.5};
int dimGrid[3] = {
(boundsGrid[1] - boundsGrid[0]) / VOXEL_SIZE,
(boundsGrid[3] - boundsGrid[2]) / VOXEL_SIZE,
(boundsGrid[5] - boundsGrid[4]) / VOXEL_SIZE
};
if (dimGrid[0] == 0) dimGrid[0] = 1;
if (dimGrid[1] == 0) dimGrid[1] = 1;
if (dimGrid[2] == 0) dimGrid[2] = 1;
float h_minBoundsGrid[3];
h_minBoundsGrid[0] = boundsGrid[0];
h_minBoundsGrid[1] = boundsGrid[2];
h_minBoundsGrid[2] = boundsGrid[4];
int voxelCounts = dimGrid[0] * dimGrid[1] * dimGrid[2];
/*Prepare kernel output : build an array for storing voxles' density info*/
int *density = (int*)malloc(sizeof(int)*voxelCounts);
for (int i = 0; i < voxelCounts; i++){
density[i] = 0;
}
/*OpenCL essentials*/
device = create_device();
clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(max_local_size), &max_local_size, NULL);
//print_device_info(device);
context = clCreateContext(NULL, 1, &device, NULL, NULL, &err);
if (err < 0) {
printf("Error code: %d. Couldn't create a context\n", err);
exit(1);
}
program = build_program(context, device, KERNEL_FILE);
queue = clCreateCommandQueue(context, device,
CL_QUEUE_PROFILING_ENABLE, &err);
if (err < 0) {
printf("Error code: %d. Couldn't create a command queue\n", err);
exit(1);
};
voxelization_kernel = clCreateKernel(program, "voxelizer", &err);
if (err < 0) {
printf("Error code: %d. Couldn't create a kernel\n", err);
exit(1);
};
int numberOfFaces = 6;
global_size = numberOfFaces;
local_size = max_local_size;
if (global_size % local_size != 0){
processed_global_size = (global_size / local_size + 1) * local_size;
//int padding = processed_global_size - global_size;
//int *working_data = (int*)malloc((voxelCounts + padding)*sizeof(int));
//memcpy(working_data, density, voxelCounts);
//memset(working_data + voxelCounts, 0.0, padding);
}
else{
processed_global_size = global_size;
}
/* Create host-device data exchange interface*/
dimGrid_buffer = clCreateBuffer(context, CL_MEM_READ_ONLY |
CL_MEM_COPY_HOST_PTR, sizeof(float)* 3, dimGrid, &err);
h_minBoundsGrid_buffer = clCreateBuffer(context, CL_MEM_READ_ONLY |
CL_MEM_COPY_HOST_PTR, sizeof(float)* 3, h_minBoundsGrid, &err);
coords_buffer = clCreateBuffer(context, CL_MEM_READ_ONLY |
CL_MEM_COPY_HOST_PTR, sizeof(float) * 54, coords, &err);
density_buffer = clCreateBuffer(context, CL_MEM_WRITE_ONLY |
CL_MEM_COPY_HOST_PTR, sizeof(int) * voxelCounts, density, &err);
if (err < 0) {
printf("Error code: %d. Couldn't create a buffer\n", err);
exit(1);
};
err = clSetKernelArg(voxelization_kernel, 0, sizeof(global_size), &global_size);
err |= clSetKernelArg(voxelization_kernel, 1, sizeof(h_voxelSize), &h_voxelSize);
err |= clSetKernelArg(voxelization_kernel, 2, sizeof(cl_mem), &h_minBoundsGrid_buffer);
err |= clSetKernelArg(voxelization_kernel, 3, sizeof(cl_mem), &dimGrid_buffer);
err |= clSetKernelArg(voxelization_kernel, 4, sizeof(cl_mem), &coords_buffer);
err |= clSetKernelArg(voxelization_kernel, 5, sizeof(cl_mem), &density_buffer);
if (err < 0) {
printf("Error code: %d. Couldn't create an argument for voxelization_kernel\n", err);
exit(1);
}
/* Do the voxelization magic */
err = clEnqueueNDRangeKernel(queue, voxelization_kernel, 1, NULL, &processed_global_size,
&local_size, 0, NULL, &prof_event);
if (err < 0) {
printf("Error code: %d. Couldn't enqueue the voxelization_kernel\n", err);
exit(1);
}
/* Read the results */
density_mapped_memory = clEnqueueMapBuffer(queue, density_buffer, CL_TRUE,
CL_MAP_READ, 0, sizeof(density), 0, NULL, NULL, &err);
if (err < 0) {
printf("Error code : %d. Couldn't map the buffer to host memory\n", err);
exit(1);
}
memcpy(density, density_mapped_memory, sizeof(density)* voxelCounts);
err = clEnqueueUnmapMemObject(queue, density_buffer, density_mapped_memory,
0, NULL, NULL);
if (err < 0) {
printf("Error code: %d. Couldn't unmap the density_buffer\n", err);
exit(1);
}
for (int i = 0; i < voxelCounts; i++){
printf("%d\n", density[i]);
}
/*Clean up*/
clReleaseKernel(voxelization_kernel);
clReleaseMemObject(dimGrid_buffer);
clReleaseMemObject(h_minBoundsGrid_buffer);
clReleaseMemObject(coords_buffer);
clReleaseMemObject(density_buffer);
clReleaseCommandQueue(queue);
clReleaseProgram(program);
clReleaseContext(context);
return 0;
}