I have to program the Floyd algorithm using OpenCL, it works fine but only with n<268. when n>=268 i have an "Access violation reading location" when calling clEnqueueWriteBuffer (the buffer_distances one, in the loop).
Here is my code:
graphe is an adjacency matrix, and distances is the distances matrix
int n;
printf("enter n value: ");
scanf("%d", &n);
printf("\n");
int n2 = n * n;
int matSize = n2 * sizeof(int*);
int* graphe = malloc(sizeof(int) * n2);
int* distances = malloc(sizeof(int) * n2);
//mat[i,j] => mat[i*n + j]
if (graphe == NULL)
printf("malloc failed\n");
init_graphe(graphe, n);
copy(graphe, distances, n);
initialization of opencl variables:
char* programSource = load_kernel("kernel.cl");
cl_int status;
// STEP 1: Discover and initialize the platforms
cl_uint numPlatforms = 0;
cl_platform_id* platforms = NULL;
status = clGetPlatformIDs(0, NULL, &numPlatforms);
printf("Number of platforms = %d\n", numPlatforms);
platforms = (cl_platform_id*)malloc(numPlatforms * sizeof(cl_platform_id));
status = clGetPlatformIDs(numPlatforms, platforms, NULL);
char Name[1000];
clGetPlatformInfo(platforms[0], CL_PLATFORM_NAME, sizeof(Name), Name, NULL);
printf("Name of platform : %s\n", Name);
fflush(stdout);
// STEP 2: Discover and initialize the devices
cl_uint numDevices = 0;
cl_device_id* devices = NULL;
status = clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_ALL, 0, NULL, &numDevices);
printf("Number of devices = %d\n", (int)numDevices);
devices = (cl_device_id*)malloc(numDevices * sizeof(cl_device_id));
status = clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_ALL, numDevices, devices, NULL);
for (int i = 0; i < numDevices; i++) {
clGetDeviceInfo(devices[i], CL_DEVICE_NAME, sizeof(Name), Name, NULL);
printf("Name of device %d: %s\n\n", i, Name);
}
// STEP 3: Create a context
fflush(stdout);
cl_context context = NULL;
context = clCreateContext(NULL, numDevices, devices, NULL, NULL, &status);
// STEP 4: Create a command queue
fflush(stdout);
cl_command_queue cmdQueue;
cmdQueue = clCreateCommandQueue(context, devices[0], 0, &status);
// STEP 5: Create device buffers
fflush(stdout);
cl_mem buffer_graphe;
cl_mem buffer_n;
cl_mem buffer_distances;
cl_mem buffer_k;
buffer_graphe = clCreateBuffer(context, CL_MEM_READ_WRITE, matSize, NULL, &status);
buffer_n = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(int), NULL, &status);
buffer_distances = clCreateBuffer(context, CL_MEM_READ_WRITE, matSize, NULL, &status);
buffer_k = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(int), NULL, &status);
fflush(stdout);
// STEP 6: Create and compile the program
cl_program program = clCreateProgramWithSource(context, 1, (const char**)&programSource, NULL, &status);
printf("Compilation\n");
fflush(stdout);
status = clBuildProgram(program, numDevices, devices, NULL, NULL, NULL);
// STEP 8: Create the kernel
cl_kernel kernel = NULL;
fflush(stdout);
kernel = clCreateKernel(program, "floyd", &status);
size_t globalWorkSize[2] = { n, n };
size_t localWorkSize[3] = { 20,20 };
Execution of the kernel:
clock_t start = clock();
int k;
for (k = 0; k < n; k++) {
status = clEnqueueWriteBuffer(cmdQueue, buffer_graphe, CL_TRUE, 0, matSize, graphe, 0, NULL, NULL);
status = clEnqueueWriteBuffer(cmdQueue, buffer_n, CL_TRUE, 0, sizeof(int), &n, 0, NULL, NULL);
status = clEnqueueWriteBuffer(cmdQueue, buffer_distances, CL_TRUE, 0, matSize, distances, 0, NULL, NULL);
status = clEnqueueWriteBuffer(cmdQueue, buffer_k, CL_TRUE, 0, sizeof(int), &k, 0, NULL, NULL);
status = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&buffer_graphe);
status = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&buffer_n);
status = clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*)&buffer_distances);
status = clSetKernelArg(kernel, 3, sizeof(cl_mem), (void*)&buffer_k);
status = clEnqueueNDRangeKernel(cmdQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL);
clFinish(cmdQueue);
status = clEnqueueReadBuffer(cmdQueue, buffer_distances, CL_TRUE, 0, matSize, distances, 0, NULL, NULL);
clFinish(cmdQueue);
}
and the kernel:
void kernel floyd(global int* graphe, global int* n, global int* distances, global int* k)
{
int i = get_global_id(0);
int j = get_global_id(1);
int ij = i * (*n) + j;
int ik = i * (*n) + (*k);
int kj = (*k) * (*n) + j;
if (distances[ik] + distances[kj] < distances[ij]) {
distances[ij] = distances[ik] + distances[kj];
}
}