I have an array matrix
with values of 0, and I want to increment some of it's elements by 1. The indices of matrix
which I want to increment are stored in array indices
. I need to increment some elements several times, thus I'm trying to use an array of mutexes for each of elements in matrix
. But when I launch my code, the program hangs and I get deadlock.
I'm stuck with this issue. What I ultimately want to do is to draw a continuous brush stroke that overlaps itself using CUDA, thus I need to access the same pixels of canvas image in parallel.
Here is my code:
#include <iostream>
using namespace std;
__global__ void add_kernel(int* matrix, int* indices, int* d_semaphores, int nof_indices)
{
int index = threadIdx.x + blockIdx.x * blockDim.x; // thread id
int ind = indices[index]; // indices of target array A to increment
if (index < nof_indices) {
while (atomicCAS(&d_semaphores[ind], 0, 1) != 0);
matrix[ind] += 1;
atomicExch(&d_semaphores[ind], 0);
__syncthreads();
}
}
int main()
{
int nof_indices = 6; // length of an array B
int indices[6] = { 0,1,2,3,4,1 }; // array B; stores indices of an array A which to increment
int canvas[10]; // array A
int semaphores[10]; // mutex array with individual mutexes for each of array A elements
int* d_canvas;
int* d_indices;
int* d_semaphores;
memset(canvas, 0, sizeof(canvas)); // set all array A elements to 0
memset(semaphores, 0, sizeof(semaphores)); // set all array A elements to 0
cudaMalloc(&d_canvas, sizeof(canvas));
cudaMalloc(&d_semaphores, sizeof(semaphores));
cudaMalloc(&d_indices, sizeof(indices));
cudaMemcpy(d_canvas, &canvas, sizeof(canvas), cudaMemcpyHostToDevice);
cudaMemcpy(d_indices, &indices, sizeof(indices), cudaMemcpyHostToDevice);
cudaMemcpy(d_semaphores, &semaphores, sizeof(semaphores), cudaMemcpyHostToDevice);
add_kernel << <1, 6 >> > (d_canvas, d_indices, d_semaphores, nof_indices);
cudaMemcpy(&canvas, d_canvas, sizeof(canvas), cudaMemcpyDeviceToHost);
for (int it = 0; it < nof_indices; it++) {
cout << canvas[it] << endl;
}
cudaFree(d_canvas);
cudaFree(d_indices);
cudaFree(d_semaphores);
return 0;
}
In this example the resulting array matrix
should look like this : {1, 2 ,1 ,1,1,0}
, but I only get it when I run kernel with dimensions << 6,1 >>
.
I'm using CUDA 12.1, Geforce RTX 3060
Thank you
( It only works when I set thread per block size to 1, but it's not what I want )