I've the following algorithm that runs parallel BFS using CUDA 7.5. The function takes in an array of edges and an array of vertices. An edge is defined as
typedef struct Edge
{
int first;
int second;
}Edge;
The vertices array is initialized to -1 for all vertices except the starting vertex which is 0. Thus we have something like
1 2 3 4 5 6.....1024
-1-1-1-1 0-1.....-1
In this case the starting vertex is 4 Assuming a sparse graph the edge list would have data in the following manner(the data would be in the edge stuff, but I've provided the integer representation)
1 2 3 4 5 .......2048
3 17 12 1 3........2010
9 34 20 9 17.......196
The BFS is supposed to run parallely with 2048 threads with any thread having 0 as the first/second index writing to the relevant index in the vertices array 1 and modify a bool to 1. This is the BFS code.
__global__ void bfs(Edge* edges, int* vertices, int current_depth, int* modified){
int e = blockDim.x * blockIdx.x + threadIdx.x;
int vfirst = edges[e].first;
if (vfirst > 1023) {printf("oops %d:%d\n", e, vfirst); return;}
int dfirst = vertices[vfirst];
int vsecond = edges[e].second;
if (vsecond > 1023) {printf("oops %d:%d\n", e, vsecond); return;}
int dsecond = vertices[vsecond];
if((dfirst == current_depth) && (dsecond == -1)){
vertices[vsecond] = current_depth;
printf("e:%d depth:%d\n", e, current_depth);
__syncthreads();
*modified = 1;
printf("%d\n", *modified);
}else if((dsecond == current_depth) && (dfirst == -1)){
vertices[vfirst] = current_depth;
printf("e:%d depth:%d\n", e, current_depth);
__syncthreads();
*modified = 1;
printf("%d\n", *modified);
}
}
This BFS kernel is being called repeatedly by the main code incrementing the value of current depth everytime it is called. This is the relevant section of the calling code.
begin = clock();
do{
h_modified = 0;
//printf("Entered while loop\n");
err = cudaMemcpy(d_modified, &h_modified, sizeof(int), cudaMemcpyHostToDevice);
if (err != cudaSuccess)
{
fprintf(stderr, "Failed to copy h_done to device(error code %s)!\n", cudaGetErrorString(err));
exit(EXIT_FAILURE);
}
printf("CUDA kernel launching with %d blocks of %d threads\n", edgeBlocks, threadsPerBlock);
bfs<<<edgeBlocks, threadsPerBlock>>>(d_edges, d_vertices, current_depth, d_modified);
cudaThreadSynchronize();
err = cudaGetLastError();
if (err != cudaSuccess)
{
fprintf(stderr, "Failed to launch bfs kernel (error code %s)!\n", cudaGetErrorString(err));
exit(EXIT_FAILURE);
}
//printf("Second kernel launch finished\n");
err = cudaMemcpy(&h_modified, d_modified, sizeof(int), cudaMemcpyDeviceToHost);
printf("%d\n", h_modified);
if (err != cudaSuccess)
{
fprintf(stderr, "Failed to copy d_done to host(error code %s)!\n", cudaGetErrorString(err));
exit(EXIT_FAILURE);
}
printf("BFS run for level %d\n", current_depth);
current_depth++;
}while(h_modified != 0);
end = clock();
time_spent = (double)(end - begin) / CLOCKS_PER_SEC;
printf("Time taken: %f\n", time_spent);
The memcpy and mallocs have all been correctly and checked up. My problem is that in the first run of BFS the modified variable never gets modified to 1 and thus the function never gets called a second time. I've thoroughly checked my logic but just can't seem to put my finger on the problem. This is my first complete project in CUDA thus any help would be appreciated. For those who want a complete verifiable example please use the link,