I just started working with CUDA and tried writing some C code. The checkStr() function is getting an array of strings, the length of that array, quantity of threads and a solution/passphrase. The Threads executing this method then check if a string from that arrray matches the solution/passphrase. Since there might be more strings than threads, one thread might check for multiple strings.
__device__ bool device_strings_equal(const char* a, const char* b) {
int i = 0;
while (a[i] == b[i]) {
if (a[i] == '\0') {
return true; // matches passkey
}
i++;
}
return false; // doesnt match
}
__global__ void checkStr(char** word, int list_length, char* solution, int* results) {
int ThreadID = blockIdx.x * blockDim.x + threadIdx.x;
int strings_per_thread = (list_length + blockDim.x - 1) / blockDim.x;
// define what strings the threads should use
int start_indx = ThreadID * strings_per_thread;
int end_indx = start_indx + strings_per_thread;
if (end_indx > list_length) {
end_indx = list_length;
}
for (int i = start_indx; i < end_indx; ++i) {
results[i] = 0; // initializing this value just in case
if (!device_strings_equal(word[i], solution)) {
results[i] = 1;
}
}
}
int main()
{
char* list[] = { "abccc\0", "yhdrh\0", "ydhydhyh\0" , "yhfxjtj\0" , "yhrtjksy\0" , "yrgb\0" , "xjjtj\0" , "syjhsrjx\0" , "zsjkjh\0" , "aehhggg\0" };
char** gpu_list;
int len = 10;
char* solution = "yhfxjtj\0";
int* gpu_results;
cudaDeviceSynchronize();
// allocate memory on gpu
cudaMalloc((void**) &gpu_list, len * sizeof(char*));
cudaMalloc((void**) &gpu_results, len * sizeof(int));
// copy list over to gpu
cudaMemcpy(gpu_list, list, len * sizeof(char*), cudaMemcpyHostToDevice);
int thread_per_block = 5;
int blocksPerGrid = (len + thread_per_block - 1) / thread_per_block;
checkStr << <blocksPerGrid, thread_per_block >> > (gpu_list, len, solution, results);
// get result array from cuda methos
cudaDeviceSynchronize();
int result_list[10];
cudaMemcpy(result_list, gpu_results, len * sizeof(int), cudaMemcpyDeviceToHost);
// printing results
for (int i = 0; i < len; ++i) {
printf("RESULT: %d\n", result_list[i]);
}
cudaFree(gpu_list);
cudaFree(gpu_results);
return 0;
}
I expected an output of only 0's and one 1. But instead i only get the output:
RESULT: -858993460
RESULT: -858993460
RESULT: -858993460
RESULT: -858993460
RESULT: -858993460
RESULT: -858993460
RESULT: -858993460
RESULT: -858993460
RESULT: -858993460
RESULT: -858993460
RESULT: -858993460
RESULT: -858993460
As far as i know that means that the integers have not been initialized yet. When using the Nsight debugger, the result array does contain the expected values, but a regular debugger or even executing the program leads to the wrong output again. It seems like the ...
cudaMemcpy(result_list, gpu_results, len * sizeof(int), cudaMemcpyDeviceToHost);
... method doesnt work (or isnt used) as intended. I'd really appreciate any help