On the host side I'm reading in a 128 x 128 integer array with random values between 0-31. I have an Occurrences array that stores the values 0-31 and then on the device I am trying to execute a kernel that loops through the values in the 128 x 128 array and then counts the number of times 0-31 appears.
I am having issues with how to split up the blocks/threads in CUDA and how to get the kernel to provide communication back to the host and print out the number of occurrences of every element.This is my first time using CUDA and I would appreciate any constructive advice! Here is my code so far:
#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
#define MAXR 16
#define MAXC 16
#define N 256
__global__ void count(int *arrayONE_d, int *occurrences_d, int *occurrences_final_d) {
int count = 0;
//provide unique thread ID
int idx = threadIdx.x + blockIdx.x * blockDim.x;
int k;
//for(k=0; k < 32;k++) {
// occurrences_d[k]=k;
// }
if(idx < N) {
//for(k=0; k < MAXR*MAXC; k++) {
for(int j=0; j<32; j++) {
count =0;
if(arrayONE_d[idx]==occurrences_d[j]){
count+=1;
occurrences_final_d[j] =count;
}
else {}
}
}
//occurrences_final_d[0] = 77;
}
}
int main(void) {
//const int N = MAXR*MAXC;
int arr1_h[MAXR][MAXC];
//int *occurrences_h[0][32];
//creating arrays for the device (GPU)
//int *arr1_d;
int occurrences_h[32];
int *occurrences_d;
int *occurrences_final_h[32] = {0};
int *occurrences_final_d;
int *arrayONE_h[256] = {0};
int *arrayONE_d;
int i, j;
// allocating memory for the arrays on the device
cudaMalloc( (void**) &arrayONE_d, MAXR*MAXC*sizeof(int)); // change to 16384 when using 128x128
cudaMalloc( (void**) &occurrences_d, 32* sizeof(int));
cudaMalloc( (void**) &occurrences_final_d, 32*sizeof(int));
/*
for(i=0; i < 32; i++) {
occurrences_h[i] = i;
}
/*
*
*/
//Reading in matrix from .txt file and storing it in arr1 on the host (CPU)
FILE *fp;
fp =fopen("arrays16.txt","r");
// this loop takes the information from .txt file and puts it into arr1 matrix
for(i=0;i<MAXR;i++) {
for(j=0;j<MAXC;j++)
{
fscanf(fp,"%d\t", &arr1_h[i][j]);
}
}
for(i=0;i<MAXR;i++) {
printf("\n");
for(j=0;j<MAXC;j++) {
//printf("d\t", arr1_h[i][j]);
}
printf("\n\n");
}
int x,y;
int z=0;
// this loop flattens the 2d array and makes it a 1d array of length MAXR*MAXC
for(x=0;x<MAXR;x++)
{
for(y=0;y<MAXC;y++)
{
// printf("**%d ",arr1_h[x][y]);
arrayONE_h[z]= &arr1_h[x][y];
z++;
}
}
for(x=0; x < 256; x++) {
printf("%d\n", *arrayONE_h[x]);
//return 0;
}
int length = sizeof(arrayONE_h)/sizeof(arrayONE_h[0]);
printf("\n\n");
printf("**LENGTH = %d", length);
// copying the arrays/memory from the host to the device (GPU)
cudaMemcpy(arrayONE_d, &arrayONE_h, MAXR*MAXC*sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpy(occurrences_d, &occurrences_h, 32*sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpy(occurrences_final_d, &occurrences_final_h, 32*sizeof(int), cudaMemcpyHostToDevice);
// how many blocks we will allocate
//dim3 DimGrid();
//how many threads per block we will allocate
dim3 DimBlock(256);
//kernel launch against the GPU
count<<<1, DimBlock>>>(arrayONE_d,occurrences_d,occurrences_final_d);
//copy the arrays post-computation from the device back to the host (CPU)
cudaMemcpy(&occurrences_final_h, occurrences_final_d, 32*sizeof(int), cudaMemcpyDeviceToHost);
cudaMemcpy(&occurrences_h, occurrences_d, 32*sizeof(int), cudaMemcpyDeviceToHost);
// some error checking - run this with cuda-memcheck when executing your code
cudaError_t errSync = cudaGetLastError();
cudaError_t errAsync = cudaDeviceSynchronize();
if (errSync != cudaSuccess)
printf("Sync kernel error: %s\n", cudaGetErrorString(errSync));
if (errAsync != cudaSuccess)
printf("Async kernel error: %s\n", cudaGetErrorString(errAsync));
//free up the memory of the device arrays
cudaFree(arrayONE_d);
cudaFree(occurrences_d);
cudaFree(occurrences_final_d);
//print out the number of occurrences of each 0-31 value
for(i=0;i<32;i++) {
printf("\n");
printf("%d\n",occurrences_final_h[i]);
}
}