Parallel histogram gives wrong results

Question

I try to implement character frequency program in C using CUDA and I have an issue with the results. I think it's something about thread synchronization but I need help.

Output for 1 block and 1 thread per block:

>./char_freq.exe test.txt 1 1
file size is 115
0 = 0 
1 = 0 
2 = 0 
3 = 0 
4 = 0 
5 = 0 
6 = 0 
7 = 0 
8 = 0 
9 = 0 
10 = 0
11 = 0
12 = 0
13 = 0
14 = 0
15 = 0
16 = 0
17 = 0
18 = 0
19 = 0
20 = 0
21 = 0
22 = 0
23 = 0
24 = 0
25 = 0
26 = 0
27 = 0
28 = 0
29 = 0
30 = 0
31 = 0
32 = 0
33 = 0
34 = 0
35 = 0
36 = 0
37 = 0
38 = 0
39 = 0
40 = 0
41 = 0
42 = 0
43 = 0
44 = 0
45 = 0
46 = 0
47 = 0
48 = 0
49 = 0
50 = 0
51 = 0
52 = 1
53 = 1
54 = 1
55 = 0
56 = 0
57 = 0
58 = 0
59 = 0
60 = 0
61 = 0
62 = 0
63 = 0
64 = 0
65 = 0
66 = 0
67 = 0
68 = 0
69 = 0
70 = 0
71 = 0
72 = 0
73 = 0
74 = 0
75 = 0
76 = 0
77 = 0
78 = 0
79 = 0
80 = 0
81 = 0
82 = 0
83 = 0
84 = 0
85 = 0
86 = 0
87 = 0
88 = 0
89 = 0
90 = 0
91 = 0
92 = 0
93 = 0
94 = 0
95 = 0
96 = 0
97 = 0
98 = 2
99 = 2
100 = 9
101 = 1
102 = 14
103 = 7
104 = 18
105 = 1
106 = 14
107 = 20
108 = 0
109 = 0
110 = 1
111 = 1
112 = 0
113 = 0
114 = 5
115 = 8
116 = 3
117 = 0
118 = 0
119 = 0
120 = 0
121 = 6
122 = 0
123 = 0
124 = 0
125 = 0
126 = 0
127 = 0



N: 128, Blocks: 1, Threads: 1
Total time (ms): 0.143
Kernel time (ms): 0.046
Data transfer time(ms): 0.097

Output for 1 block and 5 threads per block:

>./char_freq.exe test.txt 1 5
file size is 115
0 = 0 
1 = 0 
2 = 0 
3 = 0 
4 = 0 
5 = 0 
6 = 0 
7 = 0 
8 = 0 
9 = 0 
10 = 0
11 = 0
12 = 0
13 = 0
14 = 0
15 = 0
16 = 0
17 = 0
18 = 0
19 = 0
20 = 0
21 = 0
22 = 0
23 = 0
24 = 0
25 = 0
26 = 0
27 = 0
28 = 0
29 = 0
30 = 0
31 = 0
32 = 0
33 = 0
34 = 0
35 = 0
36 = 0
37 = 0
38 = 0
39 = 0
40 = 0
41 = 0
42 = 0
43 = 0
44 = 0
45 = 0
46 = 0
47 = 0
48 = 0
49 = 0
50 = 0
51 = 0
52 = 1
53 = 1
54 = 1
55 = 0
56 = 0
57 = 0
58 = 0
59 = 0
60 = 0
61 = 0
62 = 0
63 = 0
64 = 0
65 = 0
66 = 0
67 = 0
68 = 0
69 = 0
70 = 0
71 = 0
72 = 0
73 = 0
74 = 0
75 = 0
76 = 0
77 = 0
78 = 0
79 = 0
80 = 0
81 = 0
82 = 0
83 = 0
84 = 0
85 = 0
86 = 0
87 = 0
88 = 0
89 = 0
90 = 0
91 = 0
92 = 0
93 = 0
94 = 0
95 = 0
96 = 0
97 = 0
98 = 2
99 = 2
100 = 9
101 = 1
102 = 12
103 = 7
104 = 13
105 = 1
106 = 11
107 = 12
108 = 0
109 = 0
111 = 1
112 = 0
113 = 0
114 = 5
115 = 7
116 = 3
117 = 0
118 = 0
119 = 0
120 = 0
121 = 6
122 = 0
123 = 0
124 = 0
125 = 0
126 = 0
127 = 0



N: 128, Blocks: 1, Threads: 5
Total time (ms): 0.157
Kernel time (ms): 0.048
Data transfer time(ms): 0.109

Why the results are different?

This is my code:

#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>

#define N 128
#define base 0

__global__ void char_freq(char *buffer, int *freq, int slice, int extra, int total_threads){
    int index = threadIdx.x + blockIdx.x * blockDim.x ;
    int start = index * slice; 
    int stop = start + slice;
    int i;

    if (index == (total_threads-1))
    stop += extra;

    __shared__ int local_freq[N];

    //initialize local_freq
    if(threadIdx.x == 0){
        memset(local_freq, 0, N*sizeof(int));
    }

    __syncthreads();

    for(i=start; i<stop; i++){
        local_freq[buffer[i] - base]++;
    }

    __syncthreads();

    for(i=0; i<N; i++){
        freq[i] += local_freq[i];
    }

    __syncthreads();
}


int main(int argc, char *argv[]){
    FILE *pFile;
    long file_size;
    char * buffer;
    char * filename;
    size_t result;
    int j, freq[N];
    int slice, extra;
    int total_blocks, threads_per_block, total_threads;

    float total_time, comp_time;
    cudaEvent_t total_start, total_stop, comp_start, comp_stop;
    cudaEventCreate(&total_start);
    cudaEventCreate(&total_stop);
    cudaEventCreate(&comp_start);
    cudaEventCreate(&comp_stop);


    if (argc != 4) {
        printf ("Usage : %s <file_name> <blocks> <threads_per_block>\n", argv[0]);
        return 1;
    }

    total_blocks = strtol(argv[2], NULL, 10);
    threads_per_block = strtol(argv[3], NULL, 10);
    total_threads = total_blocks*threads_per_block;

    filename = argv[1];
    pFile = fopen ( filename , "rb" );
    if (pFile==NULL) {printf ("File error\n"); return 2;}

    fseek (pFile , 0 , SEEK_END);
    file_size = ftell (pFile);
    rewind (pFile);
    printf("file size is %ld\n", file_size);
    
    buffer = (char*) malloc (sizeof(char)*file_size);
    if (buffer == NULL) {printf ("Memory error\n"); return 3;}

    result = fread (buffer,1,file_size,pFile);
    if (result != file_size) {printf ("Reading error\n"); return 4;}

    char *buffer_dev;
    int *freq_dev;

    cudaMalloc((void **)&buffer_dev, file_size*sizeof(char));
    cudaMalloc((void **)&freq_dev, N*sizeof(int));
    cudaMemset(freq_dev,0,N*sizeof(int));

    cudaEventRecord(total_start);

    cudaMemcpy(buffer_dev, buffer, file_size*sizeof(char), cudaMemcpyHostToDevice);

    cudaEventRecord(comp_start);

    slice = file_size / total_threads;
    extra = file_size % total_threads;

    char_freq<<<total_blocks, threads_per_block>>>(buffer_dev, freq_dev, slice, extra, total_threads);

    cudaEventRecord(comp_stop);
    cudaEventSynchronize(comp_stop);
    cudaEventElapsedTime(&comp_time, comp_start, comp_stop);

    cudaMemcpy(freq, freq_dev, N*sizeof(int), cudaMemcpyDeviceToHost);

    cudaEventRecord(total_stop);
    cudaEventSynchronize(total_stop);
    cudaEventElapsedTime(&total_time, total_start, total_stop);

    cudaFree(buffer_dev);
    cudaFree(freq_dev);

    for (j=0; j<N; j++){
        printf("%d = %d\n", j+base, freq[j]);
    }
    
    fclose (pFile);
    free (buffer);

    //GPU Timing
    printf("\n\n\nN: %d, Blocks: %d, Threads: %d\n", N, total_blocks, total_threads);
    printf("Total time (ms): %.3f\n", total_time);
    printf("Kernel time (ms): %.3f\n", comp_time);
    printf("Data transfer time(ms): %.3f\n\n\n", total_time-comp_time);

    return 0;
}

Note `__syncthreads` only synchronize threads belonging to the same block, not between blocks. Besides this, there is no bound check for `index` which is suspicious. — Jérôme Richard, May 25 '22 at 10:37
Your synchronizations are useless as you are accessing thread-local memory, not shared memory. Your accesses to `freq` are a race condition, as all threads will read/write from/to the same address at the same time. The easiest fix will be to use [atomics](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#atomicadd) for the accesses to `freq`. For performant solutions take a look at the CUDA samples. There is a [histogram sample](https://github.com/NVIDIA/cuda-samples/tree/master/Samples/2_Concepts_and_Techniques/histogram). — paleonix, May 25 '22 at 10:47
Now that you have added `__shared__` in an edit, the most of the synchronization does make sense. The one at the end of the kernel is still unnecessary, the loop writing to `freq` needs to only be executed by one thread per block, the writes to `local_freq` and `freq` need to be atomics and instead of using `memset` from one thread, it makes more sense to set the values to zero in parallel with all threads. — paleonix, May 30 '22 at 15:43

paleonix · Answer 1 · 2022-05-30T16:14:30.857

This should work (there might be typos, I have neither compiled nor run the code):

__global__ void char_freq(char *buffer, int *freq, int buffersize) {
    __shared__ int sh_freq[N];

    // block-stride loop over shared buffer
    for (int idx = threadIdx.x; idx < N; idx += blockDim.x) {
        sh_freq[idx] = 0;
    }

    __syncthreads();

    // grid-stride loop over global buffer
    const int gtid = blockIdx.x * blockDim.x + threadIdx.x;
    const int grid_size = blockDim.x * gridDim.x;
    for (int idx = gtid; idx < buffersize; idx += grid_size) {
        atomicAdd(&sh_freq[buffer[idx] - base], 1);
    }

    __syncthreads();

    // block-stride loop over shared buffer
    for (int idx = threadIdx.x; idx < N; idx += blockDim.x) {
        atomicAdd(&freq[idx], sh_freq[idx]);
    }
}

For better performance take a look at the CUDA sample as mentioned in the comments.

score -2 · Answer 2 · answered May 25 '22 at 17:16

I finally found a solution!

I used temp array for optimization

#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>

#define N 128
#define base 0

__global__ void char_freq(char *buffer, int *freq, int buffersize){
    int tid; 
    __shared__ int temp[N];

    //Cyclic calculation of block local frequences
    for(tid=blockIdx.x*blockDim.x+threadIdx.x; tid<buffersize; tid+=blockDim.x){
        atomicAdd(&temp[buffer[tid]-base], 1); //used for thread synchronization
    }

    //reduce temp results to freq
    if(threadIdx.x == 0){
        int j;
        for(j=0; j<N; j++){
            atomicAdd(&freq[j], temp[j]);
        }
    }
}


int main(int argc, char *argv[]){
    FILE *pFile;
    long file_size;
    char * buffer;
    char * filename;
    size_t result;
    int j, freq[N];
    int slice, extra;
    int total_blocks, threads_per_block, total_threads;

    float total_time, comp_time;
    cudaEvent_t total_start, total_stop, comp_start, comp_stop;
    cudaEventCreate(&total_start);
    cudaEventCreate(&total_stop);
    cudaEventCreate(&comp_start);
    cudaEventCreate(&comp_stop);


    if (argc != 4) {
        printf ("Usage : %s <file_name> <blocks> <threads_per_block>\n", argv[0]);
        return 1;
    }

    total_blocks = strtol(argv[2], NULL, 10);
    threads_per_block = strtol(argv[3], NULL, 10);
    total_threads = total_blocks*threads_per_block;

    filename = argv[1];
    pFile = fopen ( filename , "rb" );
    if (pFile==NULL) {printf ("File error\n"); return 2;}

    //obtain file size
    fseek (pFile , 0 , SEEK_END);
    file_size = ftell (pFile);
    rewind (pFile);
    printf("file size is %ld\n", file_size);
    
    //allocate memory to contain the file
    buffer = (char*) malloc (sizeof(char)*file_size);
    if (buffer == NULL) {printf ("Memory error\n"); return 3;}

    //copy file data to buffer 
    result = fread (buffer,1,file_size,pFile);
    if (result != file_size) {printf ("Reading error\n"); return 4;}

    //Device arrays (GPU)
    char *buffer_dev;
    int *freq_dev;

    cudaMalloc((void **)&buffer_dev, file_size*sizeof(char));
    cudaMalloc((void **)&freq_dev, N*sizeof(int));
    cudaMemset(freq_dev,0,N*sizeof(int));

    cudaEventRecord(total_start);

    //Copy data from host (CPU) to device (GPU)
    cudaMemcpy(buffer_dev, buffer, file_size*sizeof(char), cudaMemcpyHostToDevice);

    cudaEventRecord(comp_start);

    char_freq<<<total_blocks, threads_per_block>>>(buffer_dev, freq_dev, file_size);

    cudaEventRecord(comp_stop);
    cudaEventSynchronize(comp_stop);
    cudaEventElapsedTime(&comp_time, comp_start, comp_stop);

    //Copy result from device (GPU) to host (CPU)
    cudaMemcpy(freq, freq_dev, N*sizeof(int), cudaMemcpyDeviceToHost);

    cudaEventRecord(total_stop);
    cudaEventSynchronize(total_stop);
    cudaEventElapsedTime(&total_time, total_start, total_stop);

    cudaFree(buffer_dev);
    cudaFree(freq_dev);

    //Print Result
    for (j=0; j<N; j++){
        printf("%d = %d\n", j+base, freq[j]);
    }
    
    fclose (pFile);
    free (buffer);

    //GPU Timing
    printf("\n\n\nN: %d, Blocks: %d, Threads: %d\n", N, total_blocks, total_threads);
    printf("Total time (ms): %.3f\n", total_time);
    printf("Kernel time (ms): %.3f\n", comp_time);
    printf("Data transfer time(ms): %.3f\n\n\n", total_time-comp_time);

    return 0;
}

This kernel is still wrong. You need to set `temp` to zero and for this kernel to work with more than one block, the stride of the loop needs to be `blockDim.x * gridDim.x` like in your [other question](https://stackoverflow.com/a/72434556/10107454). It is also lacking synchronization. — paleonix, May 30 '22 at 15:48

Parallel histogram gives wrong results

2 Answers2