String matching in CUDA shows different result when increase blocks

Question

I'm trying to implement string matching program with CUDA in C and I have th following issue.

When I set 1 block and 1 thread per block the result for pattern dfh is 2. That's correct, but when I increase the blocks the result is 4.

The text file is:

ffskdfhksdjhfksdfksjdfhksdhfksjdhfkjer654yrkhjkfgjhdsrtrhkjchgkjthyoirthygfnbkjgkjdhykhkjchgkjfdhsfykhkbhkjfghkfgjy

This is my code:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <cuda.h>

__global__ void string_matching(char *buffer, char *pattern, int match_size, int pattern_size, int *result){
    int tid, i;
    __shared__ int local_matches;

    if(threadIdx.x == 0) local_matches = 0;

    __syncthreads();

    for(tid=blockIdx.x*blockDim.x+threadIdx.x; tid<match_size; tid+=blockDim.x){
        for (i = 0; i < pattern_size && pattern[i] == buffer[i + tid]; ++i);
        if(i >= pattern_size){
            atomicAdd(&local_matches, 1);
        }
    }

    __syncthreads();

    if(threadIdx.x == 0) 
        atomicAdd(result, local_matches);

}


int main(int argc, char *argv[]){
    FILE *pFile;
    long file_size, match_size, pattern_size;
    char * buffer;
    char * filename, *pattern;
    size_t result;
    int *match, total_matches;

    //CUDA variables
    int blocks, threads_per_block;
    int *result_dev;
    char *buffer_dev, *pattern_dev;

    float total_time, comp_time;
    cudaEvent_t total_start, total_stop, comp_start, comp_stop;
    cudaEventCreate(&total_start);
    cudaEventCreate(&total_stop);
    cudaEventCreate(&comp_start);
    cudaEventCreate(&comp_stop);

    if (argc != 5) {
        printf ("Usage : %s <file_name> <string> <blocks> <threads_per_block>\n", argv[0]);
        return 1;
    }
    filename = argv[1];
    pattern = argv[2];
    blocks = strtol(argv[3], NULL, 10);
    threads_per_block = strtol(argv[4], NULL, 10);
    
    pFile = fopen ( filename , "rb" );
    if (pFile==NULL) {printf ("File error\n"); return 2;}

    // obtain file size:
    fseek (pFile , 0 , SEEK_END);
    file_size = ftell (pFile);
    rewind (pFile);
    printf("file size is %ld\n", file_size);
    
    // allocate memory to contain the file:
    buffer = (char*) malloc (sizeof(char)*file_size);
    if (buffer == NULL) {printf ("Memory error\n"); return 3;}

    // copy the file into the buffer:
    result = fread (buffer,1,file_size,pFile);
    if (result != file_size) {printf ("Reading error\n"); return 4;} 
    
    pattern_size = strlen(pattern);
    match_size = file_size - pattern_size + 1;
    
    match = (int *) malloc (sizeof(int)*match_size);
    if (match == NULL) {printf ("Malloc error\n"); return 5;}

    cudaMalloc((void **)&result_dev, sizeof(int));
    cudaMalloc((void **)&buffer_dev, file_size*sizeof(char));
    cudaMalloc((void **)&pattern_dev, pattern_size*sizeof(char));

    cudaEventRecord(total_start);

    cudaEventRecord(comp_start);

    cudaMemcpy(buffer_dev, buffer, file_size*sizeof(char), cudaMemcpyHostToDevice);
    cudaMemcpy(pattern_dev, pattern, pattern_size*sizeof(char), cudaMemcpyHostToDevice);

    string_matching<<<blocks, threads_per_block>>>(buffer_dev, pattern_dev, match_size, pattern_size, result_dev);
    cudaThreadSynchronize();

    cudaEventRecord(comp_stop);
    cudaEventSynchronize(comp_stop);
    cudaEventElapsedTime(&comp_time, comp_start, comp_stop);

    cudaMemcpy(&total_matches, result_dev, sizeof(int), cudaMemcpyDeviceToHost);

    cudaEventRecord(total_stop);
    cudaEventSynchronize(total_stop);
    cudaEventElapsedTime(&total_time, total_start, total_stop);

    cudaFree(result_dev);
    cudaFree(buffer_dev);
    cudaFree(pattern_dev);

    fclose (pFile);
    free (buffer);

    //Print result
    printf("Total matches: %d\n", total_matches);

    printf("\n\n\nN: %d, Blocks: %d, Threads: %d\n", file_size, blocks, blocks*threads_per_block);
    printf("Total time (ms): %.3f\n", total_time);
    printf("Kernel time (ms): %.3f\n", comp_time);
    printf("Data transfer time(ms): %.3f\n\n\n", total_time-comp_time);

}

Please use proper [CUDA error checking](https://stackoverflow.com/q/14038589/10107454). — paleonix, May 27 '22 at 09:27
Consider using [`cub::BlockReduce`](https://nvlabs.github.io/cub/classcub_1_1_block_reduce.html) (the CUB library is part of the CUDA toolkit) to efficiently reduce the data across the threads of each block. It will be more efficient than just using atomics. — paleonix, May 27 '22 at 09:41
@paleonix On modern GPUs basic atomic operations (eg. atomicAdd) are pretty well optimized by the hardware. This is especially true for atomics operating on shared memory as they appear to be implemented using a clever SIMD reduction. I tried to implement my own clever optimized complex reduction on my 1660S GPU but in the end it was just as fast than a one-line reduction. Atomic operations on global memory can often be still further optimized though they are already quite fast (they do not scale with the number of SMs in pathological cases). — Jérôme Richard, May 28 '22 at 19:52
@paleonix The benefit of Atomic in shared memory is that developers do not need to care about having all threads participating to the collective: the instruction can be put in a complex for loop with some conditionals (and break). That being said, an SIMD reduction is not as efficient as a local reduction so developers should perform a local reduction before the atomic operation for sake of performance (often simple). This is unfortunately not done here. — Jérôme Richard, May 28 '22 at 20:00
@JérômeRichard you are referring to the content of [this](https://developer.nvidia.com/blog/cuda-pro-tip-optimized-filtering-warp-aggregated-atomics/) blog post, especially the note in the beginnig that CUDA >=9 does this optimization automatically, I think. Sure, there will be situations where one wants to be more flexible than what CUB allows, but in most cases this is a nice abstraction that should be performant on any new or old architecture. — paleonix, May 30 '22 at 12:00

score 1 · Answer 1 · answered May 25 '22 at 20:15

You need to synchronize threads of the same block using __syncthreads. For example, local_matches = 0 can theoretically be done concurrently to the atomicAdd of the main loop. Thus, a __syncthreads is needed between both. For the same reason, you also need a __syncthreads before the last if(threadIdx.x == 0). I am not sure this is the only error.

I advise you to use CUDA-GDB to track such bug since the kernel is quite short and relatively simple.

Note that a local_matches do not need to be shared. In fact, it is not efficient to do that. On can perform the reduction in local memory and then perform a final atomicAdd. Additionally, you do not need the innermost conditional. You can simply do: local_matches += i >= pattern_size; (the compiler might already do such optimization).

I used `__syncthreads()` after initialization with 0 and before the last `if(threadsIdx.x == 0)` and didn't work but I found a solution with something different. I'll post it here. — Sotiris Sotiriou, May 25 '22 at 20:34

score 0 · Answer 2 · answered May 25 '22 at 20:40

I finally found a solution for this.

I set a match table with 0 values for every buffer position, set 1 for every position that pattern found, and added the 1's in the CPU.

If you think something better please add an answer.

This is the code:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <cuda.h>

__global__ void string_matching(char *buffer, char *pattern, int match_size, int pattern_size, int *match){
    int tid, i;

    for(tid=blockIdx.x*blockDim.x+threadIdx.x; tid<match_size; tid+=blockDim.x){
        for (i = 0; i < pattern_size && pattern[i] == buffer[i + tid]; ++i);
        if(i >= pattern_size){
            match[tid] = 1;
        }
        else{
            match[tid] = 0;
        }
    }

}


int main(int argc, char *argv[]){
    FILE *pFile;
    int i;
    long file_size, match_size, pattern_size;
    char * buffer;
    char * filename, *pattern;
    size_t result;
    int *match, total_matches;

    //CUDA variables
    int blocks, threads_per_block;
    int *match_dev;
    char *buffer_dev, *pattern_dev;

    float total_time, comp_time;
    cudaEvent_t total_start, total_stop, comp_start, comp_stop;
    cudaEventCreate(&total_start);
    cudaEventCreate(&total_stop);
    cudaEventCreate(&comp_start);
    cudaEventCreate(&comp_stop);

    if (argc != 5) {
        printf ("Usage : %s <file_name> <string> <blocks> <threads_per_block>\n", argv[0]);
        return 1;
    }
    filename = argv[1];
    pattern = argv[2];
    blocks = strtol(argv[3], NULL, 10);
    threads_per_block = strtol(argv[4], NULL, 10);
    
    pFile = fopen ( filename , "rb" );
    if (pFile==NULL) {printf ("File error\n"); return 2;}

    // obtain file size:
    fseek (pFile , 0 , SEEK_END);
    file_size = ftell (pFile);
    rewind (pFile);
    printf("file size is %ld\n", file_size);
    
    // allocate memory to contain the file:
    buffer = (char*) malloc (sizeof(char)*file_size);
    if (buffer == NULL) {printf ("Memory error\n"); return 3;}

    // copy the file into the buffer:
    result = fread (buffer,1,file_size,pFile);
    if (result != file_size) {printf ("Reading error\n"); return 4;} 
    
    pattern_size = strlen(pattern);
    match_size = file_size - pattern_size + 1;
    
    match = (int *) malloc (sizeof(int)*match_size);
    if (match == NULL) {printf ("Malloc error\n"); return 5;}

    cudaMalloc((void **)&match_dev, match_size*sizeof(int));
    cudaMalloc((void **)&buffer_dev, file_size*sizeof(char));
    cudaMalloc((void **)&pattern_dev, pattern_size*sizeof(char));

    cudaEventRecord(total_start);

    cudaEventRecord(comp_start);

    cudaMemcpy(buffer_dev, buffer, file_size*sizeof(char), cudaMemcpyHostToDevice);
    cudaMemcpy(pattern_dev, pattern, pattern_size*sizeof(char), cudaMemcpyHostToDevice);

    string_matching<<<blocks, threads_per_block>>>(buffer_dev, pattern_dev, match_size, pattern_size, match_dev);
    cudaThreadSynchronize();

    cudaEventRecord(comp_stop);
    cudaEventSynchronize(comp_stop);
    cudaEventElapsedTime(&comp_time, comp_start, comp_stop);

    cudaMemcpy(match, match_dev, match_size*sizeof(int), cudaMemcpyDeviceToHost);

    cudaEventRecord(total_stop);
    cudaEventSynchronize(total_stop);
    cudaEventElapsedTime(&total_time, total_start, total_stop);

    total_matches = 0;
    for(i=0; i<match_size; i++){
        total_matches += match[i];
    }

    cudaFree(match_dev);
    cudaFree(buffer_dev);
    cudaFree(pattern_dev);

    fclose (pFile);
    free (buffer);

    //Print result
    printf("Total matches: %d\n", total_matches);

    printf("\n\n\nN: %d, Blocks: %d, Threads: %d\n", file_size, blocks, blocks*threads_per_block);
    printf("Total time (ms): %.3f\n", total_time);
    printf("Kernel time (ms): %.3f\n", comp_time);
    printf("Data transfer time(ms): %.3f\n\n\n", total_time-comp_time);

}

Note that the algorithm is not equivalent to the one in the question: this one set the value to 0 or 1 regarding the matching result while the initial one count the matches. Doing the sum on the CPU is pretty inefficient since you need to write the result in the GPU memory (quite slow) and then perform a slow transfer (especially on PCIE). This is especially true since you store the binary result in a 32-bit `int` array wasting 97% of the memory space (and so the transfer time)... It looks like more a workaround than an actual solution. — Jérôme Richard, May 25 '22 at 20:57
A faster solution based on this approach is to pack bits in a char array so to make the slow transfer time 32 times faster. Please consider at least using a `char` array. — Jérôme Richard, May 25 '22 at 21:00

score 0 · Answer 3 · answered May 28 '22 at 23:21

I think this is a better solution.

It works only for power of 2 threads per block.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <cuda.h>

#define MAX_THREADS_PER_BLOCK 100

__global__ void string_matching(char *buffer, char *pattern, int match_size, int pattern_size, int blocks, int slice, int extra, int *gout){
    int tid, i;
    int thread_index = blockIdx.x*blockDim.x + threadIdx.x;
    int start = thread_index*slice;
    int stop = start + slice;
    if(thread_index == blocks*blockDim.x - 1){
        stop += extra;
    }
    if(stop > match_size){
        stop = match_size;
    }
    __shared__ int r[MAX_THREADS_PER_BLOCK];
    int sum = 0;

    for(tid=start; tid<stop; tid++){
        for (i = 0; i < pattern_size && pattern[i] == buffer[i + tid]; ++i);
        if(i >= pattern_size){
            sum++;
        }
    }

    r[threadIdx.x] = sum;

    __syncthreads();

    //works only for power of 2 threads_per_block
    for (int size = blockDim.x/2; size>0; size/=2) { //uniform
        if (threadIdx.x<size)
            r[threadIdx.x] += r[threadIdx.x+size];
        __syncthreads();
    }



    printf("Block: %d, Thread: %d, Global Thread: %d, Start: %d, Stop: %d, Matches: %d, Block Matches: %d\n", blockIdx.x, threadIdx.x, thread_index, start, stop, r[threadIdx.x], r[0]);


    if(threadIdx.x == 0){
        gout[blockIdx.x] = r[0];
    }

    
}


int main(int argc, char *argv[]){
    int i;
    FILE *pFile;
    long file_size, match_size, pattern_size;
    char * buffer;
    char * filename, *pattern;
    size_t result;
    int *results;
    int total_matches;

    //CUDA variables
    int blocks, threads_per_block, total_threads, slice, extra;
    int *results_dev;
    char *buffer_dev, *pattern_dev;

    float total_time, comp_time;
    cudaEvent_t total_start, total_stop, comp_start, comp_stop;
    cudaEventCreate(&total_start);
    cudaEventCreate(&total_stop);
    cudaEventCreate(&comp_start);
    cudaEventCreate(&comp_stop);

    if (argc != 5) {
        printf ("Usage : %s <file_name> <string> <blocks> <threads_per_block>\n", argv[0]);
        return 1;
    }
    filename = argv[1];
    pattern = argv[2];
    blocks = strtol(argv[3], NULL, 10);
    threads_per_block = strtol(argv[4], NULL, 10);
    
    pFile = fopen ( filename , "rb" );
    if (pFile==NULL) {printf ("File error\n"); return 2;}

    // obtain file size:
    fseek (pFile , 0 , SEEK_END);
    file_size = ftell (pFile);
    rewind (pFile);
    printf("file size is %ld\n", file_size);
    
    // allocate memory to contain the file:
    buffer = (char*) malloc (sizeof(char)*file_size);
    if (buffer == NULL) {printf ("Memory error\n"); return 3;}

    // copy the file into the buffer:
    result = fread (buffer,1,file_size,pFile);
    if (result != file_size) {printf ("Reading error\n"); return 4;} 
    
    pattern_size = strlen(pattern);
    match_size = file_size - pattern_size + 1;

    results = (int *)malloc(blocks*sizeof(int));

    cudaMalloc((void **)&results_dev, blocks*sizeof(int));
    cudaMalloc((void **)&buffer_dev, file_size*sizeof(char));
    cudaMalloc((void **)&pattern_dev, pattern_size*sizeof(char));

    cudaEventRecord(total_start);

    cudaEventRecord(comp_start);

    cudaMemcpy(buffer_dev, buffer, file_size*sizeof(char), cudaMemcpyHostToDevice);
    cudaMemcpy(pattern_dev, pattern, pattern_size*sizeof(char), cudaMemcpyHostToDevice);

    total_threads = blocks*threads_per_block;
    slice = match_size/total_threads;
    extra = match_size%total_threads;

    string_matching<<<blocks, threads_per_block>>>(buffer_dev, pattern_dev, match_size, pattern_size, blocks, slice, extra, results_dev);

    cudaEventRecord(comp_stop);
    cudaEventSynchronize(comp_stop);
    cudaEventElapsedTime(&comp_time, comp_start, comp_stop);

    cudaMemcpy(results, results_dev, blocks*sizeof(int), cudaMemcpyDeviceToHost);

    total_matches = 0;
    for(i=0; i<blocks; i++){
        total_matches += results[i];
    }

    cudaEventRecord(total_stop);
    cudaEventSynchronize(total_stop);
    cudaEventElapsedTime(&total_time, total_start, total_stop);

    cudaFree(results_dev);
    cudaFree(buffer_dev);
    cudaFree(pattern_dev);

    fclose (pFile);
    free (buffer);

    //Print result
    printf("Total matches: %d\n", total_matches);

    printf("\n\n\nN: %d, Blocks: %d, Threads: %d\n", file_size, blocks, blocks*threads_per_block);
    printf("Total time (ms): %.3f\n", total_time);
    printf("Kernel time (ms): %.3f\n", comp_time);
    printf("Data transfer time(ms): %.3f\n\n\n", total_time-comp_time);

}

paleonix · Answer 4 · 2022-05-30T12:51:23.780

0

The stride on your loop is not correct for using multiple blocks, i.e. both blocks were doing the full amount of work (and therefore finding two entries each). The correct grid-stride loop looks like the following

for (int tid = blockIdx.x * blockDim.x + threadIdx.x; 
         tid < match_size; 
         tid += blockDim.x * gridDim.x /* <-- fix */) {
    // ...
}

edited May 30 '22 at 12:51

answered May 30 '22 at 12:45

paleonix

2,293
1
13
29

String matching in CUDA shows different result when increase blocks

4 Answers4

Linked