I'm trying to implement string matching program with CUDA in C and I have th following issue.
When I set 1 block and 1 thread per block the result for pattern dfh
is 2. That's correct, but when I increase the blocks the result is 4.
The text file is:
ffskdfhksdjhfksdfksjdfhksdhfksjdhfkjer654yrkhjkfgjhdsrtrhkjchgkjthyoirthygfnbkjgkjdhykhkjchgkjfdhsfykhkbhkjfghkfgjy
This is my code:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <cuda.h>
__global__ void string_matching(char *buffer, char *pattern, int match_size, int pattern_size, int *result){
int tid, i;
__shared__ int local_matches;
if(threadIdx.x == 0) local_matches = 0;
__syncthreads();
for(tid=blockIdx.x*blockDim.x+threadIdx.x; tid<match_size; tid+=blockDim.x){
for (i = 0; i < pattern_size && pattern[i] == buffer[i + tid]; ++i);
if(i >= pattern_size){
atomicAdd(&local_matches, 1);
}
}
__syncthreads();
if(threadIdx.x == 0)
atomicAdd(result, local_matches);
}
int main(int argc, char *argv[]){
FILE *pFile;
long file_size, match_size, pattern_size;
char * buffer;
char * filename, *pattern;
size_t result;
int *match, total_matches;
//CUDA variables
int blocks, threads_per_block;
int *result_dev;
char *buffer_dev, *pattern_dev;
float total_time, comp_time;
cudaEvent_t total_start, total_stop, comp_start, comp_stop;
cudaEventCreate(&total_start);
cudaEventCreate(&total_stop);
cudaEventCreate(&comp_start);
cudaEventCreate(&comp_stop);
if (argc != 5) {
printf ("Usage : %s <file_name> <string> <blocks> <threads_per_block>\n", argv[0]);
return 1;
}
filename = argv[1];
pattern = argv[2];
blocks = strtol(argv[3], NULL, 10);
threads_per_block = strtol(argv[4], NULL, 10);
pFile = fopen ( filename , "rb" );
if (pFile==NULL) {printf ("File error\n"); return 2;}
// obtain file size:
fseek (pFile , 0 , SEEK_END);
file_size = ftell (pFile);
rewind (pFile);
printf("file size is %ld\n", file_size);
// allocate memory to contain the file:
buffer = (char*) malloc (sizeof(char)*file_size);
if (buffer == NULL) {printf ("Memory error\n"); return 3;}
// copy the file into the buffer:
result = fread (buffer,1,file_size,pFile);
if (result != file_size) {printf ("Reading error\n"); return 4;}
pattern_size = strlen(pattern);
match_size = file_size - pattern_size + 1;
match = (int *) malloc (sizeof(int)*match_size);
if (match == NULL) {printf ("Malloc error\n"); return 5;}
cudaMalloc((void **)&result_dev, sizeof(int));
cudaMalloc((void **)&buffer_dev, file_size*sizeof(char));
cudaMalloc((void **)&pattern_dev, pattern_size*sizeof(char));
cudaEventRecord(total_start);
cudaEventRecord(comp_start);
cudaMemcpy(buffer_dev, buffer, file_size*sizeof(char), cudaMemcpyHostToDevice);
cudaMemcpy(pattern_dev, pattern, pattern_size*sizeof(char), cudaMemcpyHostToDevice);
string_matching<<<blocks, threads_per_block>>>(buffer_dev, pattern_dev, match_size, pattern_size, result_dev);
cudaThreadSynchronize();
cudaEventRecord(comp_stop);
cudaEventSynchronize(comp_stop);
cudaEventElapsedTime(&comp_time, comp_start, comp_stop);
cudaMemcpy(&total_matches, result_dev, sizeof(int), cudaMemcpyDeviceToHost);
cudaEventRecord(total_stop);
cudaEventSynchronize(total_stop);
cudaEventElapsedTime(&total_time, total_start, total_stop);
cudaFree(result_dev);
cudaFree(buffer_dev);
cudaFree(pattern_dev);
fclose (pFile);
free (buffer);
//Print result
printf("Total matches: %d\n", total_matches);
printf("\n\n\nN: %d, Blocks: %d, Threads: %d\n", file_size, blocks, blocks*threads_per_block);
printf("Total time (ms): %.3f\n", total_time);
printf("Kernel time (ms): %.3f\n", comp_time);
printf("Data transfer time(ms): %.3f\n\n\n", total_time-comp_time);
}