I am using GPU to do some calculation for processing words. Initially, I used one block (with 500 threads) to process one word. To process 100 words, I have to loop the kernel function 100 times in my main function.
for (int i=0; i<100; i++)
kernel <<< 1, 500 >>> (length_of_word);
My kernel function looks like this:
__global__ void kernel (int *dev_length)
{
int length = *dev_length;
while (length > 4)
{ //do something;
length -=4;
}
}
Now I want to process all 100 words at the same time.
Each block will still have 500 threads, and processes one word (per block).
dev_totalwordarray: store all characters of the words (one after another)
dev_length_array: store the length of each word.
dev_accu_length: stores the accumulative length of the word (total char of all previous words)
dev_salt_ is an array of of size 500, storing unsigned integers.
Hence, in my main function I have
kernel2 <<< 100, 500 >>> (dev_totalwordarray, dev_length_array, dev_accu_length, dev_salt_);
to populate the cpu array:
for (int i=0; i<wordnumber; i++)
{
int length=0;
while (word_list_ptr_array[i][length]!=0)
{
length++;
}
actualwordlength2[i] = length;
}
to copy from cpu -> gpu:
int* dev_array_of_word_length;
HANDLE_ERROR( cudaMalloc( (void**)&dev_array_of_word_length, 100 * sizeof(int) ) );
HANDLE_ERROR( cudaMemcpy( dev_array_of_word_length, actualwordlength2, 100 * sizeof(int),
My function kernel now looks like this:
__global__ void kernel2 (char* dev_totalwordarray, int *dev_length_array, int* dev_accu_length, unsigned int* dev_salt_)
{
tid = threadIdx.x + blockIdx.x * blockDim.x;
unsigned int hash[N];
int length = dev_length_array[blockIdx.x];
while (tid < 50000)
{
const char* itr = &(dev_totalwordarray[dev_accu_length[blockIdx.x]]);
hash[tid] = dev_salt_[threadIdx.x];
unsigned int loop = 0;
while (length > 4)
{ const unsigned int& i1 = *(reinterpret_cast<const unsigned int*>(itr)); itr += sizeof(unsigned int);
const unsigned int& i2 = *(reinterpret_cast<const unsigned int*>(itr)); itr += sizeof(unsigned int);
hash[tid] ^= (hash[tid] << 7) ^ i1 * (hash[tid] >> 3) ^ (~((hash[tid] << 11) + (i2 ^ (hash[tid] >> 5))));
length -=4;
}
tid += blockDim.x * gridDim.x;
}
}
However, kernel2 doesn't seem to work at all.
It seems while (length > 4)
causes this.
Does anyone know why? Thanks.