My CUDA kernel is using thrust, sort and reduce by key. when i use an array more than 460 it start showing incorrect results.
could any one explain this behavior? or it is something related to my machine ?
the sort is working correctly despite the size, however, the REDUCE_BY_KEY is not working good. and return improper results.
more details about the code, i have 4 arrays 1) input keys which is defined as wholeSequenceArray. 2) input values which is defined in the kernel with initial value of 1. 3) output keys is to save the distinct values of the input keys 4) output values is to save the sum of input values corresponding to the same input key .
for more description about the reduce_by_key please visit this page: https://thrust.github.io/doc/group__reductions.html#gad5623f203f9b3fdcab72481c3913f0e0
here is my code:
#include <cstdlib>
#include <stdlib.h>
#include <stdio.h>
#include <iostream>
#include <vector>
#include <fstream>
#include <string>
#include <cuda.h>
#include <cuda_runtime.h>
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <thrust/sort.h>
#include <thrust/reduce.h>
#include <thrust/execution_policy.h>
using namespace std;
#define size 461
__global__ void calculateOccurances(unsigned int *input_keys,
unsigned int *output_Values) {
int tid = threadIdx.x;
const int N = size;
__shared__ unsigned int input_values[N];
unsigned int outputKeys[N];
int i = tid;
while (i < N) {
if (tid < N) {
input_values[tid] = 1;
}
i += blockDim.x;
}
__syncthreads();
thrust::sort(thrust::device, input_keys, input_keys + N);
thrust::reduce_by_key(thrust::device, input_keys, input_keys + N,
input_values, outputKeys, output_Values);
if (tid == 0) {
for (int i = 0; i < N; ++i) {
printf("%d,", output_Values[i]);
}
}
}
int main(int argc, char** argv) {
unsigned int wholeSequenceArray[size] = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10,
11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 1, 2, 3, 4, 5, 6, 7, 8, 9,
10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 1, 2, 3, 4, 5, 6, 7, 8,
9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 1, 2, 3, 4, 5, 6, 7,
8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 1, 2, 3, 4, 5, 6,
7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 1, 2, 3, 4, 5,
6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 1, 2, 3, 4,
5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 1, 2, 3,
4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 1, 2,
3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 1,
2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20,
1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
20, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
19, 20, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17,
18, 19, 20, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
17, 18, 19, 20, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
16, 17, 18, 19, 20, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
15, 16, 17, 18, 19, 20, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,
14, 15, 16, 17, 18, 19, 20, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
13, 14, 15, 16, 17, 18, 19, 20, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
12, 13, 14, 15, 16, 17, 18, 19, 20, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10,
11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 1, 2, 3, 4, 5, 6, 7, 8, 9,
10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 1, 2, 3, 4, 5, 6, 7, 8,
9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 1, 2, 3, 4, 5, 6, 7,
8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20,1 };
cout << "wholeSequenceArray:" << endl;
for (int i = 0; i < size; i++) {
cout << wholeSequenceArray[i] << ",";
}
cout << "\nStart C++ Array New" << endl;
cout << "Size of Input:" << size << endl;
cudaDeviceProp prop;
cudaGetDeviceProperties(&prop, 0);
printf("Max threads per block: %d\n", prop.maxThreadsPerBlock);
unsigned int counts[size];
unsigned int *d_whole;
unsigned int *d_counts;
cudaMalloc((void**) &d_whole, size * sizeof(unsigned int));
cudaMalloc((void**) &d_counts, size * sizeof(unsigned int));
cudaMemcpy(d_whole, wholeSequenceArray, size * sizeof(unsigned int),
cudaMemcpyHostToDevice);
calculateOccurances<<<1, size>>>(d_whole, d_counts);
cudaMemcpy(counts, d_counts, size * sizeof(unsigned int),
cudaMemcpyDeviceToHost);
cout << endl << "Counts" << endl << endl;
for (int i = 0; i < size; ++i) {
cout << counts[i] << ",";
}
cout << endl;
cudaFree(d_whole);
}