I have a device float array and I tried to sort it by key with THRUST with this function:
#include <thrust/sort.h>
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/device_ptr.h>
#include <thrust/device_malloc.h>
#include <thrust/device_free.h>
#include <thrust/copy.h>
#include <thrust/fill.h>
#include <thrust/sequence.h>
template <typename T>
__host__ T* deepCopyDeviceArray(T* dev_array, int arraysize)
{
// performs a deep copy of a device array and returns the copy's device pointer
cudaError_t cudaStatus;
T* dev_copiedArray;
cudaStatus = cudaMalloc((void**)&dev_copiedArray, (arraysize * sizeof(T)));
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "deep copy cudaMalloc failed!");
}
cudaStatus = cudaMemcpy(dev_copiedArray, dev_array, (arraysize * sizeof(T)), cudaMemcpyDeviceToDevice);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "deep copy cudaMemcpy failed!");
}
return dev_copiedArray;
}
template <typename T>
int* sortByKeyOnDevice(T* dev_keys, int len, const int* valuesarray)
{
// sorts keysarray and returns the sorted indices
T* dev_keys2 = deepCopyDeviceArray(dev_keys, len); // make deep copy to evade change of original keys
// make deep copy of values and copy it to device
int* dev_values;
cudaMalloc((void **) &dev_values, len);
cudaMemcpy(dev_values, valuesarray, len * sizeof(int), cudaMemcpyHostToDevice);
// create device pointers
thrust::device_ptr<T> dev_ptr_keys = thrust::device_pointer_cast(dev_keys2);
thrust::device_ptr<int> dev_ptr_values = thrust::device_pointer_cast(dev_values);
thrust::sort_by_key(dev_ptr_keys, dev_ptr_keys + len, dev_ptr_values);
//thrust::device_free(dev_ptr_keys);
cudaFree(dev_keys2);
return dev_values; // return only indices of sorted array
}
int main()
{
int len = 10;
float* array1 = new float[len]; for (int i=0;i<len;i++) array1[i] = rand();
float* dev_array1;
cudaMalloc(&dev_array1, len * sizeof(float));
cudaMemcpy(dev_array1, array1, (len * sizeof(float)), cudaMemcpyHostToDevice);
int* valuesarray = new int[len]; for (int i=0; i<len; i++) valuesarray[i] = i;
int* dev_values;
dev_values = sortByKeyOnDevice(dev_array1, len, valuesarray);
int* values = new int[len];
cudaMemcpy(values, dev_values, (len * sizeof(int)), cudaMemcpyDeviceToHost); // or use dev_values in a kernel for further calculations
}
Executing this main, THRUST throws the exception: "thrust::system::system_error at memory location 0x00DAF5D4."
I don't want to use thrust device vectors because they are unnecessary here. According to the THRUST documentation, device_ptr can be used in the above way (I refer to this question).
What am I doing wrong?