0

I am trying to use sort_by_key() to sort key-value pairs, where the value is a zip_iterator. But using this, only the keys are being sorted and not the value.

Here is my example code

#include <stdio.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/tuple.h>
#include <thrust/device_vector.h>

// W_itr iterates a device_vector<int>; W_itr_tuple is the iterator tuple
// fed to the zip_iterator, and W_tuple is the (int, int) value pair that
// dereferencing the zip_iterator yields.
typedef thrust::device_vector<int>::iterator W_itr;
typedef thrust::tuple<W_itr, W_itr> W_itr_tuple;
typedef thrust::tuple<int, int> W_tuple;
// Binary reduction functor for reduce_by_key: of two value tuples, keep the
// one whose first component is smaller (ties resolve to b).
struct Min_op
{
    __host__ __device__ 
    W_tuple operator()(const W_tuple& a, const W_tuple& b) const
    {
        int aw = thrust::get<0>(a), bw = thrust::get<0>(b);
        return aw < bw ? a : b;
    }
};

// Print the first `size` ints of `a` as a bracketed, tab-separated list,
// e.g. "[ 1,\t2,\t3 ]".
// NOTE(review): a[0] is read unconditionally, so this requires size >= 1.
void printArray(int* a, int size)
{
    printf("[ %d", a[0]);
    for (int i = 1;i < size;i++)
    {
        printf(",\t%d", a[i]);
    }
    printf(" ]");
}


int main()
{
    // Host inputs: 'a' holds the sort keys, 'b' and 'c' the associated values.
    int a[5] = { 5, 4, 1, 4, 4 },
        b[5] = { 1, 4, 2, 5, 6 },
        c[5] = { 10, 11, 12,13,14 };

    int* da, * db, * dc, * da_copy, * db_copy, * dc_copy;
    int size = sizeof(int) * 5;
    // NOTE(review): all CUDA API return codes below are ignored; wrap them in
    // an error-checking macro so failures surface.
    cudaMalloc(&da, size);
    cudaMalloc(&db, size);
    cudaMalloc(&dc, size);
    cudaMalloc(&da_copy, size);
    cudaMalloc(&db_copy, size);
    cudaMalloc(&dc_copy, size);
    cudaMemcpy(da, a, size, cudaMemcpyHostToDevice);
    cudaMemcpy(db, b, size, cudaMemcpyHostToDevice);
    cudaMemcpy(dc, c, size, cudaMemcpyHostToDevice);

    thrust::device_ptr<int> da_ptr(da), db_ptr(db), dc_ptr(dc), db_copy_ptr(db_copy), dc_copy_ptr(dc_copy);
    // NOTE(review): device_vector is an OWNING container — these constructors
    // allocate new device memory and COPY db/dc into it. Sorting b_vec/c_vec
    // therefore cannot affect db/dc, which explains the output below.
    thrust::device_vector<int> b_vec(db, db + 5), c_vec(dc, dc + 5), b_vec_copy(db_copy_ptr, db_copy_ptr+5), c_vec_copy(dc_copy_ptr, dc_copy_ptr+5);

    thrust::zip_iterator<W_itr_tuple> zip1(thrust::make_tuple(b_vec.begin(), c_vec.begin())), 
        zip2(thrust::make_tuple(b_vec_copy.begin(), c_vec_copy.begin())); // Question: Any easier way to make a zip_iterator with only raw pointers on device?

    // NOTE(review): raw pointer 'da' as the key range mixes iterator systems
    // with the device-side value iterator; use da_ptr (thrust::device_ptr)
    // consistently instead.
    thrust::sort_by_key(da, da + 5, zip1);


    cudaMemcpy(a, da, size, cudaMemcpyDeviceToHost);
    cudaMemcpy(b, db, size, cudaMemcpyDeviceToHost);
    cudaMemcpy(c, dc, size, cudaMemcpyDeviceToHost);

    printf("After Sort\n a = ");
    printArray(a, 5);
    printf("\n b = ");
    printArray(b, 5); // Question: Order has not changed at all. How to solve this?
    printf("\n c = ");
    printArray(c, 5); // Question: Order has not changed at all. How to solve this?

    // NOTE(review): this also fails to compile because it mixes raw pointers
    // (da, da_copy) with device-side zip iterators — wrap the keys/output in
    // thrust::device_ptr as above and it resolves on the device backend.
    // auto result = thrust::reduce_by_key
    // (
    //  da, da + 5, 
    //  zip1, 
    //  da_copy,
    //  zip2, 
    //  thrust::equal_to<int>(), 
    //  Min_op()
    // );
    // size = (result.first - da_copy) * sizeof(int); // Question: I am unable to get this compiled. Basically I want to use the zip_iterator to reduce using my custom operator. How to get this done?

    cudaMemcpy(a, da_copy, size, cudaMemcpyDeviceToHost);
    cudaMemcpy(b, db_copy, size, cudaMemcpyDeviceToHost);
    cudaMemcpy(c, dc_copy, size, cudaMemcpyDeviceToHost);

    printf("\n\nAfter Reduce\n a = ");
    printArray(a, 5);
    printf("\n b = ");
    printArray(b, 5);
    printf("\n c = ");
    printArray(b, 5); // NOTE(review): prints b a second time — should be printArray(c, 5).

    return 0;
}

And the output that I obtain here is

After Sort
 a = [ 1,       4,      4,      4,      5 ]
 b = [ 1,       4,      2,      5,      6 ]
 c = [ 10,      11,     12,     13,     14 ]

...

As you can see, a is sorted correctly, but the b and c do not change as I expected them to.

I have put my queries in comments, which are the following:

  • Assuming that my implementation is wrong, what is the correct/best way to declare a zip_iterator from raw device pointers?
  • How to get the zip_iteration sorted?
  • How to use the reduce_by_key() function as I intend to? If I uncomment my code, it does not compile.
AzuxirenLeadGuy
  • 2,470
  • 1
  • 16
  • 27
  • 2
    You sorted the `b_vec` and `c_vec` (which are **copies** and not references), but you output `db` and `dc`. – paleonix Sep 26 '22 at 09:20
  • 1
    Ideally you would just use Thrust vectors from the start (for allocation and data transfer), but if you need to use the CUDA runtime directly for those things, yes you should be able to put `thrust::device_ptr`s into the `zip_iterator`. Right now you have three copies of `b` and `c` each in global memory. – paleonix Sep 26 '22 at 09:24
  • @paleonix Thank you for your response. I want to keep my data used by raw pointers, and only in one section I wish to use `thrust::sort_by_key()` and `thrust::reduce_by_key()`. I'd like to know how to correctly declare the zip iterator in this case? – AzuxirenLeadGuy Sep 26 '22 at 10:31
  • If you just use the wrapped pointers consistently, it compiles fine: https://godbolt.org/z/3jedY5fdW – paleonix Sep 26 '22 at 11:39
  • @paleonix That works fine. Thanks a lot! I guess using `auto` for the zip iterator never occurred to me and I kept on making additional structs. You can paste the answer and I shall accept it. Once again, Thank you for your help. – AzuxirenLeadGuy Sep 26 '22 at 12:02

1 Answer

2

There are three main problems with your code:

  1. Vectors are owning containers, so thrust::device_vector<int> b_vec(db, db + 5) generates a copy, on which you work. Later when transferring the result to the host, you don't use this copy, but the original, unsorted data instead.
  2. The types of fancy iterators are often complicated and easy to screw up. Use factory functions like thrust::make_zip_iterator and auto instead to create easier to read code with less pitfalls.
  3. When using Thrust functionality you need to consistently use the wrapped thrust::device_ptr variables instead of the raw pointers. Not doing so can cause a dispatch to the CPU which will then cause runtime errors due to the pointers pointing to inaccessible device memory.

In the following you can find the fixed code (+ error checking and some stylistic C++ defaults):

#include <cstdio>

#include <thrust/device_ptr.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/reduce.h>
#include <thrust/sort.h>
#include <thrust/tuple.h>

#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
// Report a failed CUDA runtime call with its file/line location and, unless
// abort is false, terminate the process with the error code.
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
    if (code == cudaSuccess)
        return;
    fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
    if (abort)
        exit(code);
}



struct Min_op
{
    using W_tuple = thrust::tuple<int, int>;

    // Binary reduction operator: of the two value tuples, return the one
    // whose first component is smaller (ties resolve to b).
    __host__ __device__ 
    W_tuple operator()(const W_tuple& a, const W_tuple& b) const
    {
        return thrust::get<0>(a) < thrust::get<0>(b) ? a : b;
    }
};

// Print the first `size` elements of `a` as a bracketed, tab-separated list,
// e.g. "[ 1,\t2,\t3 ]". An empty or null array prints "[ ]".
// (The previous version read a[0] unconditionally, which is undefined
// behavior for size <= 0.)
void printArray(int* a, int size)
{
    if (a == nullptr || size <= 0)
    {
        std::printf("[ ]");
        return;
    }
    std::printf("[ %d", a[0]);
    for (int i = 1; i < size; i++)
    {
        std::printf(",\t%d", a[i]);
    }
    std::printf(" ]");
}

int main()
{
    // Host inputs: 'a' holds the keys, 'b'/'c' the values zipped together.
    int a[] = { 5, 4, 1, 4, 4 };
    int b[] = { 1, 4, 2, 5, 6 };
    int c[] = { 10, 11, 12, 13, 14 };

    int* da{}; int* db{}; int* dc{};
    int* da_copy{};
    int* db_copy{};
    int* dc_copy{};

    static_assert(sizeof(a) == sizeof(b) && sizeof(b) == sizeof(c));
    constexpr int size = sizeof(a);
    constexpr int n_elements = size / sizeof(a[0]);

    gpuErrchk(cudaMalloc(&da, size));
    gpuErrchk(cudaMalloc(&db, size));
    gpuErrchk(cudaMalloc(&dc, size));
    gpuErrchk(cudaMalloc(&da_copy, size));
    gpuErrchk(cudaMalloc(&db_copy, size));
    gpuErrchk(cudaMalloc(&dc_copy, size));

    gpuErrchk(cudaMemcpy(da, a, size, cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(db, b, size, cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(dc, c, size, cudaMemcpyHostToDevice));

    // Wrap the raw device pointers so Thrust dispatches to the device
    // backend. Unlike device_vector, device_ptr does NOT copy or own memory.
    using dptr = thrust::device_ptr<int>;
    dptr da_ptr{da}; dptr db_ptr{db}; dptr dc_ptr{dc};
    dptr da_copy_ptr{da_copy};
    dptr db_copy_ptr{db_copy};
    dptr dc_copy_ptr{dc_copy};

    auto zip1 = thrust::make_zip_iterator
    (
      thrust::make_tuple(db_ptr, dc_ptr)
    );
    auto zip2 = thrust::make_zip_iterator
    (
      thrust::make_tuple(db_copy_ptr, dc_copy_ptr)
    );

    // BUGFIX: use the wrapped da_ptr (not raw da) for the key range so keys
    // and values agree on the device execution system, as stated in point 3.
    thrust::sort_by_key(da_ptr, da_ptr + n_elements, zip1);

    gpuErrchk(cudaMemcpy(a, da, size, cudaMemcpyDeviceToHost));
    gpuErrchk(cudaMemcpy(b, db, size, cudaMemcpyDeviceToHost));
    gpuErrchk(cudaMemcpy(c, dc, size, cudaMemcpyDeviceToHost));

    std::printf("After Sort\n a = ");
    printArray(a, n_elements);
    std::printf("\n b = ");
    printArray(b, n_elements);
    std::printf("\n c = ");
    printArray(c, n_elements);

    // Reduce runs of equal keys, keeping per run the value tuple whose first
    // component is smallest (Min_op).
    auto result = thrust::reduce_by_key
    (
      da_ptr, da_ptr + n_elements,
      zip1,
      da_copy_ptr,
      zip2,
      thrust::equal_to<int>(),
      Min_op()
    );
    auto new_n_elements = thrust::distance(da_copy_ptr, result.first);
    auto new_size = new_n_elements * sizeof(a[0]);

    gpuErrchk(cudaMemcpy(a, da_copy, new_size, cudaMemcpyDeviceToHost));
    gpuErrchk(cudaMemcpy(b, db_copy, new_size, cudaMemcpyDeviceToHost));
    gpuErrchk(cudaMemcpy(c, dc_copy, new_size, cudaMemcpyDeviceToHost));

    std::printf("\n\nAfter Reduce\n a = ");
    printArray(a, static_cast<int>(new_n_elements));
    std::printf("\n b = ");
    printArray(b, static_cast<int>(new_n_elements));
    std::printf("\n c = ");
    // BUGFIX: previously printed b a second time instead of c.
    printArray(c, static_cast<int>(new_n_elements));

    // Release all device allocations (previously leaked).
    gpuErrchk(cudaFree(da));
    gpuErrchk(cudaFree(db));
    gpuErrchk(cudaFree(dc));
    gpuErrchk(cudaFree(da_copy));
    gpuErrchk(cudaFree(db_copy));
    gpuErrchk(cudaFree(dc_copy));

    return 0;
}
paleonix
  • 2,293
  • 1
  • 13
  • 29