CUDA Sort Z-Axis 3D Array C++/Thrust

Question

I'm looking to sort a large 3D array along the z-axis.

Example array is X x Y x Z (1000x1000x5)

I'd like to sort along the z-axis so I'd perform 1000x1000 sorts for 5 element along the z-axis.

Edit Update: Tried an attempt to use thrust below. It's functional and I'd store the output back, but this is very slow since I'm sorting 5 elements at a time per (x,y) location:

#include <stdio.h>
#include <stdlib.h>
#include <iostream>

#include <thrust/device_ptr.h>
#include <thrust/sort.h>
#include <thrust/gather.h>
#include <thrust/iterator/counting_iterator.h>

int main(){
int x = 1000, y = 1000, z = 5;
float*** unsorted_cube = new float** [x];

for (int i = 0; i < x; i++) 
{
    // Allocate memory blocks for 
    // rows of each 2D array 
    unsorted_cube[i] = new float* [y];

    for (int j = 0; j < y; j++) 
    {
        // Allocate memory blocks for 
        // columns of each 2D array 
        unsorted_cube[i][j] = new float[z];
    }
}


for (int i = 0; i < x; i++)
{
    for (int j = 0; j < y; j++)
    {
        unsorted_cube[i][j][0] = 4.0f;
        unsorted_cube[i][j][1] = 3.0f;
        unsorted_cube[i][j][2] = 1.0f;
        unsorted_cube[i][j][3] = 5.0f;
        unsorted_cube[i][j][4] = 2.0f;
    }
}

for (int i = 0; i < 5; i++)
{
    printf("unsorted_cube first 5 elements to sort at (0,0): %f\n", unsorted_cube[0][0][i]);
}

float* temp_input;
float* temp_output;
float* raw_ptr;
float raw_ptr_out[5];
cudaMalloc((void**)&raw_ptr, N_Size * sizeof(float));
for (int i = 0; i < x; i++)
{ 
    for (int j = 0; j < y; j++)
    {
        temp_input[0] = unsorted_cube[i][j][0];
        temp_input[1] = unsorted_cube[i][j][1];
        temp_input[2] = unsorted_cube[i][j][2];
        temp_input[3] = unsorted_cube[i][j][3];
        temp_input[4] = unsorted_cube[i][j][4];

        cudaMemcpy(raw_ptr, temp_input, 5 * sizeof(float), cudaMemcpyHostToDevice);
        thrust::device_ptr<float> dev_ptr = thrust::device_pointer_cast(raw_ptr);
        thrust::sort(dev_ptr, dev_ptr + 5);
        thrust::host_vector<float> host_vec(5);
        thrust::copy(dev_ptr, dev_ptr + 5, raw_ptr_out);

        if (i == 0 && j == 0)
        {
            for (int i = 0; i < 5; i++)
            {
                temp_output[i] = raw_ptr_out[i];
            }
            printf("sorted_cube[0,0,0] : %f\n", temp_output[0]);
            printf("sorted_cube[0,0,1] : %f\n", temp_output[1]);
            printf("sorted_cube[0,0,2] : %f\n", temp_output[2]);
            printf("sorted_cube[0,0,3] : %f\n", temp_output[3]);
            printf("sorted_cube[0,0,4] : %f\n", temp_output[4]);
        }
    }
}
}

hi, interesting, not sure if this might help https://stackoverflow.com/questions/49818185/cuda-extract-layer-from-3d-array — IronMan, Feb 24 '21 at 23:25
Hi! Thank you. It's a good reference to restructure the array. However, I'd want to stick to C++ to implement this if possible. — jinxst, Feb 25 '21 at 00:29
Generate a unique key for each (x,y) coordinate which marks a z axis sub array to sort and use sort by key. If you are clever you can probably use a fancy iterator to generate the keys so you don't need to store the keys in memory — talonmies, Feb 25 '21 at 02:18
Just updated my code w/ an attempt that works, but is very very slow. I'll look into keys. I have no idea how to do that, but will try to find an example. Any references would be great... Not sure how keys work. — jinxst, Feb 25 '21 at 02:31
However your solution will look like, I don't think it will use `thrust::sort`, as this is for sorting very big lists. It could probably be done by using a `thrust::zip_iterator` to zip all 5 values into a tuple and then use `thrust::for_each` to get to every corrdinate in the xy plane. For this small number of elements to sort you could look at this answer for example https://stackoverflow.com/a/2748811/10107454 (assuming you don't plan to go for more slices in z direction in the furure). The sorting algorithm will be the unary function that you pass to `for_each`. — paleonix, Feb 25 '21 at 18:00
@talonmies are you sure that `thrust::sort_by_key` does what you think it does? Because it's not behaving like `thrust::reduce_by_key`. It is *not* sorting consecutive values with the same key (what this question asks for, as far as I understand), but sorts the values and keys such that the keys are sorted in the end. — paleonix, Feb 25 '21 at 18:58

paleonix · Answer 1 · 2023-01-13T15:05:23.210

Assuming that the data is in a format where the values in each xy-plane are consecutive in memory: data[((z * y_length) + y) * x_length + x] (which is be best for coalescing memory accesses on the GPU, as well)

#include <thrust/device_vector.h>
#include <thrust/execution_policy.h>
#include <thrust/for_each.h>
#include <thrust/zip_iterator.h>

void sort_in_z_dir(thrust::device_vector<float> &data,
                   int x_length, int y_length) { // z_length == 5
  auto z_stride = x_length * y_length;
  
  thrust::for_each(
      thrust::make_zip_iterator(thrust::make_tuple(
          data.begin(),
          data.begin() + z_stride,
          data.begin() + 2 * z_stride,
          data.begin() + 3 * z_stride,
          data.begin() + 4 * z_stride)),
      thrust::make_zip_iterator(thrust::make_tuple(
          data.begin() + z_stride,
          data.begin() + 2 * z_stride,
          data.begin() + 3 * z_stride,
          data.begin() + 4 * z_stride,
          data.begin() + 5 * z_stride)),
      [] __host__ __device__ 
      (thrust::tuple<float, float, float, float, float> &values) {
        float local_data[5] = {thrust::get<0>(values),
                               thrust::get<1>(values),
                               thrust::get<2>(values),
                               thrust::get<3>(values),
                               thrust::get<4>(values)};
        thrust::sort(thrust::seq, local_data, local_data + 5);
        thrust::get<0>(values) = local_data[0];
        thrust::get<1>(values) = local_data[1];
        thrust::get<2>(values) = local_data[2];
        thrust::get<3>(values) = local_data[3];
        thrust::get<4>(values) = local_data[4];
      });
}

This solution is certainly very ugly in terms of hardcoding z_length. One can use some C++ template-"magic" to make z_length into a template parameter, but this seemed to be overkill for this answer about Thrust.

See Convert std::tuple to std::array C++11 and How to convert std::array to std::tuple? for examples on interfacing between arrays and tuples.

The good thing about this solution that up to the sorting algorithm itself it should be pretty much optimal performance-wise. I don't know if thrust::sort is optimized for such small input arrays, but you can replace it by any self written sorting algorithm as I proposed in the comments.

If you want to be able to use different z_length without all this hassle, you might prefer this solution, which sorts in global memory, which is far from optimal, and feels a bit hacky because it uses Thrust pretty much only to launch a kernel. Here you want to have the data ordered the other way around: data[((x * y_length) + y) * z_length + z]

#include <thrust/counting_iterator.h>
#include <thrust/device_vector.h>
#include <thrust/execution_policy.h>
#include <thrust/for_each.h>

void sort_in_z_dir_alternative(thrust::device_vector<float> &data,
                               int x_length, int y_length, int z_length) {
  int n_threads = x_length * y_length;
  
  thrust::for_each(
      thrust::make_counting_iterator(0),
      thrust::make_counting_iterator(n_threads),
      [ddata = thrust::raw_pointer_cast(data.data()), z_length] __host__ __device__ (int idx) {
        thrust::sort(thrust::seq, 
                     ddata + z_length * idx,
                     ddata + z_length * (idx + 1));
      });
}

If you are ok with z_length being a template parameter, this might be a solution that combines the best from both worlds (data format like in the first example):

#include <thrust/counting_iterator.h>
#include <thrust/device_vector.h>
#include <thrust/execution_policy.h>
#include <thrust/for_each.h>

template <int z_length>
void sort_in_z_dir_middle_ground(thrust::device_vector<float> &data,
                                 int x_length, int y_length) {
  int n_threads = x_length * y_length; // == z_stride
  
  thrust::for_each(
      thrust::make_counting_iterator(0),
      thrust::make_counting_iterator(n_threads),
      [ddata = thrust::raw_pointer_cast(data.data()),
       z_length, n_threads] __host__ __device__ (int idx) {
        float local_data[z_length];
        #pragma unroll
        for (int i = 0; i < z_length; ++i) {
          local_data[i] = ddata[idx + i * n_threads];
        }
        thrust::sort(thrust::seq, 
                     local_data,
                     local_data + z_length);
        #pragma unroll
        for (int i = 0; i < z_length; ++i) {
          ddata[idx + i * n_threads] = local_data[i];
        }
      });
}

CUDA Sort Z-Axis 3D Array C++/Thrust

1 Answers1