Determining the 2 largest elements and their positions in each matrix row with CUDA Thrust

Question

I have a matrix and I need to compute the 2 largest numbers AND their position in each row of this matrix. My initial attempt was to try and sort each row of the matrix and then look at the last two values. While I could sort each row, I was unable to get the permutation vector to get the original indexes. So my attempt (using some other threads on SO) was as follows:

int my_mod_start = 0;
int my_mod()
{
    return (my_mod_start++)/10;
}

const int rows = 2;
const int cols = 10;
const int num_points = rows * cols;

thrust::host_vector<float> data(num_points);
// fill with random values
thrust::device_vector<float> d_r = data;
thrust::host_vector<int> h_segments(rows*cols);
thrust::generate(h_segments.begin(), h_segments.end(), my_mod);

thrust::device_vector<int> d_segments = h_segments;
thrust::stable_sort_by_key(d_r.begin(), d_r.end(), d_segments.begin());
thrust::stable_sort_by_key(d_segments.begin(), d_segments.end(), 
                           d_r.begin());

While this approach sorts each row as expected, I am not sure how to modify this to get the original index of each of the values.

It also occurs to me that perhaps sorting the whole row might be wasteful if I only need the maximum 2 values and their locations.

Have you taken a look at [CUDA Thrust find index of minimum value among multiple vectors](http://stackoverflow.com/questions/17698969/cuda-thrust-find-index-of-minimum-value-among-multiple-vectors)? Of course, you have to change the columns with rows. — Vitality, Apr 24 '15 at 07:15
@JackOLantern I cannot compile that code due to this _1 in the code? — Luca, Apr 24 '15 at 15:54
read up on thrust placeholders. Google is your friend. placeholders will require a certain minimum level of C/C++ compiler capability, so depending on what platform you have, they won't work (compile error). With a recent version of gcc like 4.8.x you should have no trouble with them. If you have an older version of gcc like 4.1.2 you have to skip using placeholders. However any placeholder realization can be replaced with a corresponding thrust functor realization. — Robert Crovella, Apr 24 '15 at 23:20
Concerning the post I linked to, I do not think you will need Eric's approach, but Robert Crovella's instead. Therein, you will not need placeholders. — Vitality, Apr 25 '15 at 07:12

score 3 · Accepted Answer · edited May 23 '17 at 12:22

I have adapted the approach pointed out by Robert Crovella at Determining the least element and its position in each matrix column with CUDA Thrust. The approach considers the problem of determining the minima, not the maxima, and produces two iterators and one vector:

d_min_indices_1: iterator pointing to the indices of the last element for each row;
d_min_indices_2: iterator pointing to the indices of the second to last element for each row;
d_matrix: original matrix but with each row ordered in ascending order.

The values of the last and second to last elements can be determined from the ordered matrix d_matrix.

#include <iterator>
#include <algorithm>

#include <thrust/random.h>
#include <thrust/device_vector.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/transform_iterator.h>
#include <thrust/iterator/permutation_iterator.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/iterator/discard_iterator.h>
#include <thrust/reduce.h>
#include <thrust/functional.h>
#include <thrust/sort.h>

template <typename Iterator>
class strided_range
{
    public:

    typedef typename thrust::iterator_difference<Iterator>::type difference_type;

    struct stride_functor : public thrust::unary_function<difference_type,difference_type>
    {
        difference_type stride;

        stride_functor(difference_type stride)
            : stride(stride) {}

        __host__ __device__
        difference_type operator()(const difference_type& i) const
        { 
            return stride * i;
        }
    };

    typedef typename thrust::counting_iterator<difference_type>                   CountingIterator;
    typedef typename thrust::transform_iterator<stride_functor, CountingIterator> TransformIterator;
    typedef typename thrust::permutation_iterator<Iterator,TransformIterator>     PermutationIterator;

    // type of the strided_range iterator
    typedef PermutationIterator iterator;

    // construct strided_range for the range [first,last)
    strided_range(Iterator first, Iterator last, difference_type stride)
        : first(first), last(last), stride(stride) {}

    iterator begin(void) const
    {
        return PermutationIterator(first, TransformIterator(CountingIterator(0), stride_functor(stride)));
    }

    iterator end(void) const
    {
        return begin() + ((last - first) + (stride - 1)) / stride;
    }

    protected:
    Iterator first;
    Iterator last;
    difference_type stride;
};


/**************************************************************/
/* CONVERT LINEAR INDEX TO ROW INDEX - NEEDED FOR APPROACH #1 */
/**************************************************************/
template< typename T >
struct mod_functor {
    __host__ __device__ T operator()(T a, T b) { return a % b; }
};

/********/
/* MAIN */
/********/
int main()
{
    /***********************/
    /* SETTING THE PROBLEM */
    /***********************/
    const int Nrows = 4;
    const int Ncols = 6;

    // --- Random uniform integer distribution between 10 and 99
    thrust::default_random_engine rng;
    thrust::uniform_int_distribution<int> dist(10, 99);

    // --- Matrix allocation and initialization
    thrust::device_vector<float> d_matrix(Nrows * Ncols);
    for (size_t i = 0; i < d_matrix.size(); i++) d_matrix[i] = (float)dist(rng);

    for(int i = 0; i < Nrows; i++) {
        std::cout << "[ ";
        for(int j = 0; j < Ncols; j++)
            std::cout << d_matrix[i * Ncols + j] << " ";
        std::cout << "]\n";
    }

    /******************/
    /* APPROACH NR. 2 */
    /******************/
    // --- Computing row indices vector
    thrust::device_vector<int> d_row_indices(Nrows * Ncols);
    thrust::transform(thrust::make_counting_iterator(0), thrust::make_counting_iterator(Nrows * Ncols), thrust::make_constant_iterator(Ncols), d_row_indices.begin(), thrust::divides<int>() );

    // --- Computing column indices vector
    thrust::device_vector<int> d_column_indices(Nrows * Ncols);
    thrust::transform(thrust::make_counting_iterator(0), thrust::make_counting_iterator(Nrows * Ncols), thrust::make_constant_iterator(Ncols), d_column_indices.begin(), mod_functor<int>());

    // --- int and float iterators
    typedef thrust::device_vector<int>::iterator        IntIterator;
    typedef thrust::device_vector<float>::iterator      FloatIterator;

    // --- Relevant tuples of int and float iterators
    typedef thrust::tuple<IntIterator, IntIterator>     IteratorTuple1;
    typedef thrust::tuple<FloatIterator, IntIterator>   IteratorTuple2;

    // --- zip_iterator of the relevant tuples
    typedef thrust::zip_iterator<IteratorTuple1>        ZipIterator1;
    typedef thrust::zip_iterator<IteratorTuple2>        ZipIterator2;

    // --- zip_iterator creation
    ZipIterator1 iter1(thrust::make_tuple(d_row_indices.begin(), d_column_indices.begin()));

    thrust::stable_sort_by_key(d_matrix.begin(), d_matrix.end(), iter1);

    ZipIterator2 iter2(thrust::make_tuple(d_matrix.begin(), d_column_indices.begin()));

    thrust::stable_sort_by_key(d_row_indices.begin(), d_row_indices.end(), iter2);

    typedef thrust::device_vector<int>::iterator Iterator;

    // --- Strided access to the sorted array
    strided_range<Iterator> d_min_indices_1(d_column_indices.begin(), d_column_indices.end(), Ncols);
    strided_range<Iterator> d_min_indices_2(d_column_indices.begin() + 1, d_column_indices.end() + 1, Ncols);

    printf("\n\n");
    for(int i = 0; i < Nrows; i++) {
        std::cout << "[ ";
        for(int j = 0; j < Ncols; j++)
            std::cout << d_matrix[i * Ncols + j] << " ";
        std::cout << "]\n";
    }

    printf("\n\n");
    std::copy(d_min_indices_1.begin(), d_min_indices_1.end(), std::ostream_iterator<int>(std::cout, " "));
    std::cout << std::endl;

    printf("\n\n");
    std::copy(d_min_indices_2.begin(), d_min_indices_2.end(), std::ostream_iterator<int>(std::cout, " "));
    std::cout << std::endl;

    return 0;
}

If you want to determine the maxima, then change the two rows

strided_range<Iterator> d_min_indices_1(d_column_indices.begin(), d_column_indices.end(), Ncols);
strided_range<Iterator> d_min_indices_2(d_column_indices.begin() + 1, d_column_indices.end() + 1, Ncols);

with

strided_range<Iterator> d_min_indices_1(d_column_indices.begin() + Ncols - 1, d_column_indices.end() + Ncols - 1, Ncols);
strided_range<Iterator> d_min_indices_2(d_column_indices.begin() + Ncols - 2, d_column_indices.end() + Ncols - 2, Ncols);

Determining the 2 largest elements and their positions in each matrix row with CUDA Thrust

1 Answers1