3

I have a follow-up to this question. In R, you can order duplicates after a secondary key. How can I achieve this in Rcpp? I was trying the following (inspired by Kevin Ushey's answer to the original question), which seems to work for simple situations. How could I generalize this?

// [[Rcpp::export]]
Rcpp::IntegerVector order_(Rcpp::NumericVector x, Rcpp::NumericVector y) {
  // sort x 
  Rcpp::NumericVector sorted_x = clone(x).sort();
  // find order of x only
  Rcpp::IntegerVector order = Rcpp::match(sorted_x, x);
  // find order of y for each order of x
  for (int i = 1; i <= Rcpp::max(order); ++i){
    Rcpp::LogicalVector duplicates = order == i;
    int k = which_max(duplicates);
    Rcpp::NumericVector duplicated_y = y[duplicates];
    Rcpp::NumericVector sorted_dy = clone(duplicated_y).sort();
    Rcpp::IntegerVector order_y = Rcpp::match(sorted_dy, duplicated_y);
    for (int j = 0; j < order_y.length(); ++j){
      order(k + j) = order(k + j) + order_y(j) - 1;
    }
  }
  return order;
}
Community
  • 1
  • 1
shadow
  • 21,823
  • 4
  • 63
  • 77
  • 1
    You are not restricted to Rcpp types. I am almost certain that Boost or the C++ standard library will have an 'order with secondary key' option so I would look for that and use it. _I.e._ [this answer to an old question](http://stackoverflow.com/questions/1577475/c-sorting-and-keeping-track-of-indexes) shows C++11 lambda use which allows you to plug in a custom comparator. – Dirk Eddelbuettel Jan 17 '17 at 14:00

1 Answers1

3

If I understand what you asking, one approach would be to leverage the comparison operators of std::tuple, which should "do the right thing".

// [[Rcpp::plugins(cpp11)]]
#include <Rcpp.h>
#include <tuple>

template <typename... Ts>
struct Point {
    std::size_t index;

    using tuple_t = std::tuple<Ts...>;
    tuple_t data;

    template <typename... Vs>
    Point(std::size_t i, const Vs&... vs)
        : index(i),
          data(std::forward<decltype(vs[i])>(vs[i])...)
    {}

    bool operator<(const Point& other) const {
        return data < other.data;
    }
};

template <typename... Ts>
class PointVector {
public:
    std::size_t sz;
    using vector_t = std::vector<Point<Ts...>>;
    vector_t data;

    template <typename... Vs>
    PointVector(const Vs&... vs)
        : sz(min_size(vs...))
    {
        data.reserve(sz);
        for (std::size_t i = 0; i < sz; i++) {
            data.emplace_back(i, vs...);
        }
    }

    Rcpp::IntegerVector sorted_index() const {
        vector_t tmp(data);
        std::stable_sort(tmp.begin(), tmp.end());

        Rcpp::IntegerVector res(sz);
        for (std::size_t i = 0; i < sz; i++) {
            res[i] = tmp[i].index + 1;
        }

        return res;
    }

private:
    template <typename V>
    std::size_t min_size(const V& v) {
        return v.size();
    }

    template <typename T, typename S, typename... Vs>
    std::size_t min_size(const T& t, const S& s, const Vs&... vs) {
        return t.size() < s.size() ?
            min_size(t, vs...) :
            min_size(s, vs...);
    }
};

using namespace Rcpp;

// [[Rcpp::export]]
IntegerVector order2(NumericVector x, NumericVector y) {
    PointVector<double, double> pv(x, y);
    return pv.sorted_index();
}

// [[Rcpp::export]]
IntegerVector order3(NumericVector x, NumericVector y, NumericVector z) {
    PointVector<double, double, double> pv(x, y, z);
    return pv.sorted_index();
}

Using this data to demonstrate,

x <- rep(1:2 + 0.5, 10)
y <- rep(1:4 + 0.5, 5)
z <- rep(1:5 + 0.5, 4)
(df <- data.frame(x, y, z))
#      x   y   z
# 1  1.5 1.5 1.5
# 2  2.5 2.5 2.5
# 3  1.5 3.5 3.5
# 4  2.5 4.5 4.5
# 5  1.5 1.5 5.5
# 6  2.5 2.5 1.5
# 7  1.5 3.5 2.5
# 8  2.5 4.5 3.5
# 9  1.5 1.5 4.5
# 10 2.5 2.5 5.5
# 11 1.5 3.5 1.5
# 12 2.5 4.5 2.5
# 13 1.5 1.5 3.5
# 14 2.5 2.5 4.5
# 15 1.5 3.5 5.5
# 16 2.5 4.5 1.5
# 17 1.5 1.5 2.5
# 18 2.5 2.5 3.5
# 19 1.5 3.5 4.5
# 20 2.5 4.5 5.5

the C++ version yields the same results as the base R equivalent:

all.equal(base::order(x, y, z), order3(x, y, z))
# [1] TRUE

df[order3(x, y, z),]
#      x   y   z
# 1  1.5 1.5 1.5
# 17 1.5 1.5 2.5
# 13 1.5 1.5 3.5
# 9  1.5 1.5 4.5
# 5  1.5 1.5 5.5
# 11 1.5 3.5 1.5
# 7  1.5 3.5 2.5
# 3  1.5 3.5 3.5
# 19 1.5 3.5 4.5
# 15 1.5 3.5 5.5
# 6  2.5 2.5 1.5
# 2  2.5 2.5 2.5
# 18 2.5 2.5 3.5
# 14 2.5 2.5 4.5
# 10 2.5 2.5 5.5
# 16 2.5 4.5 1.5
# 12 2.5 4.5 2.5
# 8  2.5 4.5 3.5
# 4  2.5 4.5 4.5
# 20 2.5 4.5 5.5

Edit: Here's a slightly more palatable version which allows you to write auto pv = MakePointVector(...) rather than specifying all of the type parameters:

// Point class as before 

template <typename... Vecs>
class PointVector {
public:
    std::size_t sz;
    using point_t = Point<typename std::remove_reference<decltype(Vecs{}[0])>::type...>;
    using vector_t = std::vector<point_t>;
    vector_t data;

    template <typename... Vs>
    PointVector(const Vecs&... vs)
        : sz(min_size(vs...))
    {
        data.reserve(sz);
        for (std::size_t i = 0; i < sz; i++) {
            data.emplace_back(i, vs...);
        }
    }

    Rcpp::IntegerVector sorted_index() const {
        vector_t tmp(data);
        std::stable_sort(tmp.begin(), tmp.end());

        Rcpp::IntegerVector res(sz);
        for (std::size_t i = 0; i < sz; i++) {
            res[i] = tmp[i].index + 1;
        }

        return res;
    }

private:
    template <typename V>
    std::size_t min_size(const V& v) {
        return v.size();
    }

    template <typename T, typename S, typename... Vs>
    std::size_t min_size(const T& t, const S& s, const Vs&... vs) {
        return t.size() < s.size() ?
            min_size(t, vs...) :
            min_size(s, vs...);
    }
};

template <typename... Vecs>
PointVector<Vecs...> MakePointVector(const Vecs&... vecs) {
    return PointVector<Vecs...>(vecs...);
}

using namespace Rcpp;

// [[Rcpp::export]]
IntegerVector order2(NumericVector x, NumericVector y) {
    auto pv = MakePointVector(x, y);
    return pv.sorted_index();
}

// [[Rcpp::export]]
IntegerVector order3(NumericVector x, NumericVector y, NumericVector z) {
    auto pv = MakePointVector(x, y, z);
    return pv.sorted_index();
}

/*** R

all.equal(base::order(x, y), order2(x, y))
# [1] TRUE

all.equal(base::order(x, y, z), order3(x, y, z))
# [1] TRUE

*/
nrussell
  • 18,382
  • 4
  • 47
  • 60