0

I'm having trouble converting this Python to Rcpp.

The script is part of a classification and regression tree logic:

# Calculate the Gini index for a split dataset
def gini_index(groups, classes):
    """Return the size-weighted Gini index of a split dataset.

    Args:
        groups: A re-iterable collection (e.g. a list) of groups. Each group
            is a sequence of rows, and the last element of every row is that
            row's class value.
        classes: An iterable of the class values to score against.

    Returns:
        The sum over all non-empty groups of ``(1 - sum(p_c ** 2))`` weighted
        by the group's share of all rows, where ``p_c`` is the fraction of
        rows in the group whose class value equals ``c``. Returns 0.0 when
        every group is empty.
    """
    # count all samples at split point
    n_instances = float(sum(len(group) for group in groups))
    # sum weighted Gini index for each group
    gini = 0.0
    for group in groups:
        size = float(len(group))
        # avoid divide by zero
        if size == 0:
            continue
        # Pull the class value out of each row once per group instead of
        # rebuilding the same list for every candidate class.
        labels = [row[-1] for row in group]
        score = 0.0
        # score the group based on the score for each class
        for class_val in classes:
            p = labels.count(class_val) / size
            score += p * p
        # weight the group score by its relative size
        gini += (1.0 - score) * (size / n_instances)
    return gini

# test Gini values
print(gini_index([[[1, 1], [1, 0]], [[1, 1], [1, 0]]], [0, 1]))
>>0.5
print(gini_index([[[1, 0], [1, 0]], [[1, 1], [1, 1]]], [0, 1]))
>>0

I'm trying to speed it up by simply providing two vectors, and looping. The vectors are the actual group and proposed group. The actual group corresponds to the last element in each nested list above:

1,0,1,0

The proposed group corresponds to the way the nested lists are grouped (first two elements are in one group; the second two elements are in another)

1,1,2,2

My C++ / Rcpp is not strong enough to see what's going wrong here:

// Attempted Rcpp port of the Python gini_index above.
// Takes two numeric vectors, y and g, and returns a float that is
// accumulated once per distinct value found in y.
float gini_index(NumericVector y, NumericVector g){
  int n_records = y.size();
  float gini = 0.0;
  // The values iterated as "groups" are the distinct values of y (not of g).
  NumericVector groups = unique(y);
  int group_count = groups.size();
  for(int i_g=0; i_g<group_count; i_g++){
    // size: number of entries of g equal to the current value.
    float size = 0;
    // NOTE: `i` is declared without an initializer, so its starting value is
    // indeterminate and this counting loop may not run as intended.
    for(int i;i<n_records;i++){
      if(g[i]==groups[i_g]){
        size++;
      }
    }
    float score = 0.0;
    // Walks every record (not every class value).
    for(int i_y=0; i_y<n_records; i_y++){
      // Reset for each record, so class_count is only ever 0 or 1 here.
      float class_count=0;
      if(y[i_y]==groups[i_g]){
        class_count++;}
      float p=0;
      // Guard against dividing by a zero group size.
      if(size==0){
        p=0;
      }
      else{
        p = class_count/size;
      }
      // Debug output: one line per (value, record) pair.
      std::cout<<p<<std::endl;
      score = score + p*p;
    }
    // Weight (1 - score) by the fraction size / n_records and accumulate.
    gini = gini+(1.0-score)*(size/n_records);  
  }
return(gini);
}


# Example inputs: two numeric vectors of equal length.
y<-c(1,1,2,2)
g<-c(1,2,1,2)
# Call the compiled function with y as the first argument and g as the second.
gini_index(y,g)




 > gini_index(y,g)
    0
    0.5
    0
    0.5
    0
    0
    0
    0
    [1] 0.25

This should return 0.5

user2723494
  • 1,168
  • 2
  • 15
  • 26

2 Answers

1

There is an error in one of your loops: `i` is not initialized. In my case this caused the loop not to be evaluated at all. With the proper fix in place, the result looks better:

#include <Rcpp.h>
using namespace Rcpp;

// [[Rcpp::export]]
// Same computation as the question's function, with the counting loop's
// index properly initialised.
//
// y, g: numeric vectors; both are indexed from 0 to y.size() - 1.
// Returns the accumulated value as a float. For every distinct value of y
// and every record, the intermediate proportion p is printed to Rcpp::Rcout.
float gini_index(NumericVector y, NumericVector g){
  const int total = y.size();
  NumericVector labels = unique(y);
  const int label_count = labels.size();

  float result = 0.0;
  for (int k = 0; k < label_count; ++k) {
    const double label = labels[k];

    // How many entries of g carry the current value.
    float members = 0;
    for (int j = 0; j < total; ++j) {
      if (g[j] == label) {
        members += 1;
      }
    }

    // One term per record: the per-record indicator (0 or 1) divided by
    // the member count, or 0 when there are no members.
    float acc = 0.0;
    for (int j = 0; j < total; ++j) {
      const float hit = (y[j] == label) ? 1 : 0;
      const float p = (members == 0) ? 0 : hit / members;
      Rcpp::Rcout << p << std::endl;
      acc += p * p;
    }

    // Weight by the share of records that belong to this value.
    result += (1.0 - acc) * (members / total);
  }
  return result;
}

/*** R
y<-c(1,1,2,2)
g<-c(1,2,1,2)
gini_index(y,g)
*/

Result:

> gini_index(y,g)
0 2
0
0
0.5
0.5
1 2
0.5
0.5
0
0
[1] 0.5

BTW:

  • Why are you returning float instead of double?
  • Increments should be written as ++i etc.
  • C++ has +=.
Ralf Stubner
  • 26,263
  • 3
  • 40
  • 75
  • Thank you....based on this https://stackoverflow.com/questions/1074474/should-i-use-double-or-float .... "whereas floats take up less memory and are faster. In general you should use float unless you have a case where it isn't accurate enough. " – user2723494 Jun 27 '18 at 16:22
  • @user2723494 That's from a 0-score answer, while the top rated answers advocate that there is not much difference (if any at all) on CPU. Besides, R's `numeric` type corresponds to a `double`. So while you might use `float` in C++, the result will be converted to `double` on its way to R. – Ralf Stubner Jun 27 '18 at 16:27
  • thank you. Ironically when you google, "float instead of double" the line I quoted is the google auto-answer. – user2723494 Jun 27 '18 at 16:47
0

This is what I ended up using, if anyone tries to recreate. The answer I originally accepted was very helpful but printed out errant answers in other scenarios.

// Size-weighted Gini index of a proposed grouping.
//
// y: class value of each record.
// g: proposed group of each record. Must have the same length as y;
//    this is assumed, not checked.
//
// For every distinct group in g, computes 1 - sum_c(p_c^2), where p_c is the
// fraction of that group's records whose class is c, and adds it to the
// result weighted by the group's share of all records.
// Returns 0 for empty input.
//
// The groups are enumerated from the distinct values of g. Enumerating them
// from the distinct values of y silently drops every group whose label is
// not also a class label (e.g. y = 1,0,1,0 with g = 1,1,2,2).
float gini_index(NumericVector y, NumericVector g){
  const int n_instances = y.size();
  NumericVector classes = unique(y);
  NumericVector groups = unique(g);
  const int class_count = classes.size();
  const int group_count = groups.size();

  // Accumulate in double; only the returned value is narrowed to float.
  double gini = 0.0;
  for(int group=0; group<group_count; ++group){
    // Number of records assigned to this group.
    double size = 0.0;
    for(int i=0; i<n_instances; ++i){
      if(g[i]==groups[group]){
        size += 1.0;
      }
    }
    // A label that never compares equal to itself (NaN) yields an empty
    // group; skip it to avoid dividing by zero.
    if(size==0.0){
      continue;
    }

    double score = 0.0;
    for(int class_val=0; class_val<class_count; ++class_val){
      // Records that are in this group and have this class.
      double matches = 0.0;
      for(int i=0; i<n_instances; ++i){
        if(g[i]==groups[group] && y[i]==classes[class_val]){
          matches += 1.0;
        }
      }
      const double p = matches/size;
      score += p*p;
    }
    // Weight the group's impurity by its relative size.
    gini += (1.0-score)*(size/n_instances);
  }
  return static_cast<float>(gini);
}
user2723494
  • 1,168
  • 2
  • 15
  • 26