I'm having trouble converting this ptyhon to Rcpp
The script is part of a classification and regression tree logic:
# Calculate the Gini index for a split dataset
def gini_index(groups, classes):
# count all samples at split point
n_instances = float(sum([len(group) for group in groups]))
# sum weighted Gini index for each group
gini = 0.0
for group in groups:
size = float(len(group))
# avoid divide by zero
if size == 0:
continue
score = 0.0
# score the group based on the score for each class
for class_val in classes:
p = [row[-1] for row in group].count(class_val) / size
score += p * p
# weight the group score by its relative size
gini += (1.0 - score) * (size / n_instances)
return gini
# test Gini values
print(gini_index([[[1, 1], [1, 0]], [[1, 1], [1, 0]]], [0, 1]))
>>0.5
print(gini_index([[[1, 0], [1, 0]], [[1, 1], [1, 1]]], [0, 1]))
>>0
I'm trying to speed it up by simply providing two vectors, and looping. The vectors are the actual group and proposed group. The actual group corresponds to the last element in each nested list above:
1,0,1,0
The proposed group corresponds to the way the nested lists are grouped (first two elements are in one group; the second two elements are in another)
1,1,2,2
My C++ / Rcpp is not strong enough to see what's going wrong here:
float gini_index(NumericVector y, NumericVector g){
int n_records = y.size();
float gini = 0.0;
NumericVector groups = unique(y);
int group_count = groups.size();
for(int i_g=0; i_g<group_count; i_g++){
float size = 0;
for(int i;i<n_records;i++){
if(g[i]==groups[i_g]){
size++;
}
}
float score = 0.0;
for(int i_y=0; i_y<n_records; i_y++){
float class_count=0;
if(y[i_y]==groups[i_g]){
class_count++;}
float p=0;
if(size==0){
p=0;
}
else{
p = class_count/size;
}
std::cout<<p<<std::endl;
score = score + p*p;
}
gini = gini+(1.0-score)*(size/n_records);
}
return(gini);
}
y<-c(1,1,2,2)
g<-c(1,2,1,2)
gini_index(y,g)
> gini_index(y,g)
0
0.5
0
0.5
0
0
0
0
[1] 0.25
This should return 0.5