I usually have to work with big spatial data, where high speed and memory efficiency are expected. Suppose I want to modify some numeric columns of a data frame with a self-defined function in Rcpp; I am confused about the reference and copy mechanisms of C++ and Rcpp. Using the three minimal examples below, could you please help me clarify the following questions:
Is updateDF3 the best function for such a task, with the highest speed and lowest memory use? This function is modified from a similar question here, but I do not understand the warning given by its author: "There are issues associated with this approach. Your original data frame and the one you created share the same vectors and so bad things can happen." If I use this approach only inside a helper function such as updateDF3 that is called from R, is it safe?
Why is the difference of performance of updateDF1 and updateDF2 not significant? What is the difference between passing the parameter with or without reference (&)?
Is the function poorly coded, and is there a better way to write parts such as DataFrame out=clone(df) and tmpstr=as<std::string>(colnames[v])?
Thanks in advance.
#include <Rcpp.h>
#include <iostream>
using namespace Rcpp;
using namespace std;
// Reports whether any element of the character vector `x` compares equal to
// the string `y`.
//
// x: character vector to search.
// y: string to look for.
// Returns true on the first match, false when no element matches (including
// when `x` is empty).
// [[Rcpp::export]]
bool contains(CharacterVector x, std::string y) {
    const CharacterVector::iterator last = x.end();
    const CharacterVector::iterator hit = std::find(x.begin(), last, y);
    return hit != last;
}
// Returns a deep copy of `df` in which every column named in `vars` has been
// replaced by that column's values plus 1.0, held as a numeric vector.
//
// df:   input data frame. It is only read; its columns are never written to.
// vars: names of the columns to update. When NULL (the default) the deep copy
//       is returned unchanged. Every name must be a column of `df`, because
//       each one is looked up with `df[name]`.
// [[Rcpp::export]]
DataFrame updateDF1(DataFrame df, Nullable<Rcpp::CharacterVector> vars=R_NilValue) {
    DataFrame out = clone(df);
    if (vars.isNotNull()) {
        CharacterVector selvars(vars);
        for (R_xlen_t v = 0; v < selvars.size(); ++v) {
            const std::string name = as<std::string>(selvars[v]);
            // `source` may share storage with the caller's column (no copy is
            // made when the column is already numeric).
            const NumericVector source = df[name];
            // Build the result with a constructor instead of assigning back
            // into `source`: assigning a same-length sugar expression to an
            // existing Rcpp vector overwrites its storage in place, which
            // would silently modify the caller's data frame.
            const NumericVector updated = source + 1.0;
            out[name] = updated;
        }
    }
    return out;
}
// Returns a deep copy of `df` in which every column named in `vars` has been
// replaced by that column's values plus 1.0, held as a numeric vector.
// This variant receives the data frame by reference.
//
// df:   input data frame. It is only read; its columns are never written to.
// vars: names of the columns to update. When NULL (the default) the deep copy
//       is returned unchanged. Every name must be a column of `df`, because
//       each one is looked up with `df[name]`.
// [[Rcpp::export]]
DataFrame updateDF2(DataFrame& df, Nullable<Rcpp::CharacterVector> vars=R_NilValue) {
    DataFrame out = clone(df);
    if (vars.isNotNull()) {
        CharacterVector selvars(vars);
        for (R_xlen_t v = 0; v < selvars.size(); ++v) {
            const std::string name = as<std::string>(selvars[v]);
            // `source` may share storage with the caller's column (no copy is
            // made when the column is already numeric).
            const NumericVector source = df[name];
            // Build the result with a constructor instead of assigning back
            // into `source`: assigning a same-length sugar expression to an
            // existing Rcpp vector overwrites its storage in place, which
            // would silently modify the caller's data frame.
            const NumericVector updated = source + 1.0;
            out[name] = updated;
        }
    }
    return out;
}
// Builds a new list carrying the "class", "row.names" and "names" attributes
// of `df`, in which every column named in `vars` is replaced by that column's
// values plus 1.0 (held as a numeric vector) and every other column is placed
// in the result as-is.
//
// Columns that are not selected are NOT copied: the result and `df` refer to
// the same column vectors. Only the three attributes listed above are carried
// over to the result.
//
// df:   input data frame. It is only read; its columns are never written to.
// vars: names of the columns to update. When NULL (the default) no column is
//       updated and all columns are passed through.
// [[Rcpp::export]]
List updateDF3(DataFrame& df, Nullable<Rcpp::CharacterVector> vars=R_NilValue) {
    const R_xlen_t n_cols = df.size();
    List out(n_cols);
    CharacterVector colnames = df.attr("names");

    // Resolve the optional selection once instead of on every iteration.
    const bool has_selection = vars.isNotNull();
    CharacterVector selvars;  // stays empty when no selection was supplied
    if (has_selection) {
        selvars = CharacterVector(vars);
    }

    for (R_xlen_t v = 0; v < n_cols; ++v) {
        // Columns are addressed by position, so the pass-through branch does
        // not depend on a name lookup and duplicate column names cannot
        // resolve to the wrong column.
        if (has_selection && contains(selvars, as<std::string>(colnames[v]))) {
            // `source` may share storage with the caller's column (no copy is
            // made when the column is already numeric).
            const NumericVector source = df[v];
            // Build the result with a constructor instead of assigning back
            // into `source`: assigning a same-length sugar expression to an
            // existing Rcpp vector overwrites its storage in place, which
            // would silently modify the caller's data frame.
            const NumericVector updated = source + 1.0;
            out[v] = updated;
        } else {
            out[v] = df[v];
        }
    }

    out.attr("class") = df.attr("class");
    out.attr("row.names") = df.attr("row.names");
    out.attr("names") = df.attr("names");
    return out;
}
/*** R
# Benchmark driver: this chunk is R code embedded in the C++ file.
# Build a data frame from the integers 1..120,000,000 arranged in
# 10,000,000 rows (i.e. 12 integer columns).
df=as.data.frame(matrix(1:120000000,nrow=10000000))
# Name the columns band_1 ... band_12.
names(df)=paste("band",1:ncol(df),sep="_")
# Prepend a constant column `x` holding "charcol"; it is excluded from the
# update below via names(df)[-1].
df=cbind(x="charcol",df)
# Time each variant 10 times on every column except the first. `<<-` is used
# so that x1, x2 and x3 remain available after the benchmark for the
# comparisons that follow.
microbenchmark::microbenchmark(
x1<<-updateDF1(df,vars=names(df)[-1]),
x2<<-updateDF2(df,vars=names(df)[-1]),
x3<<-updateDF3(df,vars=names(df)[-1]),
times=10
)
# Check that the three variants produce identical results.
identical(x1,x2)
identical(x1,x3)
*/
##performance
#Unit: milliseconds
# expr min lq mean median
# x1 <<- updateDF1(df, vars = names(df)[-1]) 587.6023 604.9242 711.8981 651.1242
# x2 <<- updateDF2(df, vars = names(df)[-1]) 581.7129 641.2876 882.9999 766.9354
# x3 <<- updateDF3(df, vars = names(df)[-1]) 406.1824 417.5892 542.2559 420.8485