This looks very similar to a normal reshaping operation, except with some idiosyncrasies that require several lines of code to achieve in base R.
First, for reference and comparison purposes, here's what the minimalist reshape()
call produces:
df <- data.frame(Destination=c('A','A','B','B','C','C'),User=c('a','b','c','a','b','d'),User_Price=c(5L,4L,6L,5L,4L,7L),stringsAsFactors=F);
reshape(df,dir='w',idvar='User',timevar='Destination');
## User User_Price.A User_Price.B User_Price.C
## 1 a 5 5 NA
## 2 b 4 NA 4
## 3 c NA 6 NA
## 6 d NA NA 7
Clearly there are several issues that must be addressed before we can arrive at the required output:
- We must compute the required singular
User_Price
column from the multiple widened columns.
- We must replace NA prices with 0.
- We must replace non-NA prices with 1.
- We must fix the column names to omit the
User_Price.
prefix.
Here's a complete solution, using df
from above:
res <- reshape(df,dir='w',idvar='User',timevar='Destination');
pre <- '^User_Price\\.';
cis <- grep(pre,names(res));
res$User_Price <- do.call(pmax,c(res[cis],na.rm=T));
names(res)[cis] <- sub(pre,'',names(res)[cis]);
nas <- is.na(res[cis]);
res[cis][nas] <- 0;
res[cis][!nas] <- 1;
res;
User A B C User_Price
1 a 1 1 0 5
2 b 1 0 1 4
3 c 0 1 0 6
6 d 0 0 1 7
Benchmarking
library(microbenchmark);
library(dplyr);
library(tidyr);
library(data.table);
bgoldst <- function(df) { res <- reshape(df,dir='w',idvar='User',timevar='Destination'); pre <- '^User_Price\\.'; cis <- grep(pre,names(res)); res$User_Price <- do.call(pmax,c(res[cis],na.rm=T)); names(res)[cis] <- sub(pre,'',names(res)[cis]); nas <- is.na(res[cis]); res[cis][nas] <- 0; res[cis][!nas] <- 1; res; };
thelatemail <- function(df) { x <- table(df[,c('User','Destination')]); data.frame(User=rownames(x),User_Price=df[match(rownames(x),df$User),'User_Price'],unclass(x)); };
jazzurro <- function(foo) { count(foo, User, User_Price, Destination) %>% spread(key = Destination, value = n, fill = 0); };
akrun <- function(foo) dcast(setDT(foo),User + User_Price ~ Destination, length, value.var="Destination");
## OP's test case
df <- data.frame(Destination=c('A','A','B','B','C','C'),User=c('a','b','c','a','b','d'),User_Price=c(5L,4L,6L,5L,4L,7L));
dt <- as.data.table(df);
ex <- bgoldst(df); o <- names(ex); us <- ex$User;
all.equal(ex,thelatemail(df)[us,o],check.attributes=F);
## [1] TRUE
all.equal(ex,jazzurro(df)[us,o],check.attributes=F);
## [1] TRUE
all.equal(ex,as.data.frame(akrun(dt))[us,o],check.attributes=F);
## [1] TRUE
microbenchmark(bgoldst(df),thelatemail(df),jazzurro(df),akrun(dt));
## Unit: microseconds
## expr min lq mean median uq max neval
## bgoldst(df) 1767.488 1897.281 2021.7741 1943.894 2035.6260 5227.196 100
## thelatemail(df) 473.412 536.063 574.4233 578.186 608.1225 738.129 100
## jazzurro(df) 2707.468 2914.666 3145.7258 3032.270 3160.3515 5677.514 100
## akrun(dt) 4403.964 4721.069 5026.5023 4875.238 5028.1230 7703.303 100
## scale test
set.seed(1L);
ND <- 1e3L; NU <- 1e3L; NR <- 1e4L;
dests <- sample(make.unique(rep(LETTERS,len=ND)),NR,T);
us <- make.unique(rep(letters,len=NU));
users <- ave(dests,dests,FUN=function(x) sample(us,length(x)));
prices <- ave(seq_along(users),users,FUN=function(x) rep(sample(1:9,1L),len=length(x)));
df <- data.frame(Destination=dests,User=users,User_Price=prices);
dt <- as.data.table(df);
ex <- bgoldst(df); o <- names(ex); us <- ex$User;
all.equal(ex,thelatemail(df)[us,o],check.attributes=F);
## [1] TRUE
all.equal(ex,jazzurro(df)[us,o],check.attributes=F);
## [1] TRUE
all.equal(ex,as.data.frame(akrun(dt))[us,o],check.attributes=F);
## [1] TRUE
microbenchmark(bgoldst(df),thelatemail(df),jazzurro(df),akrun(dt),times=10L);
## Unit: milliseconds
## expr min lq mean median uq max neval
## bgoldst(df) 1381.46461 1418.13922 1445.20568 1437.82683 1474.79075 1538.37153 10
## thelatemail(df) 31.84727 37.56498 57.47417 44.54106 82.39749 92.63933 10
## jazzurro(df) 79.18924 91.20755 117.20360 126.22693 136.13885 168.26623 10
## akrun(dt) 52.06625 59.02158 79.59568 70.09136 106.93019 130.31208 10
## scale test 2
set.seed(1L);
ND <- 1e4L; NU <- 1e4L; NR <- 1e6L;
dests <- sample(make.unique(rep(LETTERS,len=ND)),NR,T);
us <- make.unique(rep(letters,len=NU));
users <- ave(dests,dests,FUN=function(x) sample(us,length(x)));
prices <- ave(seq_along(users),users,FUN=function(x) rep(sample(1:9,1L),len=length(x)));
df <- data.frame(Destination=dests,User=users,User_Price=prices);
dt <- as.data.table(df);
ex <- bgoldst(df); o <- names(ex); us <- ex$User;
all.equal(ex,thelatemail(df)[us,o],check.attributes=F);
## [1] TRUE
all.equal(ex,jazzurro(df)[us,o],check.attributes=F);
## [1] TRUE
all.equal(ex,as.data.frame(akrun(dt))[us,o],check.attributes=F);
## [1] TRUE
microbenchmark(bgoldst(df),thelatemail(df),jazzurro(df),akrun(dt),times=1L);
## Unit: seconds
## expr min lq mean median uq max neval
## bgoldst(df) 485.849043 485.849043 485.849043 485.849043 485.849043 485.849043 1
## thelatemail(df) 3.377981 3.377981 3.377981 3.377981 3.377981 3.377981 1
## jazzurro(df) 12.858542 12.858542 12.858542 12.858542 12.858542 12.858542 1
## akrun(dt) 4.132785 4.132785 4.132785 4.132785 4.132785 4.132785 1