0

I have a dataframe as follows:

Destination    User     User_Price 
     A          a           5
     A          b           4
     B          c           6
     B          a           5
     C          b           4
     C          d           7

I want to convert this into a matrix that shows what destination a user has hit that looks like this:

   User    User_Price    A    B    C    
    a          5         1    1    0
    b          4         1    0    1
    c          6         0    1    0
    d          7         0    0    1
nak5120
  • 4,089
  • 4
  • 35
  • 94
  • 2
    Essentially `table(foo[c("User", "Destination")])` if you are just looking to use this as a summary table. – thelatemail May 30 '16 at 02:27
  • Relevant question - with relevant links attached to that question: http://stackoverflow.com/questions/11659128/how-to-use-cast-or-another-function-to-create-a-binary-table-in-r/11659636 – thelatemail May 30 '16 at 02:30

4 Answers4

6

One way with the dplyr and tidyr package would be:

library(dplyr)
library(tidyr)

count(foo, User, User_Price, Destination) %>%
spread(key = Destination, value = n, fill = 0)

#    User User_Price     A     B     C
#  (fctr)      (int) (dbl) (dbl) (dbl)
#1      a          5     1     1     0
#2      b          4     1     0     1
#3      c          6     0     1     0
#4      d          7     0     0     1

If you need a matrix, you can convert this outcome (data frame) to matrix.

DATA

foo <- structure(list(Destination = structure(c(1L, 1L, 2L, 2L, 3L, 
3L), .Label = c("A", "B", "C"), class = "factor"), User = structure(c(1L, 
2L, 3L, 1L, 2L, 4L), .Label = c("a", "b", "c", "d"), class = "factor"), 
User_Price = c(5L, 4L, 6L, 5L, 4L, 7L)), .Names = c("Destination", 
"User", "User_Price"), class = "data.frame", row.names = c(NA, 
-6L))
jazzurro
  • 23,179
  • 35
  • 66
  • 76
2

Here is an option using data.table

library(data.table)
dcast(setDT(foo),User + User_Price ~ Destination, length, value.var="Destination")
#   User User_Price A B C
#1:    a          5 1 1 0
#2:    b          4 1 0 1
#3:    c          6 0 1 0
#4:    d          7 0 0 1
akrun
  • 874,273
  • 37
  • 540
  • 662
1

This looks very similar to a normal reshaping operation, except with some idiosyncrasies that require several lines of code to achieve in base R.

First, for reference and comparison purposes, here's what the minimalist reshape() call produces:

df <- data.frame(Destination=c('A','A','B','B','C','C'),User=c('a','b','c','a','b','d'),User_Price=c(5L,4L,6L,5L,4L,7L),stringsAsFactors=F);
reshape(df,dir='w',idvar='User',timevar='Destination');
##   User User_Price.A User_Price.B User_Price.C
## 1    a            5            5           NA
## 2    b            4           NA            4
## 3    c           NA            6           NA
## 6    d           NA           NA            7

Clearly there are several issues that must be addressed before we can arrive at the required output:

  • We must compute the required singular User_Price column from the multiple widened columns.
  • We must replace NA prices with 0.
  • We must replace non-NA prices with 1.
  • We must fix the column names to omit the User_Price. prefix.

Here's a complete solution, using df from above:

res <- reshape(df,dir='w',idvar='User',timevar='Destination');
pre <- '^User_Price\\.';
cis <- grep(pre,names(res));
res$User_Price <- do.call(pmax,c(res[cis],na.rm=T));
names(res)[cis] <- sub(pre,'',names(res)[cis]);
nas <- is.na(res[cis]);
res[cis][nas] <- 0;
res[cis][!nas] <- 1;
res;
  User A B C User_Price
1    a 1 1 0          5
2    b 1 0 1          4
3    c 0 1 0          6
6    d 0 0 1          7

Benchmarking

library(microbenchmark);
library(dplyr);
library(tidyr);
library(data.table);

bgoldst <- function(df) { res <- reshape(df,dir='w',idvar='User',timevar='Destination'); pre <- '^User_Price\\.'; cis <- grep(pre,names(res)); res$User_Price <- do.call(pmax,c(res[cis],na.rm=T)); names(res)[cis] <- sub(pre,'',names(res)[cis]); nas <- is.na(res[cis]); res[cis][nas] <- 0; res[cis][!nas] <- 1; res; };
thelatemail <- function(df) { x <- table(df[,c('User','Destination')]); data.frame(User=rownames(x),User_Price=df[match(rownames(x),df$User),'User_Price'],unclass(x)); };
jazzurro <- function(foo) { count(foo, User, User_Price, Destination) %>% spread(key = Destination, value = n, fill = 0); };
akrun <- function(foo) dcast(setDT(foo),User + User_Price ~ Destination, length, value.var="Destination");

## OP's test case
df <- data.frame(Destination=c('A','A','B','B','C','C'),User=c('a','b','c','a','b','d'),User_Price=c(5L,4L,6L,5L,4L,7L));
dt <- as.data.table(df);

ex <- bgoldst(df); o <- names(ex); us <- ex$User;
all.equal(ex,thelatemail(df)[us,o],check.attributes=F);
## [1] TRUE
all.equal(ex,jazzurro(df)[us,o],check.attributes=F);
## [1] TRUE
all.equal(ex,as.data.frame(akrun(dt))[us,o],check.attributes=F);
## [1] TRUE

microbenchmark(bgoldst(df),thelatemail(df),jazzurro(df),akrun(dt));
## Unit: microseconds
##             expr      min       lq      mean   median        uq      max neval
##      bgoldst(df) 1767.488 1897.281 2021.7741 1943.894 2035.6260 5227.196   100
##  thelatemail(df)  473.412  536.063  574.4233  578.186  608.1225  738.129   100
##     jazzurro(df) 2707.468 2914.666 3145.7258 3032.270 3160.3515 5677.514   100
##        akrun(dt) 4403.964 4721.069 5026.5023 4875.238 5028.1230 7703.303   100

## scale test
set.seed(1L);
ND <- 1e3L; NU <- 1e3L; NR <- 1e4L;
dests <- sample(make.unique(rep(LETTERS,len=ND)),NR,T);
us <- make.unique(rep(letters,len=NU));
users <- ave(dests,dests,FUN=function(x) sample(us,length(x)));
prices <- ave(seq_along(users),users,FUN=function(x) rep(sample(1:9,1L),len=length(x)));
df <- data.frame(Destination=dests,User=users,User_Price=prices);
dt <- as.data.table(df);

ex <- bgoldst(df); o <- names(ex); us <- ex$User;
all.equal(ex,thelatemail(df)[us,o],check.attributes=F);
## [1] TRUE
all.equal(ex,jazzurro(df)[us,o],check.attributes=F);
## [1] TRUE
all.equal(ex,as.data.frame(akrun(dt))[us,o],check.attributes=F);
## [1] TRUE

microbenchmark(bgoldst(df),thelatemail(df),jazzurro(df),akrun(dt),times=10L);
## Unit: milliseconds
##             expr        min         lq       mean     median         uq        max neval
##      bgoldst(df) 1381.46461 1418.13922 1445.20568 1437.82683 1474.79075 1538.37153    10
##  thelatemail(df)   31.84727   37.56498   57.47417   44.54106   82.39749   92.63933    10
##     jazzurro(df)   79.18924   91.20755  117.20360  126.22693  136.13885  168.26623    10
##        akrun(dt)   52.06625   59.02158   79.59568   70.09136  106.93019  130.31208    10

## scale test 2
set.seed(1L);
ND <- 1e4L; NU <- 1e4L; NR <- 1e6L;
dests <- sample(make.unique(rep(LETTERS,len=ND)),NR,T);
us <- make.unique(rep(letters,len=NU));
users <- ave(dests,dests,FUN=function(x) sample(us,length(x)));
prices <- ave(seq_along(users),users,FUN=function(x) rep(sample(1:9,1L),len=length(x)));
df <- data.frame(Destination=dests,User=users,User_Price=prices);
dt <- as.data.table(df);

ex <- bgoldst(df); o <- names(ex); us <- ex$User;
all.equal(ex,thelatemail(df)[us,o],check.attributes=F);
## [1] TRUE
all.equal(ex,jazzurro(df)[us,o],check.attributes=F);
## [1] TRUE
all.equal(ex,as.data.frame(akrun(dt))[us,o],check.attributes=F);
## [1] TRUE

microbenchmark(bgoldst(df),thelatemail(df),jazzurro(df),akrun(dt),times=1L);
## Unit: seconds
##             expr        min         lq       mean     median         uq        max neval
##      bgoldst(df) 485.849043 485.849043 485.849043 485.849043 485.849043 485.849043     1
##  thelatemail(df)   3.377981   3.377981   3.377981   3.377981   3.377981   3.377981     1
##     jazzurro(df)  12.858542  12.858542  12.858542  12.858542  12.858542  12.858542     1
##        akrun(dt)   4.132785   4.132785   4.132785   4.132785   4.132785   4.132785     1
bgoldst
  • 34,190
  • 6
  • 38
  • 64
  • I am not sure `table` works that efficient when the number of rows increases to around 1e7 or so. – akrun May 30 '16 at 05:13
  • @akrun I added a benchmark with 1e6 rows. I tried 1e7 but got the error `Error in table(df[, c("User", "Destination")]) : attempt to make a table with >= 2^31 elements` when running `thelatemail()`. – bgoldst May 30 '16 at 15:05
0

Another way to achieve the same is using dcast.

a <- dcast(foo,User + User_Price ~ Destination,fill=0)

Later change the values for the Destination columns

karthikbharadwaj
  • 368
  • 1
  • 7
  • 17