Creating a panel data frame based on two other dataframes

Question

I have two data frames:

ANNUALSALARY <- structure(list(FIRM = structure(1:3, .Label = c("A", "B", "C"), class = "factor"), SLY_ADMIN = c(0.1, 0.2, 0.3), SLY_MKT = c(0.5, 0.003,0.3), SLY_FIN = c(0.11, 0.12, 0.03)), .Names = c("FIRM", "SLY_ADMIN", "SLY_MKT", "SLY_FIN"), row.names = c(NA, -3L), class = "data.frame")

and:

WEEKLYPRODUCTIVITY <- structure(list(FIRM = structure(c(1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L), .Label = c("A", "B", "C"), class = "factor"), WEEKS = structure(c(1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L), .Label = c("1", "2", "3", "4", "5"), class = "factor"), PR_ADMIN = c(1, 5, 4, 3, 2, 1, 4, 2, 4, 2, 3, 1, 4, 5, 5), Z_ADMIN = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 1, 2, 3, 4, 5, 6), PR_MKT = c(0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2), Z_MKT = c(9, 8, 7, 6, 5, 4, 3, 2, 1, 9, 8, 7, 6, 5, 4), PR_FIN = c(5, 4, 3, 2, 1, 5, 4, 3, 2, 1, 5, 4, 3, 2, 1), Z_FIN = c(1, 2, 3, 4, 5, 5, 4, 3, 2, 1, 1, 2, 3, 4, 5)), .Names = c("FIRM", "WEEKS", "PR_ADMIN", "Z_ADMIN", "PR_MKT", "Z_MKT", "PR_FIN", "Z_FIN"), row.names = c(NA, 15L), class = c("plm.dim", "data.frame"))

I am interested in creating a data frame that takes minimum out of SLY_ADMIN, SLY_MKT, and SLY_FIN for each FIRM. Then takes the corresponding value out of PR_ADMIN, PR_MKT, and PR_FIN as well as ot of Z_ADMIN, Z_MKT, and Z_FIN. e.g. if SLY_MKT is minimum for FIRM A then it returns the PR_MKT and Z_MKT for 5 WEEKS. The panel data frame would look like this (I have created it manually):

REQUIRED <- structure(list(FIRM = structure(c(1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L), .Label = c("A", "B", "C"), class = "factor"),WEEKS = structure(c(1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L), .Label = c("1", "2", "3", "4", "5"), class = "factor"), PR = c(1, 5, 4, 3, 2, 5, 0, 1, 2, 3, 5, 4, 3, 2, 1), MIN_SLY = c(0.1, 0.1, 0.1, 0.1, 0.1, 0.003, 0.003, 0.003, 0.003, 0.003, 0.03, 0.03, 0.03, 0.03, 0.03), SLY_DEPT = structure(c(1L, 1L, 1L, 1L, 1L, 3L, 3L, 3L, 3L, 3L, 2L, 2L, 2L, 2L, 2L), .Label = c("SLY_ADMIN", "SLY_FIN", "SLY_MKT"), class = "factor"), Z = c(1, 2, 3, 4, 5, 4, 3, 2, 1, 9, 1, 2, 3, 4, 5)), .Names = c("FIRM", "WEEKS", "PR", "MIN_SLY", "SLY_DEPT", "Z"), row.names = c(NA, 15L), class = c("plm.dim", "data.frame"))

Please help. Thanks

akrun · Accepted Answer · 2016-05-15T09:50:54.120

We can use data.table. Get the index of the minimum value of the numeric columns in the "ANNUALSALARY" using max.col. Then, we convert the 'data.frame' to 'data.table' and melt it from 'wide' to 'long' format, get the "MIN_SLY" and "S

library(data.table)
i1 <- max.col(-1*ANNUALSALARY[-1])
dN <- melt(setDT(ANNUALSALARY), id.var = "FIRM", value.name = "MIN_SLY", 
   variable.name = "SLY_DEPT")[ , .SD[which.min(MIN_SLY)], by = FIRM]
setDT(WEEKLYPRODUCTIVITY)

Or instead of melting, we can create the 'data.table' using the 'i1'

dN <- data.table(FIRM= ANNUALSALARY$FIRM, 
                MIN_SLY=as.data.frame(ANNUALSALARY)[-1][cbind(1:nrow(ANNUALSALARY), i1)], 
                SLY_DEPT = names(ANNUALSALARY)[-1][i1])

Then, we join the 'dN' by 'WEEKLYPRODUCTIVITY' and melt to 'long' format based on the patterns in column names. We order by 'FIRM', 'variable', 'WEEKS', create a grouping variable ('gr1') based on the "WEEKS" value, grouped by 'FIRM'.

dN2 <- melt(dN[WEEKLYPRODUCTIVITY, on = "FIRM"], measure = patterns("^PR", "^Z"), 
    value.name = c("PR", "Z"))[order(FIRM, variable, WEEKS)
       ][, gr1 := cumsum(WEEKS==1), FIRM][]

Finally, we join with a data.table' created using 'i1', on "FIRM", subset the rows where 'gr1' is equal to 'i1', and select the columns of interest.

res <- data.table(FIRM= ANNUALSALARY$FIRM, i1)[dN2, on = "FIRM"
            ][gr1==i1][,names(REQUIRED), with = FALSE]

all.equal(as.data.frame(res), REQUIRED, check.attributes=FALSE)
#[1] TRUE
res
#    FIRM WEEKS PR MIN_SLY  SLY_DEPT Z
# 1:    A     1  1   0.100 SLY_ADMIN 1
# 2:    A     2  5   0.100 SLY_ADMIN 2
# 3:    A     3  4   0.100 SLY_ADMIN 3
# 4:    A     4  3   0.100 SLY_ADMIN 4
# 5:    A     5  2   0.100 SLY_ADMIN 5
# 6:    B     1  5   0.003   SLY_MKT 4
# 7:    B     2  0   0.003   SLY_MKT 3
# 8:    B     3  1   0.003   SLY_MKT 2
# 9:    B     4  2   0.003   SLY_MKT 1
#10:    B     5  3   0.003   SLY_MKT 9
#11:    C     1  5   0.030   SLY_FIN 1
#12:    C     2  4   0.030   SLY_FIN 2
#13:    C     3  3   0.030   SLY_FIN 3
#14:    C     4  2   0.030   SLY_FIN 4
#15:    C     5  1   0.030   SLY_FIN 5

when I run the dN2 line :Error in `[.data.table`(dN, WEEKLYPRODUCTIVITY, on = "FIRM") : logical error. i is not a data.table, but 'on' argument is provided. — Polar Bear, May 15 '16 at 09:46
@PolarBear I am not getting any error. I use `data.table_1.9.6` — akrun, May 15 '16 at 09:48

Jaap · Answer 2 · 2016-05-15T09:48:46.510

A different approach, but also using the data.table package:

library(data.table)
# convert the dataframes to datatables (which is an enhanced form of dataframe)
setDT(ANNUALSALARY)
setDT(WEEKLYPRODUCTIVITY)

# join them on 'FIRM'
res <- WEEKLYPRODUCTIVITY[ANNUALSALARY, on = 'FIRM']
# create a convenience vector with the columnnames starting with 'SLY_
sly.cols <- grep('^SLY_', names(res), value = TRUE)

# create the 'MIN_SLY' & 'SLY_DEPT' columns
res[, `:=` (MIN_SLY = min(.SD),
            SLY_DEPT = sly.cols[which.min(.SD)]), 
    by = 1:nrow(res), .SDcols = sly.cols][]

# melt it in log format and create the 'PR' & 'Z' column
res2 <- melt(res, id = c('FIRM','WEEKS','MIN_SLY','SLY_DEPT'), 
             measure.vars = patterns('^PR_','^Z_'),
             value.name = c('PR','Z'))[, variable := c('ADMIN','MKT','FIN')[variable]
                                       ][, `:=` (PR = PR[sub('^SLY_','',SLY_DEPT) == variable],
                                                 Z = Z[sub('^SLY_','',SLY_DEPT) == variable]), 
                                         by = .(FIRM,WEEKS)
                                         ][, variable := NULL]

# removing the duplicates
res2 <- res2[!duplicated(res2)]

which results in:

> res2
    FIRM WEEKS MIN_SLY  SLY_DEPT PR Z
 1:    A     1   0.100 SLY_ADMIN  1 1
 2:    A     2   0.100 SLY_ADMIN  5 2
 3:    A     3   0.100 SLY_ADMIN  4 3
 4:    A     4   0.100 SLY_ADMIN  3 4
 5:    A     5   0.100 SLY_ADMIN  2 5
 6:    B     1   0.003   SLY_MKT  5 4
 7:    B     2   0.003   SLY_MKT  0 3
 8:    B     3   0.003   SLY_MKT  1 2
 9:    B     4   0.003   SLY_MKT  2 1
10:    B     5   0.003   SLY_MKT  3 9
11:    C     1   0.030   SLY_FIN  5 1
12:    C     2   0.030   SLY_FIN  4 2
13:    C     3   0.030   SLY_FIN  3 3
14:    C     4   0.030   SLY_FIN  2 4
15:    C     5   0.030   SLY_FIN  1 5

do you know any known issues when datatables are used with plm package? — Polar Bear, May 15 '16 at 15:14
@PolarBear I never used that package before. Have you run into a problem with it? If so, it's probaly best to ask a new question. — Jaap, May 15 '16 at 15:17
Please have a look at this question: http://stackoverflow.com/questions/37293841/r-matching-aggregate-values-with-daily-values-by-repetition — Polar Bear, May 18 '16 at 17:41
@PolarBear If you think that question is not a duplicate: I can reopen it, but you will have to undelete it first. — Jaap, May 20 '16 at 08:23
Thanks. Since I have deleted it , I am posting link to the new question http://stackoverflow.com/questions/37313500/rrepeating-aggregate-values-for-daily-values-in-a-panel-setting — Polar Bear, May 20 '16 at 11:01

score 2 · Answer 3 · answered May 15 '16 at 10:56

This was a tricky problem! I've come up with a base R solution built around max.col(), merge(), and index matrices.

Note that for concision I used the variable names sal and prod.

sufs <- c('ADMIN','MKT','FIN');
slys <- paste0('SLY_',sufs);
mins <- max.col(-sal[slys]);
res <- merge(prod[,c('FIRM','WEEKS')],cbind(sal[,'FIRM',drop=F],SLY_DEPT=slys[mins],MIN_SLY=sal[slys][cbind(seq_len(nrow(sal)),mins)]));
res.sufs <- sub('.*_','',res$SLY_DEPT);
for (pre in c('PR','Z')) { pre.cns <- paste0(pre,'_',sufs); res[[pre]] <- prod[pre.cns][cbind(seq_len(nrow(prod)),match(paste0(pre,'_',res.sufs),pre.cns))]; };

res;
##    FIRM WEEKS  SLY_DEPT MIN_SLY PR Z
## 1     A     1 SLY_ADMIN   0.100  1 1
## 2     A     2 SLY_ADMIN   0.100  5 2
## 3     A     3 SLY_ADMIN   0.100  4 3
## 4     A     4 SLY_ADMIN   0.100  3 4
## 5     A     5 SLY_ADMIN   0.100  2 5
## 6     B     1   SLY_MKT   0.003  5 4
## 7     B     2   SLY_MKT   0.003  0 3
## 8     B     3   SLY_MKT   0.003  1 2
## 9     B     4   SLY_MKT   0.003  2 1
## 10    B     5   SLY_MKT   0.003  3 9
## 11    C     1   SLY_FIN   0.030  5 1
## 12    C     2   SLY_FIN   0.030  4 2
## 13    C     3   SLY_FIN   0.030  3 3
## 14    C     4   SLY_FIN   0.030  2 4
## 15    C     5   SLY_FIN   0.030  1 5

Benchmarking

## libraries
library(data.table);
library(microbenchmark);

## define inputs, including data.table instances for akrun and maximus solutions
sal <- structure(list(FIRM = structure(1:3, .Label = c("A", "B", "C"), class = "factor"), SLY_ADMIN = c(0.1, 0.2, 0.3), SLY_MKT = c(0.5, 0.003,0.3), SLY_FIN = c(0.11, 0.12, 0.03)), .Names = c("FIRM", "SLY_ADMIN", "SLY_MKT", "SLY_FIN"), row.names = c(NA, -3L), class = "data.frame");
prod <- structure(list(FIRM = structure(c(1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L), .Label = c("A", "B", "C"), class = "factor"), WEEKS = structure(c(1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L), .Label = c("1", "2", "3", "4", "5"), class = "factor"), PR_ADMIN = c(1, 5, 4, 3, 2, 1, 4, 2, 4, 2, 3, 1, 4, 5, 5), Z_ADMIN = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 1, 2, 3, 4, 5, 6), PR_MKT = c(0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2), Z_MKT = c(9, 8, 7, 6, 5, 4, 3, 2, 1, 9, 8, 7, 6, 5, 4), PR_FIN = c(5, 4, 3, 2, 1, 5, 4, 3, 2, 1, 5, 4, 3, 2, 1), Z_FIN = c(1, 2, 3, 4, 5, 5, 4, 3, 2, 1, 1, 2, 3, 4, 5)), .Names = c("FIRM", "WEEKS", "PR_ADMIN", "Z_ADMIN", "PR_MKT", "Z_MKT", "PR_FIN", "Z_FIN"), row.names = c(NA, 15L), class = c("plm.dim", "data.frame"));
sal.dt <- as.data.table(sal);
prod.dt <- as.data.table(prod);

## solutions
bgoldst <- function(sal,prod) { sufs <- c('ADMIN','MKT','FIN'); slys <- paste0('SLY_',sufs); mins <- max.col(-sal[slys]); res <- merge(prod[,c('FIRM','WEEKS')],cbind(sal[,'FIRM',drop=F],SLY_DEPT=slys[mins],MIN_SLY=sal[slys][cbind(seq_len(nrow(sal)),mins)])); res.sufs <- sub('.*_','',res$SLY_DEPT); for (pre in c('PR','Z')) { pre.cns <- paste0(pre,'_',sufs); res[[pre]] <- prod[pre.cns][cbind(seq_len(nrow(prod)),match(paste0(pre,'_',res.sufs),pre.cns))]; }; res; };
akrun <- function(ANNUALSALARY,WEEKLYPRODUCTIVITY) { i1 <- max.col(-1*ANNUALSALARY[,-1,with=F]); dN <- data.table(FIRM= ANNUALSALARY$FIRM, MIN_SLY=as.data.frame(ANNUALSALARY)[-1][cbind(1:nrow(ANNUALSALARY), i1)], SLY_DEPT = names(ANNUALSALARY)[-1][i1]); dN2 <- melt(dN[WEEKLYPRODUCTIVITY, on = "FIRM"], measure = patterns("^PR", "^Z"), value.name = c("PR", "Z"))[order(FIRM, variable, WEEKS)][, gr1 := cumsum(WEEKS==1), FIRM][]; res <- data.table(FIRM= ANNUALSALARY$FIRM, i1)[dN2, on = "FIRM"][gr1==i1]; res[,!names(res)%in%c('i1','variable','gr1'),with=F]; };
maximus <- function(ANNUALSALARY,WEEKLYPRODUCTIVITY) { res <- WEEKLYPRODUCTIVITY[ANNUALSALARY, on = 'FIRM']; sly.cols <- grep('^SLY_', names(res), value = TRUE); res[, `:=` (MIN_SLY = min(.SD), SLY_DEPT = sly.cols[which.min(.SD)]), by = 1:nrow(res), .SDcols = sly.cols][]; res2 <- melt(res, id = c('FIRM','WEEKS','MIN_SLY','SLY_DEPT'), measure.vars = patterns('^PR_','^Z_'), value.name = c('PR','Z'))[, variable := c('ADMIN','MKT','FIN')[variable]][, `:=` (PR = PR[sub('^SLY_','',SLY_DEPT) == variable], Z = Z[sub('^SLY_','',SLY_DEPT) == variable]), by = .(FIRM,WEEKS)][, variable := NULL]; res2 <- res2[!duplicated(res2)]; };

## proofs of equivalence
ex <- bgoldst(sal,prod); co <- names(ex);
identical(ex,transform(as.data.frame(akrun(sal.dt,prod.dt))[co],SLY_DEPT=factor(SLY_DEPT)));
## [1] TRUE
identical(ex,transform(as.data.frame(maximus(sal.dt,prod.dt))[co],SLY_DEPT=factor(SLY_DEPT)));
## [1] TRUE

## benchmark
microbenchmark(bgoldst(sal,prod),akrun(sal.dt,prod.dt),maximus(sal.dt,prod.dt));
## Unit: milliseconds
##                      expr      min       lq     mean   median       uq       max neval
##        bgoldst(sal, prod) 1.639193 1.730070 1.883285 1.807047 1.881031  3.230917   100
##    akrun(sal.dt, prod.dt) 6.392125 6.666251 7.744077 6.901033 7.230752 53.621663   100
##  maximus(sal.dt, prod.dt) 5.002254 5.229979 5.853681 5.423492 6.034609 12.182544   100

Thanks for the base R solution. Is there generally any problem when datatables are used with plm package? I had been thinking about this for a long time. — Polar Bear, May 15 '16 at 11:00
You're welcome. I've never used the plm package, so I'm the wrong guy to ask. You can try asking akrun, he's very knowledgeable about R. — bgoldst, May 15 '16 at 11:03
@bgoldst Kindly have a look on this similar question: http://stackoverflow.com/questions/37293841/r-matching-aggregate-values-with-daily-values-by-repetition-to-form-panel-datas — Polar Bear, May 18 '16 at 18:00

Creating a panel data frame based on two other dataframes

3 Answers3

Benchmarking