Combine data frames by taking one row from each and still keep the same number of data frames

Question

I have 4 data frames:

> dput(dat1)
structure(list(standard = c("LFGPDLK", "GLPAPIEK", "VTTHPLAK", 
"AQGYSGLSVK", "AQTTVTCTEK", "TSPVDEK", "TLVVHEK", "TGAVSGHSLK", 
"GASQELK", "HYEGSTVPEK"), Q1 = c(399.24, 416.76, 437.76, 509.28, 
573.78, 392.2, 417.25, 482.77, 370.7, 577.78), `Imputed RT` = c(27.7927700981606, 
21.4383252210455, 14.3163703676944, 19.2089872117407, 14.3213935030202, 
13.9690939378742, 14.6465672186729, 14.1218362333025, 13.8034690838775, 
14.4475462934999)), row.names = c(NA, -10L), class = c("tbl_df", 
"tbl", "data.frame"))
> dput(dat2)
structure(list(standard = c("HLVALSPK", "DHGETAFAVYDK", "DYWSTVK", 
"SSDANLYR", "IVVVTAGVR", "FFESHVAR", "DTDLDGFPDEK", "AATVGSLAGQPLQER", 
"SPAGPTVVSIGGGK", "VAAGAFQGLR"), Q1 = c(436.78, 680.82, 453.73, 
468.23, 462.3, 501.76, 630.28, 754.41, 617.85, 500.28), `Imputed RT` = c(14.3030184, 
21.409683, 24.3425922, 10.90761198, 23.06790474, 13.38930438, 
27.3883056, 26.22642234, 23.5529628, 24.20722716)), row.names = c(NA, 
-10L), class = c("tbl_df", "tbl", "data.frame"))
> dput(dat3)
structure(list(standard = c("ELGCGAASGTPSGILYEPPAEK", "IDYGVFAK", 
"SGFSFGFK", "LPVAPLR", "LGPHAGDVEGHLSFLEK", "DFAEHLLIPR", "VGLSDAFVVVHR", 
"NFPSPVDAAFR", "GPGGVWAAEAISDAR", "VTESEIK"), Q1 = c(738.03, 
460.75, 442.72, 388.26, 605.32, 407.56, 436.92, 615.81, 733.87, 
407.23), `Imputed RT` = c(53.712886618, 
51.07663857, 54.33689214, 49.201116355, 51.872420893, 55.03802194, 
54.37194863, 55.059055834, 54.007361134, 44.156487444)), row.names = c(NA, 
-10L), class = c("tbl_df", "tbl", "data.frame"))
> dput(dat4)
structure(list(standard = c("LVTDLTK", "ITGAQVGTGCGTLNDGK", "HVLVTLGEK", 
"HLEDVFSK", "QYFYETK", "NANTFISPQQR", "ADLSGITGAR"), Q1 = c(399.247, 
828.909, 502.305, 491.758, 493.739, 643.33, 485.763), `Imputed RT` = c(12.05265, 
12.61343, 13.87918, 14.80058, 14.80058, 14.80871, 16.88051)), row.names = c(NA, 
-7L), class = c("tbl_df", "tbl", "data.frame"))

I would like to combine them row by row but still keep 4 independent data frames. That means first row should be taken from each data frame and kept in for example DF1, second row should be taken from each data frame and kept in for example DF2, third row should be taken from each data frame and kept in for example DF3, fourth row should be taken from each data frame and kept in for example DF4, and then fifth rows go to DF1, etc.

The problematic part is that one of initial data frames may have lower number of rows what means when is over, then just keep combining from 3 data frames.

Any ideas ?

score 3 · Answer 1 · answered Aug 29 '23 at 13:23

I think the approach is to just add the repeating sequence 1:4 as a column to each data frame. Then you can bind the data frames, and split on this column. The result will be four data frames, each of which contains every fourth row, in the order specified.

# Tag each row with a repeating 1:4 index, stack the frames, then split on it.
# rep_len() cuts the 1:4 cycle to exactly nrow(df) values. rep(1:4, nrow(df))
# would instead repeat the whole cycle nrow(df) times, and data.frame()
# recycling would then duplicate every row four times (37 rows per group
# instead of 37 rows in total).
# Note: transform() rebuilds each frame, so `Imputed RT` comes out as
# Imputed.RT in the result.
dat_list <- list(dat1, dat2, dat3, dat4) |>
    lapply(\(df) transform(df, grp = rep_len(1:4, nrow(df)))) |>
    do.call(rbind, args = _)

split(dat_list, dat_list$grp)

# This is a summary of the output:

str(split(dat_list, dat_list$grp))
# List of 4
#  $ 1:'data.frame':      11 obs. of  4 variables:
#   ..$ standard  : chr [1:11] "LFGPDLK" "AQTTVTCTEK" "GASQELK" "HLVALSPK" ...
#   ..$ Q1        : num [1:11] 399 574 371 437 462 ...
#   ..$ Imputed.RT: num [1:11] 27.8 14.3 13.8 14.3 23.1 ...
#   ..$ grp       : int [1:11] 1 1 1 1 1 1 1 1 1 1 ...
#  $ 2:'data.frame':      11 obs. of  4 variables:
#   ..$ standard  : chr [1:11] "GLPAPIEK" "TSPVDEK" "HYEGSTVPEK" "DHGETAFAVYDK" ...
#   ..$ Q1        : num [1:11] 417 392 578 681 502 ...
#   ..$ Imputed.RT: num [1:11] 21.4 14 14.4 21.4 13.4 ...
#   ..$ grp       : int [1:11] 2 2 2 2 2 2 2 2 2 2 ...
#  $ 3:'data.frame':      8 obs. of  4 variables:
#   ..$ standard  : chr [1:8] "VTTHPLAK" "TLVVHEK" "DYWSTVK" "DTDLDGFPDEK" ...
#   ..$ Q1        : num [1:8] 438 417 454 630 443 ...
#   ..$ Imputed.RT: num [1:8] 14.3 14.6 24.3 27.4 54.3 ...
#   ..$ grp       : int [1:8] 3 3 3 3 3 3 3 3
#  $ 4:'data.frame':      7 obs. of  4 variables:
#   ..$ standard  : chr [1:7] "AQGYSGLSVK" "TGAVSGHSLK" "SSDANLYR" "AATVGSLAGQPLQER" ...
#   ..$ Q1        : num [1:7] 509 483 468 754 388 ...
#   ..$ Imputed.RT: num [1:7] 19.2 14.1 10.9 26.2 49.2 ...
#   ..$ grp       : int [1:7] 4 4 4 4 4 4 4

score 3 · Accepted Answer · answered Aug 29 '23 at 13:25

Another option, I've put your df's into a list, since it makes this easier

# Collect the input data frames in a list so they can be processed uniformly.
dat <- list(dat1, dat2, dat3, dat4)

# Tag every row with its zero-based position modulo 4 (column `g`), stack all
# frames with rbind(), then split the stacked frame into one data frame per
# value of `g` (groups "0" to "3").
split(
  do.call(
    rbind,
    lapply(
      dat,
      function(x) {
        # seq_len() gives an empty index for a 0-row frame; seq(1, nrow(x))
        # would count down 1, 0 there and make cbind() fail.
        cbind(x, "g" = (seq_len(nrow(x)) - 1) %% 4)
      }
    )
  ),
  ~g
)

resulting in

$`0`
                 standard      Q1 Imputed RT g
1                 LFGPDLK 399.240   27.79277 0
5              AQTTVTCTEK 573.780   14.32139 0
9                 GASQELK 370.700   13.80347 0
11               HLVALSPK 436.780   14.30302 0
15              IVVVTAGVR 462.300   23.06790 0
19         SPAGPTVVSIGGGK 617.850   23.55296 0
21 ELGCGAASGTPSGILYEPPAEK 738.030   53.71289 0
25      LGPHAGDVEGHLSFLEK 605.320   51.87242 0
29        GPGGVWAAEAISDAR 733.870   54.00736 0
31                LVTDLTK 399.247   12.05265 0
35                QYFYETK 493.739   14.80058 0

$`1`
            standard      Q1 Imputed RT g
2           GLPAPIEK 416.760   21.43833 1
6            TSPVDEK 392.200   13.96909 1
10        HYEGSTVPEK 577.780   14.44755 1
12      DHGETAFAVYDK 680.820   21.40968 1
16          FFESHVAR 501.760   13.38930 1
20        VAAGAFQGLR 500.280   24.20723 1
22          IDYGVFAK 460.750   51.07664 1
26        DFAEHLLIPR 407.560   55.03802 1
30           VTESEIK 407.230   44.15649 1
32 ITGAQVGTGCGTLNDGK 828.909   12.61343 1
36       NANTFISPQQR 643.330   14.80871 1

$`2`
       standard      Q1 Imputed RT g
3      VTTHPLAK 437.760   14.31637 2
7       TLVVHEK 417.250   14.64657 2
13      DYWSTVK 453.730   24.34259 2
17  DTDLDGFPDEK 630.280   27.38831 2
23     SGFSFGFK 442.720   54.33689 2
27 VGLSDAFVVVHR 436.920   54.37195 2
33    HVLVTLGEK 502.305   13.87918 2
37   ADLSGITGAR 485.763   16.88051 2

$`3`
          standard      Q1 Imputed RT g
4       AQGYSGLSVK 509.280   19.20899 3
8       TGAVSGHSLK 482.770   14.12184 3
14        SSDANLYR 468.230   10.90761 3
18 AATVGSLAGQPLQER 754.410   26.22642 3
24         LPVAPLR 388.260   49.20112 3
28     NFPSPVDAAFR 615.810   55.05906 3
34        HLEDVFSK 491.758   14.80058 3

Combine data frames by taking one row from each and still keep the same number of data frames

2 Answers2