Combine matrices in list by identifier column

Question

The problem attempts to join matrices in a set, by a column identifier. We can express the problem in the following form:

Setup

mat1 <- data.frame(matrix(nrow=4, ncol =3, rnorm(12,0,1)))
mat2 <- data.frame(matrix(nrow =5, ncol=3, rnorm(15,0,1)))
mat3 <- data.frame(matrix(nrow=3, ncol =3, rnorm(9,0,1)))
mat4 <- data.frame(matrix(nrow =6, ncol =3, rnorm(18,0,1)))
colnames(mat1) = colnames(mat2) = colnames(mat3) = colnames(mat4) <- c("Code", "x1", "x2")

mat1$Code <- c(1,2,3,4)
mat2$Code <- c(2,3,4,5,6)
mat3$Code <- c(6,7,8)
mat4$Code <- c(1,2,3,4,5,6)

mat_set <- c(mat1, mat2, mat3, mat4)

> mat1
  Code         x1         x2
1    1  0.6425172 -1.9404704
2    2 -0.1278021  0.8485476
3    3 -0.5525808 -0.9060624
4    4 -1.3013592  0.7350129
> mat2
  Code          x1         x2
1    2 -0.06543585 -1.1244444
2    3  0.03773743 -0.8124004
3    4  3.53421807 -0.4935844
4    5  0.56686927  0.3433276
5    6  0.41849489  0.8782866
> mat3
  Code        x1         x2
1    6 1.0821070 0.08006585
2    7 0.1038577 0.61057716
3    8 2.7002036 0.19693561
> mat1
  Code         x1         x2
1    1  0.6425172 -1.9404704
2    2 -0.1278021  0.8485476
3    3 -0.5525808 -0.9060624
4    4 -1.3013592  0.7350129
> mat2
  Code          x1         x2
1    2 -0.06543585 -1.1244444
2    3  0.03773743 -0.8124004
3    4  3.53421807 -0.4935844
4    5  0.56686927  0.3433276
5    6  0.41849489  0.8782866
> mat3
  Code        x1         x2
1    6 1.0821070 0.08006585
2    7 0.1038577 0.61057716
3    8 2.7002036 0.19693561
> mat4
  Code         x1         x2
1    1 -0.1188262  0.6338566
2    2  0.6128098  1.3759910
3    3 -1.3504901 -0.2830859
4    4 -1.2153638 -1.1611660
5    5 -1.7420065  0.2470048
6    6 -0.9786468 -1.2214594

I then want to bind by column all matrices in the set by "Code". Preserve the ordering. This will yield output of the form:

output <- data.frame(matrix(nrow = 8, ncol =9))
output[,1] <- c(1,2,3,4,5,6,7,8)
output[,2] <- c(mat1$x1, NA, NA,NA,NA)
output[,3] <- c(mat1$x2, NA,NA,NA,NA)
output[,4] <- c(NA, mat2$x1, NA, NA)
output[,5] <- c(NA, mat2$x2, NA, NA)
output[,6] <- c(NA,NA,NA,NA,NA,mat3$x1)
output[,7] <- c(NA,NA,NA,NA,NA,mat3$x2)
output[,8] <- c(mat4$x1, NA,NA)
output[,9] <- c(mat4$x2, NA,NA)
output

 X1         X2         X3          X4         X5        X6         X7         X8         X9
1  1  0.6425172 -1.9404704          NA         NA        NA         NA -0.1188262  0.6338566
2  2 -0.1278021  0.8485476 -0.06543585 -1.1244444        NA         NA  0.6128098  1.3759910
3  3 -0.5525808 -0.9060624  0.03773743 -0.8124004        NA         NA -1.3504901 -0.2830859
4  4 -1.3013592  0.7350129  3.53421807 -0.4935844        NA         NA -1.2153638 -1.1611660
5  5         NA         NA  0.56686927  0.3433276        NA         NA -1.7420065  0.2470048
6  6         NA         NA  0.41849489  0.8782866 1.0821070 0.08006585 -0.9786468 -1.2214594
7  7         NA         NA          NA         NA 0.1038577 0.61057716         NA         NA
8  8         NA         NA          NA         NA 2.7002036 0.19693561         NA         NA
>

A final point is that the code must be replicable over a large set of matrices. Thanks!

See https://stackoverflow.com/questions/8091303/simultaneously-merge-multiple-data-frames-in-a-list. Use `list` instead of `c` to create a list. — A. Suliman, Jul 29 '19 at 10:50

Ronak Shah · Accepted Answer · 2019-07-29T11:48:33.153

You can use merge in Reduce

out <- Reduce(function(x, y) merge(x, y, by = "Code", all = TRUE), mat_set)
colnames(out) <- paste0("x", seq_along(out))

out
#  x1         x2         x3         x4         x5         x6         x7         x8         x9
#1  1  0.4291247 -0.5644520         NA         NA         NA         NA -0.8553646 -0.5238281
#2  2  0.5060559 -0.8900378 -0.9111954 -0.4405479         NA         NA -0.2806230 -0.4968500
#3  3 -0.5747400 -0.4771927 -0.8371717  0.4595894         NA         NA -0.9943401 -1.8060313
#4  4 -0.5466319 -0.9983864  2.4158352 -0.6937202         NA         NA -0.9685143 -0.5820759
#5  5         NA         NA  0.1340882 -1.4482049         NA         NA -1.1073182 -1.1088896
#6  6         NA         NA -0.4906859  0.5747557  1.1022975 -0.5012581 -1.2519859 -1.0149620
#7  7         NA         NA         NA         NA -0.4755931 -1.6290935         NA         NA
#8  8         NA         NA         NA         NA -0.7094400 -1.1676193         NA         NA

data

set.seed(1234)
mat1 <- data.frame(matrix(nrow=4, ncol =3, rnorm(12,0,1)))
mat2 <- data.frame(matrix(nrow =5, ncol=3, rnorm(15,0,1)))
mat3 <- data.frame(matrix(nrow=3, ncol =3, rnorm(9,0,1)))
mat4 <- data.frame(matrix(nrow =6, ncol =3, rnorm(18,0,1)))
colnames(mat1) = colnames(mat2) = colnames(mat3) = colnames(mat4) <- c("Code", "x1", "x2")

mat1$Code <- c(1,2,3,4)
mat2$Code <- c(2,3,4,5,6)
mat3$Code <- c(6,7,8)
mat4$Code <- c(1,2,3,4,5,6)
mat_set <- list(mat1, mat2, mat3, mat4)

As noted by @G. Grothendieck, the above code gives a warning because the column names are similar in the list of dataframes. Since we only want to join by "Code", while creating the dataframe we can give unique names to columns so to avoid the warning and renaming later.

Updated data

set.seed(1234)
mat1 <- setNames(data.frame(matrix(nrow=4, ncol =3, rnorm(12,0,1))), c("Code", "x1", "x2"))
mat2 <- setNames(data.frame(matrix(nrow =5, ncol=3, rnorm(15,0,1))), c("Code", "x3", "x4"))
mat3 <- setNames(data.frame(matrix(nrow=3, ncol =3, rnorm(9,0,1))), c("Code", "x5", "x6"))
mat4 <- setNames(data.frame(matrix(nrow =6, ncol =3, rnorm(18,0,1))), c("Code", "x7", "x8"))

mat1$Code <- c(1,2,3,4)
mat2$Code <- c(2,3,4,5,6)
mat3$Code <- c(6,7,8)
mat4$Code <- c(1,2,3,4,5,6)
mat_set <- list(mat1, mat2, mat3, mat4)

and then we can use Reduce

Reduce(function(x, y) merge(x, y, by = "Code", all = TRUE), mat_set)

Suggest renaming the columns before merging to avoid the warning that the above code gives. — G. Grothendieck, Jul 29 '19 at 11:27

Combine matrices in list by identifier column

1 Answers1