1

My list (l) contains different Data Frames of different length.
x1, x2 and x3 are values of different product-variables.
y is a factor variable from 1 to 15.

I struggle how to match or join those Data Frames by the correct but unequal distributed factor-variable y.

df1 <- data.frame(x1=c(100,10,20,70,30), y =as.factor(c(1,2,3,11,15)))
df2 <- data.frame(x2=c(11,22,33,44,55,66,77,88,99), y =as.factor(c(1,2,3,4,5,7,8,11,12)))
df3 <- data.frame(x3=c(11,12,13,14,15,16,17,18,19,20), y =as.factor(c(1,2,3,4,5,11,12,13,14,15)))
l   <- list(df1,df2,df3)

Often recommended, but in this case not working is the following line:

do.call(rbind.fill, l)

My expected output is ONE, new Data Frame or table like this:

     x1       x2       x3
1    100      11       11
2    10       22       12
3    20       33       13
4    NA       44       14 
5    NA       55       15
6    NA       NA       NA
7    NA       66       NA
8    NA       77       NA
9    NA       NA       NA
10   NA       NA       NA
11   70       88       16
12   NA       99       17
13   NA       NA       18
14   NA       NA       19
15   30       NA       20
Mario
  • 17
  • 3

2 Answers2

1
library(dplyr)
qqq <- full_join(df1, df2, by='y') 
    %>% full_join(df3, by = 'y') 
    %>% mutate(y = as.numeric(as.character(y)))
y_seq <- data.frame(y = 1:15)
qqq <- full_join(qqq, y_seq, by='y') %>% arrange(y)
rownames(qqq) <- qqq$y
qqq <- select(qqq, -y)
David Kaufman
  • 989
  • 1
  • 7
  • 20
0

There might be a better way to do this, but basically you can use reduce to merge all 3 dataframes together. Then, we can create a second dataframe that has the values of y that are missing from the merged dataframe and 1:15 and rep the x1, x2, x3 for the number of values that are missing. Then just rbind and sort.

> df = Reduce(function(...) merge(..., all = T), l)
> df
    y  x1 x2 x3
1   1 100 11 11
2   2  10 22 12
3   3  20 33 13
4  11  70 88 16
5  15  30 NA 20
6   4  NA 44 14
7   5  NA 55 15
8   7  NA 66 NA
9   8  NA 77 NA
10 12  NA 99 17
11 13  NA NA 18
12 14  NA NA 19


df$y = as.numeric(levels(df$y))

df2 = data.frame(c(which(!1:15 %in% df$y)), c(rep(NA, length(which(!1:15 %in% df$y)))), c(rep(NA, length(which(!1:15 %in% df$y)))),c(rep(NA, length(which(!1:15 %in% df$y)))))
colnames(df2) = colnames(df)

df = rbind.data.frame(df, df2)
df = df[order(df$y),]


> df
    y  x1 x2 x3
1   1 100 11 11
2   2  10 22 12
3   3  20 33 13
6   4  NA 44 14
7   5  NA 55 15
13  6  NA NA NA
8   7  NA 66 NA
9   8  NA 77 NA
14  9  NA NA NA
15 10  NA NA NA
4  11  70 88 16
10 12  NA 99 17
11 13  NA NA 18
12 14  NA NA 19
5  15  30 NA 20

EDIT

David's way of filling in the extra rows is a lot better so i'm going to steal that.

df1 <- data.frame(x1=c(100,10,20,70,30), y =as.factor(c(1,2,3,11,15)))
df2 <- data.frame(x2=c(11,22,33,44,55,66,77,88,99), y =as.factor(c(1,2,3,4,5,7,8,11,12)))
df3 <- data.frame(x3=c(11,12,13,14,15,16,17,18,19,20), y =as.factor(c(1,2,3,4,5,11,12,13,14,15)))
l   <- list(df1,df2,df3)


df = Reduce(function(...) merge(..., all = T), l)
df$y = as.numeric(levels(df$y))

df2 = data.frame(y = 1:15)
df = merge(df, df2, on = "y", all = TRUE)
Kristofersen
  • 2,736
  • 1
  • 15
  • 31