Simple but not easy merge task

Question

I have two incomplete dataframes (df_a, df_b): Columns are missing or NA values. "by" is the merge index and df_a has "priority" over df_b.

df_a = structure(list(Datum = structure(c(1635163200, 1635166800, 1635170400, 1635174000), class = c("POSIXct", "POSIXt")), Vorhersage = c(10.297922, 10.155121, 10.044135, 9.699513), Export = c("10.912", "10.47", NA, NA), color = c("rgb(0,128,0)", "rgb(0,128,0)", NA, NA), Status = c("ok", "ok", NA, NA), Plausibilität = c("4", "4", NA, NA), min = c(7.93000000000001, 9.4, 8.7, 8.3), max = c(12.31715325, 12.42822725, 12.51326325, 12.28620625)), row.names = c(NA, -4L), class = "data.frame")
df_b = structure(list(Datum = structure(c(1632510000, 1632513600, 1632517200, 1632520800), class = c("POSIXct", "POSIXt")), Vorhersage = c(14.821988, 14.832919, 14.706179, 14.573527), Referenz = c(16.6, 16.2, 15.9, 16), DWD_Name = c("Elpersbüttel", "Elpersbüttel", "Elpersbüttel", "Elpersbüttel"), Export = c(17.198, 16.713, 16.378, 16.358), color = c("rgb(0,128,0)", "rgb(0,128,0)", "rgb(0,128,0)", "rgb(0,128,0)"), Status = c("ok", "ok", "ok", "ok"), Plausibilität = c(4, 4, 4, 4), min = c(13.05, 12.808, 11.631891, 12.312), max = c(17, 17, 16.9, 16.7)), row.names = c(NA, -4L), class = "data.frame")

desired output is:

                Datum Vorhersage Export        color Status Plausibilität  min max Referenz
1 2021-10-25 14:00:00       10.3 10.912 rgb(0,128,0)     ok             4  7.9  12       NA
2 2021-10-25 15:00:00       10.2  10.47 rgb(0,128,0)     ok             4  9.4  12       NA
3 2021-10-25 16:00:00       10.0   <NA>         <NA>   <NA>          <NA>  8.7  13       NA
4 2021-10-25 17:00:00        9.7   <NA>         <NA>   <NA>          <NA>  8.3  12       NA
5 2021-09-24 21:00:00       14.8 17.198 rgb(0,128,0)     ok             4 13.1  17       17
6 2021-09-24 22:00:00       14.8 16.713 rgb(0,128,0)     ok             4 12.8  17       16
7 2021-09-24 23:00:00       14.7 16.378 rgb(0,128,0)     ok             4 11.6  17       16
8 2021-09-25 00:00:00       14.6 16.358 rgb(0,128,0)     ok             4 12.3  17       16
      DWD_Name
1         <NA>
2         <NA>
3         <NA>
4         <NA>
5 Elpersbüttel
6 Elpersbüttel
7 Elpersbüttel
8 Elpersbüttel

# for rebuild:
structure(list(Datum = structure(c(1635163200, 1635166800, 1635170400, 
1635174000, 1632510000, 1632513600, 1632517200, 1632520800), class = c("POSIXct", 
"POSIXt")), Vorhersage = c(10.297922, 10.155121, 10.044135, 9.699513, 
14.821988, 14.832919, 14.706179, 14.573527), Export = c("10.912", 
"10.47", NA, NA, "17.198", "16.713", "16.378", "16.358"), color = c("rgb(0,128,0)", 
"rgb(0,128,0)", NA, NA, "rgb(0,128,0)", "rgb(0,128,0)", "rgb(0,128,0)", 
"rgb(0,128,0)"), Status = c("ok", "ok", NA, NA, "ok", "ok", "ok", 
"ok"), Plausibilität = c("4", "4", NA, NA, "4", "4", "4", "4"
), min = c(7.93000000000001, 9.4, 8.7, 8.3, 13.05, 12.808, 11.631891, 
12.312), max = c(12.31715325, 12.42822725, 12.51326325, 12.28620625, 
17, 17, 16.9, 16.7), Referenz = c(NA, NA, NA, NA, 16.6, 16.2, 
15.9, 16), DWD_Name = c(NA, NA, NA, NA, "Elpersbüttel", "Elpersbüttel", 
"Elpersbüttel", "Elpersbüttel")), row.names = c(NA, -8L), class = "data.frame")

Thanks to the help of @r2evans I tried the following:


by = "Datum"

library(data.table)
colnms <- setdiff(intersect(names(df_a), names(df_b)), by)

setDT(df_a)
setDT(df_b)

merge(df_a, df_b, by = by, all = TRUE
)[, (colnms) := lapply(colnms, function(nm) fcoalesce(.SD[[paste0(nm, ".x")]], .SD[[paste0(nm, ".y")]]))
  ][, c(outer(colnms, c(".x", ".y"), paste0)) := NULL ][]

but I get the following error:

 Error in fcoalesce(.SD[[paste0(nm, ".x")]], .SD[[paste0(nm, ".y")]]) : 
  Item 2 is type double but the first item is type character. Please coerce

@KarthikS, your answer is correct, I think you should undelete it. — r2evans, Oct 19 '21 at 15:23
What does it mean that `df_a` has priority? Why is do you take the second number in c(2,7,NA,2) from `df_b` when it is available in `df_a`? — s_baldur, Oct 19 '21 at 16:45
@r2evans: ran into trouble again.. my function works, but your solution is faster.. I updated the example maybe you can help. — Nimrookie, Oct 26 '21 at 12:14
I think the error is clear: you are potentially corrupting your data by trying to merge not-the-same class vectors. For instance, `Export` and `Plausibilität` are `character` in `df_a` and `numeric` in `df_b`. My guess is that `df_a`'s of those two columns are supposed to be `numeric`. If you fix that with `df_a[,(c("Export","Plausibilität")) := lapply(.SD, as.numeric), .SDcols=c("Export","Plausibilität")]`, the merge command works. Pay attention to your classes, it makes a difference. (I usually do not trust merging functions to know how to coerce different classes.) — r2evans, Oct 26 '21 at 12:26

r2evans · Answer 1 · 2021-10-23T22:57:33.553

Most of the other answers are good, but many either over-complicate the result (in my opinion) or they perform a left or right join, not the full join as expected in the OP.

Here's a quick solution that uses dynamic column names.

library(data.table)
colnms <- setdiff(intersect(names(df_a), names(df_b)), "by")
colnms
# [1] "a"

setDT(df_a)
setDT(df_b)

merge(df_a, df_b, by = "by", all = TRUE
  )[, (colnms) := lapply(colnms, function(nm) fcoalesce(.SD[[paste0(nm, ".x")]], .SD[[paste0(nm, ".y")]]))
  ][, c(outer(colnms, c(".x", ".y"), paste0)) := NULL ][]
#       by     b     c     a
#    <num> <num> <num> <num>
# 1:     1     1    NA     1
# 2:     2    NA     2     2
# 3:     3     3     3     3
# 4:     4    NA     4     4

Notes:

the normal data.table::[ merge is a left-join only, so we need to use data.table::merge in order to be able to get a full-join with all=TRUE;
because it's using merge, the repeated columns get the .x and .y suffixes, something we can easily capitalize on;
the canonical and most-performant way when using (colnms) := ... is to also include .SDcols=colnms, but that won't work as well here since we need the suffixed columns, not the colnms columns themselves; this is a slight performance penalty but certainly not an anti-pattern (I believe) given what we need to do; and since we could have more than one duplicate column, we have to be careful to do it with each pair at a time, not all of them at once;
the last [-block (using outer) is for removing the duplicate columns; without it, the output would have column names c("by", "a.x", "b", "a.y", "c", "a"). It uses outer because that's a straight-forward way to get 1-or-more colnms and combine .x and .y to each of them; it then uses data.table's := NULL shortcut for removing one-or-more columns.

Thank you very much @r2evans for this very informative, enlightning and pedagogical response. In only three lines of code, I learned a lot! I really appreciate the sharing, the pedagogical effort... and, incidentally, I measure my margin of progress which is tremendous !!! ;-) — lovalery, Oct 24 '21 at 17:44
r2evans: Got the following error: Error in `[.data.table`(merge(df_a, df_b, by = by, all = TRUE)[, `:=`((colnms), : (converted from warning) Column 'Datum.x' does not exist to remove — Nimrookie, Oct 25 '21 at 14:26
Okay, I understand. It appears that the issue is that your sample data is not representative (enough). Please update your data to something that can reproduce this error you have. — r2evans, Oct 25 '21 at 14:29

lovalery · Answer 2 · 2021-10-19T17:40:26.840

0

Not the answer with R base. But one possible solution with the package data.table

library(data.table)

setDT(df_a)
setDT(df_b)

df_a <- rbind(df_a, list(4, NA, NA))
df_b <- rbind(list(1, NA, NA), df_b)


df_a[df_b, `:=` (a = fifelse(is.na(a), i.a, a), c = c), on = .(by)][]
#>    by a  b  c
#> 1:  1 1  1 NA
#> 2:  2 2 NA  2
#> 3:  3 3  3  3
#> 4:  4 4 NA  4

Edit with the help of @r2evans, A much more elegant and efficient solution:

df_a[df_b, `:=` (a = fcoalesce(a, i.a), c = c), on = .(by)][]
#>    by a  b  c
#> 1:  1 1  1 NA
#> 2:  2 2 NA  2
#> 3:  3 3  3  3
#> 4:  4 4 NA  4

^{Created on 2021-10-19 by the reprex package (v2.0.1)}

edited Oct 19 '21 at 17:40

answered Oct 19 '21 at 15:43

lovalery

4,524
3
14
28

1

`data.table::fcoalesce(a, i.a)` is likely faster than `fifelse(...)` and made for this situation. – r2evans Oct 19 '21 at 17:12
1

Thank you so much @r2evans. To be honest I didn't know this function... magic! So, I will edit the answer. – lovalery Oct 19 '21 at 17:20
@lovalery, r2evans: Fast solution! Thanks! Is there a way to let it work with dynamic col names? – Nimrookie Oct 20 '21 at 21:42
Hi Nimrookie, I'm not sure if this is what you are looking for. But I'm giving it a try! I'm not very experienced in this, but I think you have to abandon the functional form of the code to benefit from dynamic column names. Therefore, in the example you provided, I would write the code as follows: `colnames_vector <- c("a", "foo")` and then `df_a[df_b, (colnames_vector) := .(fcoalesce(a, i.a), c), on = .(by)][]` That said, I'll leave it up to @r2evans to complete and/or correct this attempted answer if needed. – lovalery Oct 23 '21 at 13:52

score 0 · Answer 3 · 2021-10-19T16:26:59.220

This isn't the most elegant, but you can make a function that applies your rule to coalesce the values if they occur in both data frames.

# find the unique column names (not called "by")
cols <- union(names(df_a),names(df_b))
cols <- cols[!(cols == "by")]

# merge the data sets
df_merge <- merge(df_a, df_b, by = "by", all = TRUE)

# function to check for the base column names that now have a '.x' and
# a '.y' version. for the columns, fill in the NAs from '.x' with the
# value from '.y'
col_val <- function(col_base, df) {
  
  x <- names(df)
  if (all(paste0(col_base, c(".x", ".y")) %in% x)) {
    na.x <- is.na(df[[paste0(col_base, ".x")]])
    df[[paste0(col_base, ".x")]][na.x] <- df[[paste0(col_base, ".y")]][na.x]
    df[[paste0(col_base, ".x")]]
  } else {
    df[[col_base]]
  }
  
}

# apply this function to every column
cbind(df_merge["by"], sapply(cols, col_val, df = df_merge))

This will give the following result.

  by a  b  c
1  1 1  1 NA
2  2 2 NA  2
3  3 3  3  3
4  4 4 NA  4

I know you specified base, by the natural_join() function is worth mentioning.

library(rqdatatable)

natural_join(df_a, df_b, by = "by", jointype = "FULL")

This gives exactly what you want.

  by a  b  c
1  1 1  1 NA
2  2 2 NA  2
3  3 3  3  3
4  4 4 NA  4

score 0 · Answer 4 · answered Oct 21 '21 at 18:00

here a dynamic solution.. not bad, but maybe someone knows how to speed it up.

get_complete_df<-function(df_a,df_b, by = "by"){
  df_a  = unique(df_a)
  df_b  = unique(df_b)
  nam_a = names(df_a)[!(names(df_a) == by)] 
  nam_b = names(df_b)[!(names(df_b) == by)]
  nums_a = unlist(lapply(df_a, is.numeric)) 
  nums_b = unlist(lapply(df_b, is.numeric))
  nums   = unique(names(df_a)[nums_a],names(df_b)[nums_b])
  ## try to supplement NAs
  x = df_b[[by]][df_b[[by]] %in% df_a[[by]]]
  y = nam_b[nam_b %in% nam_a]
  vna = is.na(df_a[df_a[,1] %in% x,y]) 
  df_a[df_a[,1] %in% x ,y][vna] = df_b[df_b[,1] %in% x,y][vna]
  ## get complete df
  all_names  = c(nam_a,nam_b )
  all_names  = c(by, unique(all_names))
  all_by     = na.omit(unique(c(df_a[[by]],df_b[[by]]) ))
  ## build
  df_o = as.data.frame(matrix(nrow = length(all_by),ncol = length(all_names)))
  names(df_o) = all_names
  df_o[[by]]  = all_by
  ## fill in content 
  df_o[df_o[,1] %in% df_b[,1],names(df_b)] = df_b
  df_o[df_o[,1] %in% df_a[,1],names(df_a)] = df_a ## df_a has priority!
  # fix numeric:
  # why did some(!) num fields changed to chr ?
  df_o[,nums]   = as.data.frame(apply(df_o[,nums], 2, as.numeric))
  df_o
}

Simple but not easy merge task

4 Answers4