3

I want to update join two tables within a function. Here is an example without using a function:

library(data.table)
Xtest <- data.table(a = rnorm(20), b = rnorm(20), c = 1:20)
Ytest <- data.table(c = 1:10, d = rnorm(10))

Xtest[Ytest, on = .(c), newcol := i.d]

# > Xtest[Ytest, on = .(c), newcol := i.d]
# > Xtest
# a           b  c      newcol
# 1: -1.68473343 -0.74498296  1  0.35096663
# 2: -0.98461614  2.15317525  2 -1.33890396
# 3: -1.65427602  1.21183896  3  1.49641480
# 4: -0.65045253 -0.74609860  4 -0.03227097
# 5:  1.49058508  1.20315276  5  1.41580186
# 6: -0.31631871  0.68716871  6 -0.03671959
# 7:  1.35923085 -0.20082238  7 -2.27959124
# 8: -0.75649545  0.24058212  8  0.93770862
# 9:  0.22452260 -0.28212892  9 -0.02500419
# 10:  0.30209786  1.33697797 10  0.67729741
# 11:  0.88748221 -0.54421418 11          NA
# 12:  0.47207422 -0.28159382 12          NA
# 13: -1.17270475  0.83940750 13          NA
# 14: -2.02787820 -0.03672582 14          NA
# 15: -0.22187761  0.59137210 15          NA
# 16:  0.97750330 -0.27030756 16          NA
# 17:  0.22725940  0.54617488 17          NA
# 18:  0.94065525 -0.23482152 18          NA
# 19:  2.12049977  0.69920776 19          NA
# 20:  0.06192823  0.12262739 20          NA


# Xtest[, newcol := NULL]

I tried to recast the above code into a function, but the Ycol argument seems to be hidden:

myjoinfunction <- function(X, Y, joinlist, newcol, Ycol) {
  eval(substitute(X[Y, on = joinlist, newcol:=i.Ycol])) 
}

# > myjoinfunction <- function(X, Y, joinlist, newcol, Ycol) {
#   +   eval(substitute(X[Y, on = joinlist, newcol:=i.Ycol])) 
#   + }
# > myjoinfunction(Xtest, Ytest, list(c), D, d)
# Error in eval(jsub, SDenv, parent.frame()) : object 'i.Ycol' not found
# 8.
# eval(jsub, SDenv, parent.frame())
# 7.
# eval(jsub, SDenv, parent.frame())
# 6.
# `[.data.table`(Xtest, Ytest, on = list(c), `:=`(D, i.Ycol))
# 5.
# Xtest[Ytest, on = list(c), `:=`(D, i.Ycol)]
# 4.
# eval(substitute(X[Y, on = joinlist, `:=`(newcol, i.Ycol)]))
# 3.
# eval(substitute(X[Y, on = joinlist, `:=`(newcol, i.Ycol)]))
# 2.
# eval(substitute(X[Y, on = joinlist, `:=`(newcol, i.Ycol)]))
# 1.
# myjoinfunction(Xtest, Ytest, list(c), D, d)

How can I make the Ycol argument visible within the function?

UPDATED EXAMPLE

To be more general, here is a new example of an update join I would like to write a function for:

Xtest <- data.table(
a = rnorm(20), 
b = rnorm(20), 
c = rep(letters[1:4], rep(5, 4)), 
d = rep(1:5, 4)
)
Ytest <- data.table(
c = rep(letters[1:2], rep(5, 2)), 
d = rep(1:5, 2), 
e = rnorm(10), 
f = rnorm(10)
)

# > Xtest[Ytest, on = .(c, d), `:=`(newcol1 = i.e, newcol2 = i.f) ]
# > Xtest
# a            b c d    newcol1    newcol2
# 1: -2.4939743 -0.200370619 a 1 -1.4934893 -1.0288955
# 2:  1.0188321 -1.182286508 a 2  1.3811712  0.9747131
# 3:  0.5217161 -0.152117649 a 3 -0.4168069  0.1218213
# 4: -0.1584167  0.583640353 a 4  0.4644738  1.7888567
# 5: -0.4271398  0.020067301 a 5  2.5279998  2.0919953
# 6: -1.7692909  0.250129040 b 1 -1.5964246 -1.0884861
# 7: -0.8899915  0.971742055 b 2  0.3011304  1.2629524
# 8: -0.4490363 -1.540005621 b 3 -0.7992208 -0.5155775
# 9: -0.5706488 -1.037077614 b 4  1.0058213  1.9787692
# 10: -0.0922679  1.444487848 b 5 -0.2893311 -0.6095043
# 11:  0.9924810 -1.144513228 c 1         NA         NA
# 12:  1.2232591  1.503649791 c 2         NA         NA
# 13:  0.8751961  0.892765910 c 3         NA         NA
# 14:  0.9960554  0.499310073 c 4         NA         NA
# 15: -0.6184695  1.867985589 c 5         NA         NA
# 16:  0.6503936  0.422683211 d 1         NA         NA
# 17: -0.6160834 -1.585713893 d 2         NA         NA
# 18:  1.5949931 -0.544704857 d 3         NA         NA
# 19:  0.7232079 -0.006460518 d 4         NA         NA
# 20: -0.2824961  0.119585859 d 5         NA         NA

Using a suggestion from David Arenburg in the comments trips up when joinlist is a list of strings

myjoinfunction1 <- function(X, Y, joinlist, newcol, Ycol) X[Y, on = joinlist, newcol:= get(paste0("i.", Ycol))]

# > myjoinfunction1(Xtest, Ytest, list("c", "d"), "newcol", "e")
# Error in .parse_on(substitute(on), isnull_inames) : 
#   'on' argument should be a named atomic vector of column names indicating which columns in 'i' should be joined with which columns in 'x'.
# 5.
# stop("'on' argument should be a named atomic vector of column names indicating which columns in 'i' should be joined with which columns in 'x'.")
# 4.
# .parse_on(substitute(on), isnull_inames)
# 3.
# `[.data.table`(X, Y, on = joinlist, `:=`(newcol, get(paste0("i.", 
#                                                             Ycol))))
# 2.
# X[Y, on = joinlist, `:=`(newcol, get(paste0("i.", Ycol)))]
# 1.
# myjoinfunction1(Xtest, Ytest, list("c", "d"), "newcol", "e")

rainyday
  • 41
  • 5
  • 1
    Try `myjoinfunction <- function(X, Y, joinlist, newcol, Ycol) X[Y, on = joinlist, newcol:= get(paste0("i.", Ycol))]`, then `myjoinfunction(Xtest, Ytest, "c", "newcol", "d") ; Xtest` – David Arenburg Jun 22 '23 at 08:49
  • That does arrive at the desired output, however, is there any way to make the Ycol argument visible within `eval(substitute(xxx))` ? By using `eval` and `substitute`, I don't need the other arguments as strings but they can be typed directly. It might not matter in the end but I'm just curious if it can be done. – rainyday Jun 22 '23 at 09:02
  • 1
    Try `myjoinfunction <- function(X, Y, joinlist, newcol, Ycol) {Ycol <- paste0("i.", as.character(substitute(Ycol)));X[Y, on = eval(substitute(joinlist)), as.character(substitute(newcol)) := get(Ycol)] };myjoinfunction(Xtest, Ytest, list(c, d), newcol, e);Xtest` – David Arenburg Jun 22 '23 at 10:43
  • `joinlist` should be a vector of strings like `c(...)` rather than a list. – Frank Jun 23 '23 at 05:12
  • 1
    @Frank Why? It can be a list too. You can parse it using `eval(substitute(joinlist))` – David Arenburg Jun 23 '23 at 06:10
  • @DavidArenburg I'm responding to the error at the end of the OP where you can see they called the function with `list("c", "d")` which is not supported. `on=` is flexible but that pattern doesn't work (or used to not work?), which I guess is where the error comes from. I suggested strings instead of `list(c, d)` since I'm guessing the OP wants to pass the colnames programmatically. `eval(substitute(list("c", "d")))` is just the same list again, so I guess that's not a workaround – Frank Jun 23 '23 at 06:55
  • In fact, in this case, I'm intending the function for interactive use (which I didn't mention in the original question), so it's more convenient not to have strings. However, it's also correct that this is not useful in a programmatic environment. – rainyday Jun 23 '23 at 07:04

2 Answers2

2

For the more general second example:

f <- function(X, Y, joinlist, cols) {
    X[Y, on = joinlist, names(cols) := mget(sprintf("i.%s", cols))]
}

Usage:

set.seed(1)
Xtest2 <- data.table(
a = rnorm(20), 
b = rnorm(20), 
c = rep(letters[1:4], rep(5, 4)), 
d = rep(1:5, 4)
)
Ytest2 <- data.table(
c = rep(letters[1:2], rep(5, 2)), 
d = rep(1:5, 2), 
e = rnorm(10), 
f = rnorm(10)
)

f(Xtest2, Ytest2, c("c", "d"), c(newcol1 = "e", newcol2 = "f"))

Side note: There are some edge cases to watch out for when adding columns in a data.table (running out of "column slots"; adding to a table loaded from disk)

Frank
  • 66,179
  • 8
  • 96
  • 180
  • 1
    This is a potentially a nice solution for a programmatic environment, but it isn't exactly what I want right now, which unfortunately I didn't completely specify in my question (I am looking for something to be used interactively and it's slightly easier not to type strings). But I learnt something new in any case, nice to have options! – rainyday Jun 23 '23 at 07:08
1

Here is my answer based on the comments. The eval and substitute can be applied at different locations within the data.table, instead of wrapping the entire data.table in eval(substitute(DT)). Also, when using substitute with a string instead of an expression, eval is not needed.

myjoinfunction <- function(X, Y, joinlist, newcol, Ycol) {
  Ycol <- paste0("i.", as.character(substitute(Ycol))[-1])
  X[
     Y, 
     on = eval(substitute(joinlist)), 
     as.character(substitute(newcol))[-1] := mget(Ycol)
   d] 
  }

Usage:

set.seed(42)
Xtest <- data.table(
a = rnorm(20), 
b = rnorm(20), 
c = rep(letters[1:4], rep(5, 4)), 
d = rep(1:5, 4), 
id = 1:20
)
Ytest <- data.table(
c = rep(letters[1:2], rep(5, 2)), 
d = rep(1:5, 2), 
e = rnorm(10), 
f = rnorm(10), 
id = 1:10
)

# multiple columns supported both for joining and updating
myjoinfunction(Xtest, Ytest, list(c, d), list(newcol1, newcol2), list(e, f));

# Single columns have to be expressed in a list
myjoinfunction(Xtest, Ytest, list(id), list(newcol1), list(e));

By not having the arguments as strings, this is better for interactive use than programmatic use.

Suggestions for refinements are welcome! Especially if there is a more elegant way to express single columns in the function arguments or handle lists within the function (I don't like having to use [-1] as a workaround within the function when handling lists, I think it will lead to errors eventually.)

rainyday
  • 41
  • 5
  • 1
    Re lists vs singletons, you can wrap it in a helper function, `extract_symbols <- function(symbol_list){sub_list = substitute(symbol_list); syms = if (length(sub_list) == 1) sub_list else sub_list[-1]; as.character(syms)}` examples: `extract_symbols(list(c,d)); extract_symbols(c)` – Frank Jun 28 '23 at 05:39