Retaining all columns in both data tables during a join, then adding a column

Question

Say I have the following data tables:

# Seed random number generator
set.seed(33550336)

# Create data tables
dt1 <- data.table(ID = sample(LETTERS[1:5], 20, replace = TRUE),
                  loc = sample(1:50, 20, replace = TRUE),
                  a = runif(20),
                  b = runif(20),
                  c = runif(20),
                  d = runif(20))



dt2 <- data.table(ID = sample(LETTERS[1:5], 20, replace = TRUE),
                  loc = sample(1:50, 20, replace = TRUE),
                  e = runif(20),
                  f = runif(20),
                  g = runif(20),
                  h = runif(20))

I'd like to join them like this (as per this answer):

# Join on ID and nearest rolling join on loc
dt2[dt1,
    on = .(ID, loc),
    roll = "nearest"]

#     ID loc         e          f         g          h          a          b          c           d
#  1:  E   2 0.6080648 0.59558616 0.9680243 0.65885155 0.75533475 0.46796072 0.07874670 0.372224933
#  2:  B  22 0.2900181 0.89395076 0.5012072 0.81403388 0.24129711 0.66914193 0.11941211 0.330982361
#  3:  C  23 0.7753557 0.31772779 0.3302613 0.02004258 0.32252276 0.09341920 0.29665070 0.563954195
#  4:  A  46 0.1193827 0.89183103 0.7142606 0.17231293 0.62979589 0.19621242 0.48943734 0.318145133
#  5:  B  26 0.2900181 0.89395076 0.5012072 0.81403388 0.65672029 0.45106318 0.47421905 0.605327569
#  6:  E  17 0.4417452 0.03226111 0.5975499 0.49336668 0.83821385 0.99078941 0.93356571 0.459227328
#  7:  D  24 0.8974042 0.90725532 0.5008502 0.21681072 0.86831894 0.41260922 0.65389531 0.930843432
#  8:  D  24 0.8974042 0.90725532 0.5008502 0.21681072 0.82042112 0.82906524 0.59829109 0.859362233
#  9:  D  44 0.3958956 0.06361996 0.8068514 0.56349064 0.29823590 0.04765864 0.65412304 0.742808806
# 10:  E  11 0.4417452 0.03226111 0.5975499 0.49336668 0.15013055 0.83683385 0.18847332 0.139363770
# 11:  D  11 0.5967619 0.23497655 0.5429504 0.56322079 0.68644344 0.46995509 0.35128292 0.910443478
# 12:  A  50 0.1193827 0.89183103 0.7142606 0.17231293 0.65811523 0.48901176 0.96854302 0.875838825
# 13:  E  17 0.4417452 0.03226111 0.5975499 0.49336668 0.93484739 0.57810132 0.75250483 0.607710552
# 14:  A  21 0.4491745 0.61724476 0.3283133 0.51406071 0.96610736 0.03222779 0.05768814 0.436536989
# 15:  A   6 0.4491745 0.61724476 0.3283133 0.51406071 0.69975907 0.35564120 0.42206040 0.309386788
# 16:  B  49 0.1152318 0.99716746 0.1440101 0.70734795 0.05138897 0.80463532 0.41856763 0.421029334
# 17:  C   9 0.1204828 0.47622000 0.6802176 0.36385191 0.98509395 0.49711655 0.68159049 0.003570911
# 18:  D   7 0.5967619 0.23497655 0.5429504 0.56322079 0.69862668 0.91597522 0.53630369 0.297000037
# 19:  C   8 0.1204828 0.47622000 0.6802176 0.36385191 0.80761410 0.87051653 0.93177628 0.671692311
# 20:  B   5 0.5652708 0.50866629 0.3992037 0.87643314 0.69493460 0.99878010 0.77953456 0.820925302

This is awesome. Only one thing missing: the difference between loc in dt1 and dt2 (i.e., delta = abs(x.loc - i.loc)). Nevertheless, the only loc remaining is from dt1, so I can't do this calculation at this point.

In his response to my previous question, Jaap named each column to be retained individually and performed the calculation at the same time, something like this:

dt2[dt1,
        on = c("ID", "loc"),
        roll = "nearest", 
        .(ID, loc = i.loc, a, b, c, d, e, f, g, h, delta = abs(x.loc - i.loc))][]

#     ID loc          a          b          c           d         e          f         g          h delta
#  1:  E   2 0.75533475 0.46796072 0.07874670 0.372224933 0.6080648 0.59558616 0.9680243 0.65885155     1
#  2:  B  22 0.24129711 0.66914193 0.11941211 0.330982361 0.2900181 0.89395076 0.5012072 0.81403388     5
#  3:  C  23 0.32252276 0.09341920 0.29665070 0.563954195 0.7753557 0.31772779 0.3302613 0.02004258     6
#  4:  A  46 0.62979589 0.19621242 0.48943734 0.318145133 0.1193827 0.89183103 0.7142606 0.17231293     0
#  5:  B  26 0.65672029 0.45106318 0.47421905 0.605327569 0.2900181 0.89395076 0.5012072 0.81403388     1
#  6:  E  17 0.83821385 0.99078941 0.93356571 0.459227328 0.4417452 0.03226111 0.5975499 0.49336668     2
#  7:  D  24 0.86831894 0.41260922 0.65389531 0.930843432 0.8974042 0.90725532 0.5008502 0.21681072    14
#  8:  D  24 0.82042112 0.82906524 0.59829109 0.859362233 0.8974042 0.90725532 0.5008502 0.21681072    14
#  9:  D  44 0.29823590 0.04765864 0.65412304 0.742808806 0.3958956 0.06361996 0.8068514 0.56349064     1
# 10:  E  11 0.15013055 0.83683385 0.18847332 0.139363770 0.4417452 0.03226111 0.5975499 0.49336668     4
# 11:  D  11 0.68644344 0.46995509 0.35128292 0.910443478 0.5967619 0.23497655 0.5429504 0.56322079     8
# 12:  A  50 0.65811523 0.48901176 0.96854302 0.875838825 0.1193827 0.89183103 0.7142606 0.17231293     4
# 13:  E  17 0.93484739 0.57810132 0.75250483 0.607710552 0.4417452 0.03226111 0.5975499 0.49336668     2
# 14:  A  21 0.96610736 0.03222779 0.05768814 0.436536989 0.4491745 0.61724476 0.3283133 0.51406071     4
# 15:  A   6 0.69975907 0.35564120 0.42206040 0.309386788 0.4491745 0.61724476 0.3283133 0.51406071    19
# 16:  B  49 0.05138897 0.80463532 0.41856763 0.421029334 0.1152318 0.99716746 0.1440101 0.70734795     6
# 17:  C   9 0.98509395 0.49711655 0.68159049 0.003570911 0.1204828 0.47622000 0.6802176 0.36385191     3
# 18:  D   7 0.69862668 0.91597522 0.53630369 0.297000037 0.5967619 0.23497655 0.5429504 0.56322079     4
# 19:  C   8 0.80761410 0.87051653 0.93177628 0.671692311 0.1204828 0.47622000 0.6802176 0.36385191     2
# 20:  B   5 0.69493460 0.99878010 0.77953456 0.820925302 0.5652708 0.50866629 0.3992037 0.87643314     1

This is perfect, except for having to name each and every column. So, as a workaround, I retain all columns from both data tables (using mget), then calculate delta through chaining:

# Columns to select
cols2sel <- c(paste0("x.", names(dt2)), paste0("i.", names(dt1)))

dt2[dt1,
    on = c("ID", "loc"),
    roll = "nearest", 
    mget(cols2sel)][, delta := abs(x.loc - i.loc)][]

#     x.ID x.loc       x.e        x.f       x.g        x.h i.ID i.loc        i.a        i.b        i.c         i.d delta
#  1:    E     1 0.6080648 0.59558616 0.9680243 0.65885155    E     2 0.75533475 0.46796072 0.07874670 0.372224933     1
#  2:    B    27 0.2900181 0.89395076 0.5012072 0.81403388    B    22 0.24129711 0.66914193 0.11941211 0.330982361     5
#  3:    C    29 0.7753557 0.31772779 0.3302613 0.02004258    C    23 0.32252276 0.09341920 0.29665070 0.563954195     6
#  4:    A    46 0.1193827 0.89183103 0.7142606 0.17231293    A    46 0.62979589 0.19621242 0.48943734 0.318145133     0
#  5:    B    27 0.2900181 0.89395076 0.5012072 0.81403388    B    26 0.65672029 0.45106318 0.47421905 0.605327569     1
#  6:    E    15 0.4417452 0.03226111 0.5975499 0.49336668    E    17 0.83821385 0.99078941 0.93356571 0.459227328     2
#  7:    D    38 0.8974042 0.90725532 0.5008502 0.21681072    D    24 0.86831894 0.41260922 0.65389531 0.930843432    14
#  8:    D    38 0.8974042 0.90725532 0.5008502 0.21681072    D    24 0.82042112 0.82906524 0.59829109 0.859362233    14
#  9:    D    45 0.3958956 0.06361996 0.8068514 0.56349064    D    44 0.29823590 0.04765864 0.65412304 0.742808806     1
# 10:    E    15 0.4417452 0.03226111 0.5975499 0.49336668    E    11 0.15013055 0.83683385 0.18847332 0.139363770     4
# 11:    D     3 0.5967619 0.23497655 0.5429504 0.56322079    D    11 0.68644344 0.46995509 0.35128292 0.910443478     8
# 12:    A    46 0.1193827 0.89183103 0.7142606 0.17231293    A    50 0.65811523 0.48901176 0.96854302 0.875838825     4
# 13:    E    15 0.4417452 0.03226111 0.5975499 0.49336668    E    17 0.93484739 0.57810132 0.75250483 0.607710552     2
# 14:    A    25 0.4491745 0.61724476 0.3283133 0.51406071    A    21 0.96610736 0.03222779 0.05768814 0.436536989     4
# 15:    A    25 0.4491745 0.61724476 0.3283133 0.51406071    A     6 0.69975907 0.35564120 0.42206040 0.309386788    19
# 16:    B    43 0.1152318 0.99716746 0.1440101 0.70734795    B    49 0.05138897 0.80463532 0.41856763 0.421029334     6
# 17:    C     6 0.1204828 0.47622000 0.6802176 0.36385191    C     9 0.98509395 0.49711655 0.68159049 0.003570911     3
# 18:    D     3 0.5967619 0.23497655 0.5429504 0.56322079    D     7 0.69862668 0.91597522 0.53630369 0.297000037     4
# 19:    C     6 0.1204828 0.47622000 0.6802176 0.36385191    C     8 0.80761410 0.87051653 0.93177628 0.671692311     2
# 20:    B     6 0.5652708 0.50866629 0.3992037 0.87643314    B     5 0.69493460 0.99878010 0.77953456 0.820925302     1

This almost gives me what I want, but now I have to mess around fixing column names, removing duplicate columns (i.e., ID), etc. unlike Jaap's elegant initial solution. That solution, however, required naming all columns.

My question: is there a way to get the best of both worlds and not have to specify each and every column, yet also get the solution is a clean format like in code block #3 above?

Related issue: [.SD in expression with j?](https://github.com/Rdatatable/data.table/issues/3115), and links therein. — Henrik, Oct 26 '18 at 14:02
Thanks, @Henrik. That's pretty much exactly my issue – and I had tried something similar to the second code block in the first post. It looks like [PR #3093](https://github.com/Rdatatable/data.table/pull/3093) would solve my question. — Dan, Oct 26 '18 at 14:20

score 2 · Accepted Answer · answered Oct 26 '18 at 14:32

Here's a workaround thanks to a post by sritchie73 at the link @Henrik provided in his comment above. One solution is to copy the variables which are used in the join prior to the join so that they're retained in the result and can be used in the calculation.

# Copy loc variables
dt1[, loc1 := loc]
dt2[, loc2 := loc]

# Perform join, calculate delta, drop loc1 & loc2    
dt2[dt1,
    on = c("ID", "loc"),
    roll = "nearest"][
      , delta := abs(loc1 - loc2)][
        , c("loc1", "loc2") := NULL][]

which gives,

#     ID loc         e          f         g          h          a          b          c           d delta
#  1:  E   2 0.6080648 0.59558616 0.9680243 0.65885155 0.75533475 0.46796072 0.07874670 0.372224933     1
#  2:  B  22 0.2900181 0.89395076 0.5012072 0.81403388 0.24129711 0.66914193 0.11941211 0.330982361     5
#  3:  C  23 0.7753557 0.31772779 0.3302613 0.02004258 0.32252276 0.09341920 0.29665070 0.563954195     6
#  4:  A  46 0.1193827 0.89183103 0.7142606 0.17231293 0.62979589 0.19621242 0.48943734 0.318145133     0
#  5:  B  26 0.2900181 0.89395076 0.5012072 0.81403388 0.65672029 0.45106318 0.47421905 0.605327569     1
#  6:  E  17 0.4417452 0.03226111 0.5975499 0.49336668 0.83821385 0.99078941 0.93356571 0.459227328     2
#  7:  D  24 0.8974042 0.90725532 0.5008502 0.21681072 0.86831894 0.41260922 0.65389531 0.930843432    14
#  8:  D  24 0.8974042 0.90725532 0.5008502 0.21681072 0.82042112 0.82906524 0.59829109 0.859362233    14
#  9:  D  44 0.3958956 0.06361996 0.8068514 0.56349064 0.29823590 0.04765864 0.65412304 0.742808806     1
# 10:  E  11 0.4417452 0.03226111 0.5975499 0.49336668 0.15013055 0.83683385 0.18847332 0.139363770     4
# 11:  D  11 0.5967619 0.23497655 0.5429504 0.56322079 0.68644344 0.46995509 0.35128292 0.910443478     8
# 12:  A  50 0.1193827 0.89183103 0.7142606 0.17231293 0.65811523 0.48901176 0.96854302 0.875838825     4
# 13:  E  17 0.4417452 0.03226111 0.5975499 0.49336668 0.93484739 0.57810132 0.75250483 0.607710552     2
# 14:  A  21 0.4491745 0.61724476 0.3283133 0.51406071 0.96610736 0.03222779 0.05768814 0.436536989     4
# 15:  A   6 0.4491745 0.61724476 0.3283133 0.51406071 0.69975907 0.35564120 0.42206040 0.309386788    19
# 16:  B  49 0.1152318 0.99716746 0.1440101 0.70734795 0.05138897 0.80463532 0.41856763 0.421029334     6
# 17:  C   9 0.1204828 0.47622000 0.6802176 0.36385191 0.98509395 0.49711655 0.68159049 0.003570911     3
# 18:  D   7 0.5967619 0.23497655 0.5429504 0.56322079 0.69862668 0.91597522 0.53630369 0.297000037     4
# 19:  C   8 0.1204828 0.47622000 0.6802176 0.36385191 0.80761410 0.87051653 0.93177628 0.671692311     2
# 20:  B   5 0.5652708 0.50866629 0.3992037 0.87643314 0.69493460 0.99878010 0.77953456 0.820925302     1

Retaining all columns in both data tables during a join, then adding a column

1 Answers1