3

I'm trying to match two data tables by an equality condition and two inequality conditions simultaneously. These are the tables and the desired output using sqldf:

library(data.table)
library(sqldf)
set.seed(1)
DT1 = data.table(x = c(1,2,3,4,5),
                y = c(15,25,35,45,55),
                z = rnorm(5))

DT2 = data.table(a = c(2,4,5,8,9),
                 y_start = c(20,11,54,31,60),
                 y_end = c(27,14,55,37,70),
                 t = sample(1000:2000,size = 5))

x  y  z
1  15  -0.626453811
2  25  0.183643324
3  35  -0.835628612
4  45  1.595280802
5  55  0.329507772

a  y_start  y_end  t
2  20  27  1206
4  11  14  1176
5  54  55  1686
8  31  37  1383
9  60  70  1767

output = sqldf("select DT1.*, DT2.t
                from DT1 left join DT2
                                on DT1.x = DT2.a
                               and DT1.y >= DT2.y_start
                               and DT1.y <= DT2.y_end")

#desired output
x  y  z  t
1  15  -0.626453811  NA
2  25  0.183643324  1206
3  35  -0.835628612  NA
4  45  1.595280802  NA
5  55  0.329507772  1686

Trying to achieve the same output with data.table I managed to come up with the following code, but the ouput is not exactly what i need:

DT = DT1[DT2, on  = .(x = a, y >= y_start, y <= y_end),  nomatch = 0L]
x  y  z             y.1 t
2  20  0.183643324  27  1206
5  54  0.329507772  55  1686

I can work with this and append the missing rows from DT1 and also drop column y.1, but maybe there is a way to achieve this directly?

zx8754
  • 52,746
  • 12
  • 114
  • 209
BogdanC
  • 1,316
  • 3
  • 16
  • 36
  • 1
    @Aramis7d didn't see that question, the `dplyr` formulation probably hid it. The `data.table` answer there is great – BogdanC May 18 '18 at 11:41

2 Answers2

3

You can also include the selection of variables within the first query.

DT2[DT1, .(x, y, z, t), on = .(a = x, y_start <= y, y_end >= y)]
#   x  y          z    t
#1: 1 15 -0.6264538   NA
#2: 2 25  0.1836433 1206
#3: 3 35 -0.8356286   NA
#4: 4 45  1.5952808   NA
#5: 5 55  0.3295078 1686

Regards!

0

We could join the reverse join

DT2[DT1, on = .(a= x, y_start <= y, y_end >= y)][, .(x = a, y = y_start, z, t)]
#   x  y          z    t
#1: 1 15 -0.6264538   NA
#2: 2 25  0.1836433 1206
#3: 3 35 -0.8356286   NA
#4: 4 45  1.5952808   NA
#5: 5 55  0.3295078 1686
akrun
  • 874,273
  • 37
  • 540
  • 662