R - more effective left_join

Question

I have got two dataframes - one containing names and ranges of limits (only few hundreds of rows, 1000 at most), which needs to be assigned to a "measurements" dataframe which can consist of million of rows (or ten's of millions of row).

Currently I am doing left_join and filtering value to get a specific limit assigned to each measurement. This however is quite ineffective and cost a lot of resources. For larger dataframes, the code is even unable to run.

Any ideas for more effective solutions will be helpful.

library(dplyr)

## this one has got only few houndreds rows
df_limits <- read.table(text="Title station_id  limit_from  limit_to
Level_3_Low 1   0   70
Level_2_Low 1   70  90
Level_1_Low 1   90  100
Optimal 1   100 110
Level_1_High    1   110 130
Level_2_High    1   130 150
Level_3_High    1   150 180
Level_3_Low 2   0   70
Level_2_Low 2   70  90
Level_1_Low 2   90  100
Optimal 2   100 110
Level_1_High    2   110 130
Level_2_High    2   130 150
Level_3_High    2   150 180
Level_3_Low 3   0   70
Level_2_Low 3   70  90
Level_1_Low 3   90  100
Optimal 3   100 110
Level_1_High    3   110 130
Level_2_High    3   130 150
Level_3_High    3   150 180
",header  = TRUE, stringsAsFactors = TRUE)


 # this DF has got millions of rows
df_measurements <- read.table(text="measurement_id  station_id  value
12121534    1   172
12121618    1   87
12121703    1   9
12121709    2   80
12121760    2   80
12121813    2   115
12121881    3   67
12121907    3   100
12121920    3   108
12121979    1   102
12121995    1   53
12122022    1   77
12122065    2   158
12122107    2   144
12122113    2   5
12122135    3   100
12122187    3   136
12122267    3   130
12122359    1   105
12122366    1   126
12122398    1   143
",header  = TRUE, stringsAsFactors = TRUE)


df_results <- left_join(df_measurements,df_limits, by = "station_id") %>% 
              filter ((value >= limit_from & value < limit_to) | is.na(Title)) %>%
              select(names(df_measurements), Title)

Have you tried `data.table` solutions from this post? https://stackoverflow.com/questions/24480031/overlap-join-with-start-and-end-positions — Ronak Shah, Aug 06 '21 at 12:16
I've tried fuzzy_left_join. The data.table solution doesn't seems to work for me, I am trying: df_measurements[df_limits, .(x, station_id=x.station_id, limit_from, limit_to), on=.(x, value>=limit_from, value — Petrik, Aug 06 '21 at 13:37
I've tried this as well: df_measurements[df_limits, .(station_id, value=x.value, limit_from, limit_to) , on=.(station_id, value>=limit_from, value — Petrik, Aug 06 '21 at 15:23
@Petrik What went wrong with your second attempt (using `data.table`)? — Greg, Aug 09 '21 at 14:30

Waldi · Accepted Answer · 2021-08-09T18:19:55.413

Another data.table solution using non-equijoins:

library(data.table)

setDT(df_measurements)
setDT(df_limits) 

df_limits[df_measurements, .(station_id, measurement_id, value, Title),
          on=.(station_id = station_id, limit_from < value, limit_to >= value)]

   station_id measurement_id value        Title
 1:          1       12121534   172 Level_3_High
 2:          1       12121618    87  Level_2_Low
 3:          1       12121703     9  Level_3_Low
 4:          2       12121709    80  Level_2_Low
 5:          2       12121760    80  Level_2_Low
 6:          2       12121813   115 Level_1_High
 7:          3       12121881    67  Level_3_Low
 8:          3       12121907   100  Level_1_Low
 9:          3       12121920   108      Optimal
10:          1       12121979   102      Optimal
11:          1       12121995    53  Level_3_Low
12:          1       12122022    77  Level_2_Low
13:          2       12122065   158 Level_3_High
14:          2       12122107   144 Level_2_High
15:          2       12122113     5  Level_3_Low
16:          3       12122135   100  Level_1_Low
17:          3       12122187   136 Level_2_High
18:          3       12122267   130 Level_1_High
19:          1       12122359   105      Optimal
20:          1       12122366   126 Level_1_High
21:          1       12122398   143 Level_2_High

ThomasIsCoding · Answer 2 · 2021-08-10T09:04:02.707

A simple base R (no need additional packages) option using subset + merge

subset(
  merge(
    df_measurements,
    df_limits,
    all = TRUE
  ),
  limit_from < value & limit_to >= value
)

gives

    station_id measurement_id value        Title limit_from limit_to
7            1       12121534   172 Level_3_High        150      180
9            1       12121618    87  Level_2_Low         70       90
15           1       12121703     9  Level_3_Low          0       70
23           1       12122022    77  Level_2_Low         70       90
34           1       12122398   143 Level_2_High        130      150
39           1       12121979   102      Optimal        100      110
43           1       12121995    53  Level_3_Low          0       70
54           1       12122366   126 Level_1_High        110      130
60           1       12122359   105      Optimal        100      110
65           2       12121760    80  Level_2_Low         70       90
75           2       12121813   115 Level_1_High        110      130
79           2       12121709    80  Level_2_Low         70       90
91           2       12122065   158 Level_3_High        150      180
97           2       12122107   144 Level_2_High        130      150
99           2       12122113     5  Level_3_Low          0       70
108          3       12121907   100  Level_1_Low         90      100
116          3       12121920   108      Optimal        100      110
124          3       12122267   130 Level_1_High        110      130
127          3       12121881    67  Level_3_Low          0       70
136          3       12122135   100  Level_1_Low         90      100
146          3       12122187   136 Level_2_High        130      150

Another option is using dplyr

  df_measurements %>%
    group_by(station_id) %>%
    mutate(Title = with(
      df_limits,
      Title[
        findInterval(
          value,
          unique(unlist(cbind(limit_from, limit_to)[station_id == first(.$station_id)])),
          left.open = TRUE
        )
      ]
    )) %>%
    ungroup()

which gives

# A tibble: 21 x 4
   measurement_id station_id value Title       
            <int>      <int> <int> <fct>
 1       12121534          1   172 Level_3_High
 2       12121618          1    87 Level_2_Low
 3       12121703          1     9 Level_3_Low
 4       12121709          2    80 Level_2_Low
 5       12121760          2    80 Level_2_Low
 6       12121813          2   115 Level_1_High
 7       12121881          3    67 Level_3_Low
 8       12121907          3   100 Level_1_Low
 9       12121920          3   108 Optimal
10       12121979          1   102 Optimal
# ... with 11 more rows

Benchmarking


f_TIC1 <- function() {
  subset(
    merge(
      df_measurements,
      df_limits,
      all = TRUE
    ),
    limit_from < value & limit_to >= value
  )
}

f_TIC2 <- function() {
  df_measurements %>%
    group_by(station_id) %>%
    mutate(Title = with(
      df_limits,
      Title[
        findInterval(
          value,
          unique(unlist(cbind(limit_from, limit_to)[station_id == first(station_id)])),
          left.open = TRUE
        )
      ]
    )) %>%
    ungroup()
}


dt_limits <- as.data.table(df_limits)
dt_measurements <- as.data.table(df_measurements)
f_Waldi <- function() {
  dt_limits[
    dt_measurements,
    .(station_id, measurement_id, value, Title),
    on = .(station_id, limit_from < value, limit_to >= value)
  ]
}

f_TimTeaFan <- function() {
  setkey(dt_limits, station_id, limit_from, limit_to)
  foverlaps(dt_measurements[, value2 := value],
    dt_limits,
    by.x = c("station_id", "value", "value2"),
    type = "within",
  )[
    value < limit_to,
    .(measurement_id, station_id, value, Title)
  ]
}

you will see that

Unit: relative
          expr      min       lq     mean   median       uq      max neval
      f_TIC1() 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000   100
      f_TIC2() 4.848639 4.909985 4.895588 4.942616 5.124704 2.580819   100
     f_Waldi() 3.182027 3.010615 3.069916 3.114160 3.397845 1.698386   100
 f_TimTeaFan() 5.523778 5.112872 5.226145 5.112407 5.745671 2.446987   100

Nice one! Upvoted! To be tested on a larger dataset to see if performance holds on a cartesian product without index. — Waldi, Aug 10 '21 at 06:25
@Waldi I believe your `data.table` solution scales well with large dataset, and mine will be very slow. — ThomasIsCoding, Aug 10 '21 at 08:02

TimTeaFan · Answer 3 · 2021-08-09T11:56:22.283

Here is one way to do it. The problematic part was the condition value < limit_to. foverlaps checks for the condition value <= limit_to which results in double matches so here we call the filter condition after the overlapping join and then select the desired columns. Note that the result is not in the same order as the df_results generated with dplyr.

library(data.table)

dt_limits <- as.data.table(df_limits)
dt_measurements <- as.data.table(df_measurements)

setkey(dt_limits, station_id, limit_from, limit_to)

dt_results <- foverlaps(dt_measurements[, value2 := value],
                        dt_limits,
                        by.x = c("station_id", "value", "value2"),
                        type = "within",
                        )[value < limit_to,
                          .(measurement_id , station_id, value, Title)]
dt_results[]
#>     measurement_id station_id value        Title
#>  1:       12121534          1   172 Level_3_High
#>  2:       12121618          1    87  Level_2_Low
#>  3:       12121703          1     9  Level_3_Low
#>  4:       12121709          2    80  Level_2_Low
#>  5:       12121760          2    80  Level_2_Low
#>  6:       12121813          2   115 Level_1_High
#>  7:       12121881          3    67  Level_3_Low
#>  8:       12121907          3   100      Optimal
#>  9:       12121920          3   108      Optimal
#> 10:       12121979          1   102      Optimal
#> 11:       12121995          1    53  Level_3_Low
#> 12:       12122022          1    77  Level_2_Low
#> 13:       12122065          2   158 Level_3_High
#> 14:       12122107          2   144 Level_2_High
#> 15:       12122113          2     5  Level_3_Low
#> 16:       12122135          3   100      Optimal
#> 17:       12122187          3   136 Level_2_High
#> 18:       12122267          3   130 Level_2_High
#> 19:       12122359          1   105      Optimal
#> 20:       12122366          1   126 Level_1_High
#> 21:       12122398          1   143 Level_2_High
#>     measurement_id station_id value        Title

^{Created on 2021-08-09 by the reprex package (v0.3.0)}

R - more effective left_join

3 Answers3

Benchmarking