0

I am trying to filter data based on a value from another dataset in loop. I have 2 dataframes named x and mx_long2. Based on value of age and range from df x, I need to filter another dataframe mx_long2. Currently I am using simple filter function from dplyr and fuzzyjoin approach. Both are slow as actual loop is iteration of close to 20 millions. Is there any efficient way of doing this?

x <- structure(list(Ars_0 = c(1308.56, 5728.84, 2177.82), Ars_1 = c(0, 0, 0), 
                    Ars_2 = c(0, 0, 0), age = c(13, 31, 43), region = c('A','A','B')), 
               row.names = c(NA, -3L), 
               class = "data.frame")


mx_long2 = read.table(header = T, text = '
                      Arrears   Ars_0   Ars_1   Ars_2   Seasoning   Region
                      Ars_0 0.985   0.0148  0.0002  mths:36-47  A
                      Ars_1 0.3816  0.286   0.3317  mths:36-47  A
                      Ars_2 0.2959  0.0057  0.2524  mths:36-47  A
                      Ars_0 0.9822  0.0176  0.0002  mths:24-35  A
                      Ars_1 0.389   0.2753  0.3347  mths:24-35  A
                      Ars_2 0.3026  0.0334  0.2399  mths:24-35  A
                      Ars_0 0.9753  0.0243  0.0004  mths:12-23  A
                      Ars_1 0.4002  0.2592  0.3394  mths:12-23  A
                      Ars_2 0.3032  0.0208  0.2387  mths:12-23  A
                      Ars_0 0.8865  0.01332 0.00018 mths:36-47  B
                      Ars_1 0.34344 0.2574  0.29853 mths:36-47  B
                      Ars_2 0.26631 0.00513 0.22716 mths:36-47  B
                      Ars_0 0.88398 0.01584 0.00018 mths:24-35  B
                      Ars_1 0.3501  0.24777 0.30123 mths:24-35  B
                      Ars_2 0.27234 0.03006 0.21591 mths:24-35  B
                      Ars_0 0.87777 0.02187 0.00036 mths:12-23  B
                      Ars_1 0.36018 0.23328 0.30546 mths:12-23  B
                      Ars_2 0.27288 0.01872 0.21483 mths:12-23  B
                      ')


mx_long2 = mx_long2 %>% mutate(minage = as.numeric(substr(as.character(Seasoning), 6,7)),
                               maxage = as.numeric(substr(as.character(Seasoning), 9,10)))

Filter

l <- list()
for (i in 1:nrow(x))
{

m <- x %>% slice(i) %>% pull(age)
r <- x[i, 'region']
Bx = filter(mx_long2, (i + m) >= minage, (i + m) <= maxage, Region==r) %>%
  select(starts_with('Ars_')) %>% data.matrix() 

# Matrix Multiplication
l[[i]] <- data.matrix(m) %*% Bx
}

Second Approach : Fuzzyjoin

library(fuzzyjoin)
    Bx = fuzzy_inner_join(
      x[i, c('age', 'region')],
      mx_long2,
      by = c(
        "age" = "minage",
        "age" = "maxage",
        "region" = "Region"
      ),
      match_fun = list(`>=`, `<`, `==`)
    ) %>% select(starts_with('Ars_')) %>%
      data.matrix()
john
  • 1,026
  • 8
  • 19
  • 1
    Sounds like you need `data.table` non-equi join https://stackoverflow.com/a/56281648/786542 – Tung Nov 11 '19 at 18:10
  • yup something like `setDT(mx_long2)[, c("minage","maxage") := lapply(.(substring(Seasoning, 6, 7), substring(Seasoning, 9, 10)), as.numeric)][ setDT(x), on=.(Region=region, minage<=age, maxage>=age), nomatch=0L]` – chinsoon12 Nov 12 '19 at 01:05

0 Answers0