
DUMMY DATA SET (the difference from my real data set is that item_code is a string in my case):

in_cluster <- data.frame(item_code = c(1:500))   # 500 items
in_cluster$cluster <-
        sample(5, size = nrow(in_cluster), replace = TRUE)   # each item belongs to one of 5 clusters
real_sales <-
    data.frame(
            item_code = sample(500, size = 100000, replace = TRUE),
            sales = sample(500, size = 100000, replace = TRUE)
    )

mean_trajectory <- data.frame(sales = sample(500, size = 52, replace = TRUE))   # 52 weeks of mean sales
training_df <- data.frame(
        LTF_t_minus_1 = numeric(0),
        LTF_t = numeric(0),
        LTF_t_plus_1 = numeric(0),
        RS_t_minus_1 = numeric(0),
        RS_t = numeric(0),
        STF_t_plus_1 = numeric(0)
)
training_df[nrow(training_df) + 1, ] <-
        c(0, 0, mean_trajectory$sales[[1]], 0, 0, 19) # week 0

week = 2

I have a simple loop in R in which all I do is:

library(dplyr)   # for bind_rows()

system.time({
    for (r in 1:nrow(in_cluster)) {
            item <- in_cluster[r,]
            sale_row <-
                    dplyr::filter(real_sales, item_code == item$item_code)
            if (nrow(sale_row) > 2) {
                    new_df <- data.frame(
                            LTF_t_minus_1 = mean_trajectory$sales[[week - 1]],
                            LTF_t = mean_trajectory$sales[[week]],
                            LTF_t_plus_1 = mean_trajectory$sales[[week + 1]],
                            RS_t_minus_1 = sale_row$sales[[week - 1]],
                            RS_t = sale_row$sales[[week]],
                            STF_t_plus_1 = sale_row$sales[[week + 1]]
                    )
                    training_df <-
                            bind_rows(training_df, new_df)
            }
    }
}) 

I am quite new to R and this seems really strange given how small the data is, yet the loop takes 421.59 seconds to go through the 500 rows of in_cluster.

EDIT_IMPORTANT: However, for the dummy data set above it took only 1.10 seconds to get the output. Could this be because item_code is a string in my real data? Does processing a string item_code really take that much longer? (I didn't use strings for the dummy data because I don't know how to create 500 unique strings for item_code in in_cluster and reuse the same strings as item_code in real_sales.)
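
One rough way to build 500 unique string codes and reuse the same pool in both data frames, in case that helps test the string hypothesis (the ITEM_ prefix is just made up):

# Sketch: 500 unique string item codes shared by both data frames
codes <- sprintf("ITEM_%03d", 1:500)

in_cluster <- data.frame(item_code = codes, stringsAsFactors = FALSE)
in_cluster$cluster <- sample(5, size = nrow(in_cluster), replace = TRUE)

real_sales <- data.frame(
  item_code = sample(codes, size = 100000, replace = TRUE),
  sales = sample(500, size = 100000, replace = TRUE),
  stringsAsFactors = FALSE
)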

I read through a few other articles which suggested ways to optimize R code, and I used bind_rows instead of rbind or instead of assigning to a new row by index like this:

training_df[nrow(training_df) + 1, ] <-
    c(mean_trajectory$sales[[week - 1]],
      mean_trajectory$sales[[week]],
      mean_trajectory$sales[[week + 1]],
      sale_row$sales[[week - 1]],
      sale_row$sales[[week]],
      sale_row$sales[[week + 1]])

Using bind_rows seems to have improved the performance by about 36 seconds when looping through the 500 rows of in_cluster.

Is it possible to use lapply in this scenario? I tried the code below and got an error:

Error in filter_impl(.data, dots) : $ operator is invalid for atomic vectors

myfun <- function(item, sales, mean_trajectory, week) {
  sale_row <- filter(sales, item_code == item$item_code)
  data.frame(
    LTF_t_minus_1 = mean_trajectory$sales[[week - 1]],
    LTF_t = mean_trajectory$sales[[week]],
    LTF_t_plus_1 = mean_trajectory$sales[[week + 1]],
    RS_t_minus_1 = sale_row$sales[[week - 1]],
    RS_t = sale_row$sales[[week]],
    STF_t_plus_1 = sale_row$sales[[week + 1]])
}

system.time({
      lapply(in_cluster, myfun, sales = real_sales, mean_trajectory = mean_trajectory, week = week) %>% bind_rows()
})

Help with lapply would be appreciated; however, my main target is to speed up the loop.
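
For reference, lapply(in_cluster, ...) iterates over the columns of the data frame (atomic vectors), not the rows, so item$item_code fails; that seems to be what triggers the $ error. A rough, unbenchmarked sketch that loops over the item codes instead and binds the results once at the end:

library(dplyr)

# Sketch only: iterate over item codes (not over the data frame, which
# would iterate over its columns) and bind the collected rows once.
myfun <- function(code, sales, mean_trajectory, week) {
  sale_row <- filter(sales, item_code == code)
  if (nrow(sale_row) <= 2) return(NULL)
  data.frame(
    LTF_t_minus_1 = mean_trajectory$sales[[week - 1]],
    LTF_t = mean_trajectory$sales[[week]],
    LTF_t_plus_1 = mean_trajectory$sales[[week + 1]],
    RS_t_minus_1 = sale_row$sales[[week - 1]],
    RS_t = sale_row$sales[[week]],
    STF_t_plus_1 = sale_row$sales[[week + 1]]
  )
}

training_list <- lapply(in_cluster$item_code, myfun,
                        sales = real_sales,
                        mean_trajectory = mean_trajectory,
                        week = week)
training_df2 <- bind_rows(training_list)   # NULL elements are dropped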

    Please include a [reproducible example](http://stackoverflow.com/questions/5963269/how-to-make-a-great-r-reproducible-example/5963610), this will make it much easier for others to help you. – Jaap Apr 23 '17 at 17:13
  • @Jaap sure, was on it. Thanks though. – ro ko Apr 23 '17 at 17:15

1 Answer


OK, so there are a lot of bad practices in your code:

  1. You are operating per row
  2. You are creating 2(!) new data frames per row (very expensive)
  3. You are growing an object in a loop: training_df <- bind_rows(training_df, new_df) keeps growing on every iteration while running a fairly expensive operation (bind_rows) each time (a tiny illustration follows this list)
  4. You are running the same operations over and over again when you could run them just once (why compute mean_trajectory$sales[[week - 1]] and the like per row when mean_trajectory has nothing to do with the loop? You could just assign those columns afterwards).
  5. And the list goes on...
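
A tiny, self-contained illustration of point 3 (timings will vary by machine; n is kept small on purpose):

library(dplyr)

# Growing a data frame row by row copies the whole object on every
# iteration, while collecting the rows in a list and binding once does not.
grow <- function(n) {
  out <- data.frame(x = numeric(0))
  for (i in 1:n) out <- bind_rows(out, data.frame(x = i))
  out
}
bind_once <- function(n) {
  bind_rows(lapply(1:n, function(i) data.frame(x = i)))
}

system.time(grow(2000))       # slow: repeated copying
system.time(bind_once(2000))  # much faster: one bind at the end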

I would suggest an alternative, simple data.table solution which will perform much better. The idea is to first make a binary join between in_cluster and real_sales and run all the operations while joining, without creating extra data frames and then binding them. Then, run all the mean_trajectory-related assignments only once. (I ignored the training_df[nrow(training_df) + 1, ] <- c(0, 0, mean_trajectory$sales[[1]], 0, 0, 19) initialization as it's irrelevant here; you can just add it afterwards using rbind, as sketched after the code below.)

library(data.table) #v1.10.4
## First step
res <-
  setDT(real_sales)[setDT(in_cluster), # binary join
                  if(.N > 2) .(RS_t_minus_1 = sales[week - 1], # The stuff you want to do
                               RS_t = sales[week],             # by condition
                               STF_t_plus_1 = sales[week + 1]), 
                  on = "item_code", # The join key
                  by = .EACHI] # Do the operations per each join

## Second step (run the `mean_trajectory` only once)
res[, `:=`(LTF_t_minus_1 = mean_trajectory$sales[week - 1],
           LTF_t = mean_trajectory$sales[week],
           LTF_t_plus_1 = mean_trajectory$sales[week + 1])]
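
For completeness, a rough sketch of adding the week-0 initialization row back afterwards (the NA item_code is a placeholder; use NA_character_ if item_code is a string in your data):

## Optional: prepend the week-0 initialization row (sketch only)
week0 <- data.table(item_code     = NA,
                    RS_t_minus_1  = 0,
                    RS_t          = 0,
                    STF_t_plus_1  = 19,
                    LTF_t_minus_1 = 0,
                    LTF_t         = 0,
                    LTF_t_plus_1  = mean_trajectory$sales[[1]])
res <- rbind(week0, res, use.names = TRUE)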

Some benchmarks:

### Creating your data sets
set.seed(123)
N <- 1e5
N2 <- 5e7

in_cluster <- data.frame(item_code = c(1:N))

real_sales <-
  data.frame(
    item_code = sample(N, size = N2, replace = TRUE),
    sales = sample(N, size = N2, replace = TRUE)
  )

mean_trajectory <- data.frame(sales = sample(N, size = 25, replace = TRUE))

training_df <- data.frame(
  LTF_t_minus_1 = numeric(0),
  LTF_t = numeric(0),
  LTF_t_plus_1 = numeric(0),
  RS_t_minus_1 = numeric(0),
  RS_t = numeric(0),
  STF_t_plus_1 = numeric(0)
)
week = 2

###############################
################# Your solution
library(dplyr)   # for bind_rows()
system.time({
  for (r in 1:nrow(in_cluster)) {
    item <- in_cluster[r,, drop = FALSE]
    sale_row <-
      dplyr::filter(real_sales, item_code == item$item_code)
    if (nrow(sale_row) > 2) {
      new_df <- data.frame(
        LTF_t_minus_1 = mean_trajectory$sales[[week - 1]],
        LTF_t = mean_trajectory$sales[[week]],
        LTF_t_plus_1 = mean_trajectory$sales[[week + 1]],
        RS_t_minus_1 = sale_row$sales[[week - 1]],
        RS_t = sale_row$sales[[week]],
        STF_t_plus_1 = sale_row$sales[[week + 1]]
      )
      training_df <-
        bind_rows(training_df, new_df)
    }
  }
}) 
### Ran forever - I killed it after half an hour


######################
########## My solution
library(data.table)
system.time({
res <-
  setDT(real_sales)[setDT(in_cluster), 
                  if(.N > 2) .(RS_t_minus_1 = sales[week - 1],
                               RS_t = sales[week],
                               STF_t_plus_1 = sales[week + 1]), 
                  on = "item_code",
                  by = .EACHI]
res[, `:=`(LTF_t_minus_1 = mean_trajectory$sales[week - 1],
           LTF_t = mean_trajectory$sales[week],
           LTF_t_plus_1 = mean_trajectory$sales[week + 1])]
})

# user  system elapsed 
# 2.42    0.05    2.47 

So for 50 million rows the data.table solution ran in about 2 seconds, while your solution ran endlessly until I killed it (after half an hour).
