Is there a computationally faster way of doing this data transformation?

Question

I'm really struggling with how to pull this transformation without resorting to loops. My dataframe is in a long format so that each year, grade, and school combination has its own row.

I'm wanting to divide a value for each year + grade + school by the value for the previous year and grade. The code below should do that for me, but with 500k rows, it will take days to complete.

Any ideas on how I could do this faster?

I've tried using dplyr but wasn't getting anywhere with it. Same with standard base R transformation approaches.

for (i in 1:NROW(df)) {
  for (j in 1:NROW(df)) {
    if(df$COUNTY[i] == df$COUNTY[j] & 
       df$YEAR[i] == (df$YEAR[j] + 1) & 
       df$Grade[i] == (df$Grade[j] + 1)){

      df$RATE[i] <- df$value[i] / df$value[j]

    } else{

      next

    }
  }
  if(i %% 10 == 0){print(i)}
}

Data:

structure(list(YEAR = c(2011, 2011, 2011, 2011, 2011, 2012, 2012, 
2012, 2012, 2012, 2013, 2013, 2013, 2013, 2013, 2014, 2014, 2014, 
2014, 2014), Grade = c(-1, 0, 1, 2, 3, -1, 0, 1, 2, 3, -1, 0, 
1, 2, 3, -1, 0, 1, 2, 3), COUNTY = structure(c(1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L
), .Label = c("001", "002", "003", "004", "005", "006", "007", 
"008", "009", "010", "011", "012", "013", "014", "015", "016", 
"017", "018", "019", "020", "021", "022", "023", "024", "025", 
"026", "027", "028", "029", "030", "031", "032", "033", "034", 
"035", "036", "037", "038", "039", "040", "041", "042", "043", 
"044", "045", "046", "047", "048", "049", "050", "051", "052", 
"053", "054", "055", "056", "057", "058", "059", "060", "061", 
"062", "063", "064", "065", "066", "067", "068", "069", "070", 
"071", "072", "073", "074", "075", "076", "077", "078", "079", 
"080", "081", "082", "083", "084", "085", "086", "087", "088", 
"089", "090", "091", "092", "093", "094", "095", "096", "097", 
"098", "099", "100", "101", "102", "103", "104", "105", "106", 
"107", "108", "109", "110", "111", "112", "113", "114", "115", 
"126", "145", "166", "201", "347", "401", "640", "KCS"), class = "factor"), 
    value = c(178, 212, 208, 208, 242, 199, 230, 227, 208, 208, 
    187, 245, 235, 216, 204, 189, 235, 250, 226, 217)), row.names = c(NA, 
-20L), class = c("grouped_df", "tbl_df", "tbl", "data.frame"), .internal.selfref = <pointer: 0x000001d7929a1ef0>, groups = structure(list(
    YEAR = c(2011, 2011, 2011, 2011, 2011, 2012, 2012, 2012, 
    2012, 2012, 2013, 2013, 2013, 2013, 2013, 2014, 2014, 2014, 
    2014, 2014), Grade = c(-1, 0, 1, 2, 3, -1, 0, 1, 2, 3, -1, 
    0, 1, 2, 3, -1, 0, 1, 2, 3), .rows = list(1L, 2L, 3L, 4L, 
        5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 15L, 16L, 
        17L, 18L, 19L, 20L)), row.names = c(NA, -20L), class = c("tbl_df", 
"tbl", "data.frame")))

please provide the result of doing `dput(df)` so that we know what your data looks like. If your data is too big, use `dput(head(df))` — boski, Sep 30 '19 at 15:36
Should I copy-paste the results directly into the textbox, or is there a way it should be formatted? — dmacfour, Sep 30 '19 at 15:38
@dmacfour read through this: https://stackoverflow.com/questions/5963269/how-to-make-a-great-r-reproducible-example you don't need to share all of your data. You can share couple rows from each year (a minimal example that can reproduce your problem/question). — M--, Sep 30 '19 at 15:41
Note that the data sample does not contain the column `SCHOOL`, and contains no information that would yield a positive result. It is not really reproducible in this case. — Oliver, Sep 30 '19 at 16:04
Just a small trick: If you have to use for-loop, using a standalone vector instead of a column in your huge data.frame may significantly speed up your for-loop. For instance, generate a vector `COUNTY = df$COUNTY` and replace `df$COUNTY` with `COUNTY` in your for-loop. And do the same thing for `YEAR`, `RATE` and `Grade`. Let `df$RATE` equal `RATE` afterwards. — Fan Wang, Oct 01 '19 at 20:16

score 2 · Answer 1 · answered Sep 30 '19 at 16:30

2

Using data.table

library(data.table)
setDT(df)[order(YEAR),  Ratio := value/shift(value) , .(COUNTY, Grade)]

answered Sep 30 '19 at 16:30

akrun

874,273
37
540
662

score 1 · Accepted Answer · answered Sep 30 '19 at 16:21

The simplest way to speed this up, is by eliminating the inner for-loop and using vectorized functions instead.

Example:

df$RATE2 <- 0
for(i in seq(nrow(df))){
    indx <- which(df$COUNTY[i] == df$COUNTY & 
                  df$YEAR[i] == (df$YEAR + 1) & 
                  df$Grade[i] == (df$Grade + 1))
    if((n <- length(indx)) > 1)
        stop("Error, rowcount too great!")
    else if(n == 1)
        df$RATE2[i] <- df$value[i] / df$value[indx]
}
all.equal(df$RATE, df$RATE2)
[1] TRUE

Note that & will perform comparison across all elements in the vectors, thus the logical statement will return a TRUE or FALSE for each row in the data frame. For convenience (and usualy without loosing much if any time) i convert it to an index vector using which, and if the length is only 1 (not overwriting 1 element with multiple) i overwrite the value of RATE2 with the appropriate index.

Comparison:

microbenchmark:::microbenchmark(original = {
    df$RATE <- 0
    for (i in 1:NROW(df)) {
        for (j in 1:NROW(df)) {
            if(df$COUNTY[i] == df$COUNTY[j] && 
               df$YEAR[i] == (df$YEAR[j] + 1) && 
               df$Grade[i] == (df$Grade[j] + 1)){
                df$RATE[i] <- df$value[i] / df$value[j]
            }
        }
    }
}, improved = {
    df$RATE2 <- 0
    for(i in seq(nrow(df))){
        indx <- which(df$COUNTY[i] == df$COUNTY & df$YEAR[i] == (df$YEAR + 1) & df$Grade[i] == (df$Grade + 1))
        if((n <- length(indx)) > 1)
            stop("Error, rowcount too great!")
        else if(n == 1)
            df$RATE2[i] <- df$value[i] / df$value[indx]
    }

})
#output:
Unit: milliseconds
     expr       min        lq      mean    median        uq      max neval
 original 15.452877 19.751258 26.944155 23.750028 33.886566 70.93348   100
 improved  1.020224  1.221664  2.121345  1.730265  2.311173 17.56658   100

From the median time we see that we now only use 1.73/23.75 * 100 = 7.3 % of the original time to perform the calculations.

Note that using apply would not speed up this process, it is the usage of vectorized functions that does the job here. Further note that i slightly altered the code for the original function to use && and removed the redundant else part. This speeds up this version of the code slightly.

Thank you! I don't fully understand what is happening with which(), but the end result was correct. — dmacfour, Sep 30 '19 at 16:34
To illustrate the function `which`, consider `which(c(FALSE, TRUE, FALSE))`, which returns `2` and `which(c(FALSE, TRUE, TRUE))` which returns `c(2,3)`. Basically it just returns which indices are true. I just prefer having indices instead of a vector of true/false to work with. — Oliver, Sep 30 '19 at 16:36

score 1 · Answer 3 · answered Sep 30 '19 at 16:28

In dplyr you can just do this, which should be pretty quick...

library(dplyr)
df <- df %>% group_by(COUNTY, Grade) %>%      #for your df above, but replace with SCHOOL or whatever
             arrange(YEAR) %>%                #sort by increasing year
             mutate(Ratio = value/lag(value)) #value for year / value for previous year

df
    YEAR Grade COUNTY value  Ratio
   <dbl> <dbl> <fct>  <dbl>  <dbl>
 1  2011    -1 001      178 NA    
 2  2011     0 001      212 NA    
 3  2011     1 001      208 NA    
 4  2011     2 001      208 NA    
 5  2011     3 001      242 NA    
 6  2012    -1 001      199  1.12 
 7  2012     0 001      230  1.08 
 8  2012     1 001      227  1.09 
 9  2012     2 001      208  1    
10  2012     3 001      208  0.860
11  2013    -1 001      187  0.940
12  2013     0 001      245  1.07 
13  2013     1 001      235  1.04 
14  2013     2 001      216  1.04 
15  2013     3 001      204  0.981
16  2014    -1 001      189  1.01 
17  2014     0 001      235  0.959
18  2014     1 001      250  1.06 
19  2014     2 001      226  1.05 
20  2014     3 001      217  1.06

Parfait · Answer 4 · 2019-09-30T16:47:27.177

Simply use the vectorized ifelse off of shifted columns. Below assumes COUNTY is a factor variable (not character):

# SHIFT COLUMNS FORWARD
df$COUNTY_SHIFT <- factor(levels(df$COUNTY)[c(0, df$COUNTY[1:(nrow(df)-1)])]
df$YEAR_SHIFT <- c(NA, df$YEAR[1:(nrow(df)-1)])
df$Grade_SHIFT <- c(NA, df$Grade[1:(nrow(df)-1)])
df$value_SHIFT <- c(NA, df$value[1:(nrow(df)-1)])

# CONDITIONALLY ASSIGN
df$RATE <- ifelse(df$COUNTY == df$COUNTY_SHIFT & 
                  df$YEAR == df$YEAR_SHIFT & 
                  df$Grade == df$Grade_SHIFT,
                  df$value / df$value_SHIFT,
                  NA)

Or all inside a within context:

df <- within(df, {
         # SHIFT COLUMNS FORWARD
         COUNTY_SHIFT <- factor(levels(COUNTY)[c(0, COUNTY[1:(nrow(df)-1)])]
         YEAR_SHIFT <- c(NA, YEAR[1:(nrow(df)-1)])
         Grade_SHIFT <- c(NA, Grade[1:(nrow(df)-1)])
         value_SHIFT <- c(NA, value[1:(nrow(df)-1)])

         # CONDITIONALLY ASSIGN
         RATE <- ifelse(COUNTY == COUNTY_SHIFT & 
                        YEAR == YEAR_SHIFT & 
                        Grade == Grade_SHIFT,
                        value / value_SHIFT,
                        NA)
         # REMOVE HELPER COLUMNS
         rm(COUNTY_SHIFT, YEAR_SHIFT, Grade_SHIFT, value_SHIFT)
})

Alternatively, merge on a shifted data frame:

df$ID <- 1:nrow(df)
shifted_df <- merge(transform(df, ID=ID-1), df[-1,], by="ID", suffixes=c("", "_SHIFT"))

final_df <- within(shifted_df , {   
         # CONDITIONALLY ASSIGN
         RATE <- ifelse(COUNTY == COUNTY_SHIFT & 
                        YEAR == YEAR_SHIFT & 
                        Grade == Grade_SHIFT,
                        value / value_SHIFT,
                        NA)
         # REMOVE HELPER COLUMNS
         rm(COUNTY_SHIFT, YEAR_SHIFT, Grade_SHIFT, value_SHIFT)
})

Is there a computationally faster way of doing this data transformation?

4 Answers4