I'm really struggling with how to pull this transformation without resorting to loops. My dataframe is in a long format so that each year, grade, and school combination has its own row.
I'm wanting to divide a value for each year + grade + school by the value for the previous year and grade. The code below should do that for me, but with 500k rows, it will take days to complete.
Any ideas on how I could do this faster?
I've tried using dplyr but wasn't getting anywhere with it. Same with standard base R transformation approaches.
for (i in 1:NROW(df)) {
for (j in 1:NROW(df)) {
if(df$COUNTY[i] == df$COUNTY[j] &
df$YEAR[i] == (df$YEAR[j] + 1) &
df$Grade[i] == (df$Grade[j] + 1)){
df$RATE[i] <- df$value[i] / df$value[j]
} else{
next
}
}
if(i %% 10 == 0){print(i)}
}
Data:
structure(list(YEAR = c(2011, 2011, 2011, 2011, 2011, 2012, 2012,
2012, 2012, 2012, 2013, 2013, 2013, 2013, 2013, 2014, 2014, 2014,
2014, 2014), Grade = c(-1, 0, 1, 2, 3, -1, 0, 1, 2, 3, -1, 0,
1, 2, 3, -1, 0, 1, 2, 3), COUNTY = structure(c(1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L
), .Label = c("001", "002", "003", "004", "005", "006", "007",
"008", "009", "010", "011", "012", "013", "014", "015", "016",
"017", "018", "019", "020", "021", "022", "023", "024", "025",
"026", "027", "028", "029", "030", "031", "032", "033", "034",
"035", "036", "037", "038", "039", "040", "041", "042", "043",
"044", "045", "046", "047", "048", "049", "050", "051", "052",
"053", "054", "055", "056", "057", "058", "059", "060", "061",
"062", "063", "064", "065", "066", "067", "068", "069", "070",
"071", "072", "073", "074", "075", "076", "077", "078", "079",
"080", "081", "082", "083", "084", "085", "086", "087", "088",
"089", "090", "091", "092", "093", "094", "095", "096", "097",
"098", "099", "100", "101", "102", "103", "104", "105", "106",
"107", "108", "109", "110", "111", "112", "113", "114", "115",
"126", "145", "166", "201", "347", "401", "640", "KCS"), class = "factor"),
value = c(178, 212, 208, 208, 242, 199, 230, 227, 208, 208,
187, 245, 235, 216, 204, 189, 235, 250, 226, 217)), row.names = c(NA,
-20L), class = c("grouped_df", "tbl_df", "tbl", "data.frame"), .internal.selfref = <pointer: 0x000001d7929a1ef0>, groups = structure(list(
YEAR = c(2011, 2011, 2011, 2011, 2011, 2012, 2012, 2012,
2012, 2012, 2013, 2013, 2013, 2013, 2013, 2014, 2014, 2014,
2014, 2014), Grade = c(-1, 0, 1, 2, 3, -1, 0, 1, 2, 3, -1,
0, 1, 2, 3, -1, 0, 1, 2, 3), .rows = list(1L, 2L, 3L, 4L,
5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 15L, 16L,
17L, 18L, 19L, 20L)), row.names = c(NA, -20L), class = c("tbl_df",
"tbl", "data.frame")))