1

Consider the following data.frame:

data_frame = structure(list(AGREEDTIME = structure(c(1451785888.76968, 1451785945.59156, 
1451786041.22135, 1451786089.17713, 1451786671.55922, 1451786853.52841, 
1451787231.03475, 1451787641.43011, 1451787999.77345, 1451788571.08314, 
1451788695.76539, 1451788769.29787, 1451788891.90181, 1451789206.47645, 
1451789653.27497, 1451789740.52194, 1451789875.7906, 1451789971.7024, 
1451790030.94949, 1451790681.31701), class = c("POSIXct", "POSIXt"
), tzone = "UTC"), EA = c(2383, 1064, 848, 68, 2665, 277, 175, 
2761, 773, 426, 1164, 600, 413, 371, 733, 259, 976, 297, 1973, 
1022), FW = structure(c(2L, 2L, 1L, 1L, 3L, 1L, 2L, 2L, 2L, 3L, 
1L, 3L, 3L, 1L, 1L, 3L, 2L, 3L, 3L, 1L), .Label = c("NZNR", "WZNI", 
"HSCW"), class = "factor"), CP = structure(c(1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L
), .Label = "OZU/NQV", class = "factor")), .Names = c("AGREEDTIME", 
"EA", "FW", "CP"), row.names = c(NA, -20L), class = "data.frame")

or in more human readable form:

            AGREEDTIME   EA   FW      CP
1  2016-01-03 01:51:28 2383 WZNI OZU/NQV
2  2016-01-03 01:52:25 1064 WZNI OZU/NQV
3  2016-01-03 01:54:01  848 NZNR OZU/NQV
4  2016-01-03 01:54:49   68 NZNR OZU/NQV
5  2016-01-03 02:04:31 2665 HSCW OZU/NQV
6  2016-01-03 02:07:33  277 NZNR OZU/NQV
7  2016-01-03 02:13:51  175 WZNI OZU/NQV
8  2016-01-03 02:20:41 2761 WZNI OZU/NQV
9  2016-01-03 02:26:39  773 WZNI OZU/NQV
10 2016-01-03 02:36:11  426 HSCW OZU/NQV
11 2016-01-03 02:38:15 1164 NZNR OZU/NQV
12 2016-01-03 02:39:29  600 HSCW OZU/NQV
13 2016-01-03 02:41:31  413 HSCW OZU/NQV
14 2016-01-03 02:46:46  371 NZNR OZU/NQV
15 2016-01-03 02:54:13  733 NZNR OZU/NQV
16 2016-01-03 02:55:40  259 HSCW OZU/NQV
17 2016-01-03 02:57:55  976 WZNI OZU/NQV
18 2016-01-03 02:59:31  297 HSCW OZU/NQV
19 2016-01-03 03:00:30 1973 HSCW OZU/NQV
20 2016-01-03 03:11:21 1022 NZNR OZU/NQV

Now, my objective is, for each row i of this matrix,

sum the values of EA for all rows satisfying:

  • AGREEDTIME is in the window [AGREEDTIME(i)-3600sec, AGREEDTIME(i)-1800sec]
  • FW == FW[i]
  • CP == CP[i]

using naive R code, the actual results should be:

slow_function <- function(ind, data_frame){
    index     = data_frame[ind, ]
    index_set = which(data_frame$AGREEDTIME > index$AGREEDTIME - 3600 & 
            data_frame$AGREEDTIME < index$AGREEDTIME - 1800 &
            data_frame$FW == index$FW &
            data_frame$CP == index$CP)
    sum(data_frame$EA[index_set])
}
data_frame_results = data.frame(data_frame, results = sapply(1:nrow(data_frame), slow_function, data_frame = data_frame))

Now, I'm trying to use data.table to obtain the same results. My first stab is:

data_table  <- data.table(data_frame)
setkey(data_table, FW, CP, AGREEDTIME)

less_slow_function <- function(ind, data_table){
    ATm = data_table$AGREEDTIME[ind] - 3600
    ATM = data_table$AGREEDTIME[ind] - 1800
    fw  = data_table$FW[ind]
    cp  = data_table$CP[ind]
    as.numeric(data_table[J(fw, cp)][AGREEDTIME > ATm & AGREEDTIME < ATM, 'EA'][,lapply(.SD, sum)])
}
data_table_results = data.table(data_table, results = sapply(1:nrow(data_table), less_slow_function, data_table = data_table))

Which gives the desired numbers. My question is: is there a faster way to get the same results using data.table magic better than my naive solution?

user189035
  • 5,589
  • 13
  • 52
  • 112
  • 1
    The answer there translates to `DT[, v := DT[.(FW = FW, CP = CP, t_dn = AGREEDTIME - 3600, t_up = AGREEDTIME - 1800), on=.(FW, CP, AGREEDTIME >= t_dn, AGREEDTIME <= t_up), sum(EA, na.rm = TRUE) , by=.EACHI]$V1 ]` in your case. It's adding a new column to the original data.table rather than making a new one. There may be some better rolling way, but someone can post it over there, I figure. – Frank Apr 08 '17 at 20:03
  • 1
    @Frank: `biutiful` thanks! Upvoted the original! – user189035 Apr 08 '17 at 20:10

0 Answers0