R faster function to repeatedly subset data

Question

I have two data.tables entext.new and transit.service1 (posted as data frames below).

> dput(data.frame(entext.new))
structure(list(person = c(1701L, 1701L), vehicle = c("tr_71_3", 
"tr_5_7"), atstop = c(108557L, 108536L), time1 = c(31931, 32560
), acttype1 = structure(c(1L, 1L), .Label = c("PersonEntersVehicle", 
"PersonLeavesVehicle", "waitingForPt"), class = "factor"), person2 = c(1701L, 
1701L), vehicle2 = c("tr_71_3", "tr_5_7"), deststop = c(108558L, 
100905L), time2 = c(31998, 32620), acttype2 = structure(c(2L, 
2L), .Label = c("PersonEntersVehicle", "PersonLeavesVehicle", 
"waitingForPt"), class = "factor")), .Names = c("person", "vehicle", 
"atstop", "time1", "acttype1", "person2", "vehicle2", "deststop", 
"time2", "acttype2"), row.names = c(NA, -2L), class = "data.frame")

> dput(data.frame(entext.new))
structure(list(person = c(1701L, 1701L), vehicle = c("tr_71_3", 
"tr_5_7"), atstop = c(108557L, 108536L), time1 = c(31931, 32560
), acttype1 = structure(c(1L, 1L), .Label = c("PersonEntersVehicle", 
"PersonLeavesVehicle", "waitingForPt"), class = "factor"), person2 = c(1701L, 
1701L), vehicle2 = c("tr_71_3", "tr_5_7"), deststop = c(108558L, 
100905L), time2 = c(31998, 32620), acttype2 = structure(c(2L, 
2L), .Label = c("PersonEntersVehicle", "PersonLeavesVehicle", 
"waitingForPt"), class = "factor")), .Names = c("person", "vehicle", 
"atstop", "time1", "acttype1", "person2", "vehicle2", "deststop", 
"time2", "acttype2"), row.names = c(NA, -2L), class = "data.frame")
> dput(data.frame(transit.service1))
structure(list(id = c(725531L, 725532L, 726871L, 728273L, 728274L, 
728825L, 728826L, 729489L, 729490L, 730106L, 730109L, 730315L, 
730316L, 732297L, 732298L, 734989L, 734990L, 735945L, 735948L, 
736878L, 736879L, 737807L, 737808L, 737834L, 737835L, 738292L, 
738293L, 738314L, 738315L, 739275L, 739276L, 740407L, 740408L, 
741248L, 741249L, 700159L, 700160L, 700244L, 700245L, 700292L, 
700490L, 700526L, 700527L, 702052L, 702053L, 702725L, 702726L, 
702812L, 702815L, 702872L, 702991L), vehicle = c("tr_5_7", "tr_5_7", 
"tr_5_7", "tr_5_7", "tr_5_7", "tr_5_7", "tr_5_7", "tr_5_7", "tr_5_7", 
"tr_5_7", "tr_5_7", "tr_5_7", "tr_5_7", "tr_5_7", "tr_5_7", "tr_5_7", 
"tr_5_7", "tr_5_7", "tr_5_7", "tr_5_7", "tr_5_7", "tr_5_7", "tr_5_7", 
"tr_5_7", "tr_5_7", "tr_5_7", "tr_5_7", "tr_5_7", "tr_5_7", "tr_5_7", 
"tr_5_7", "tr_5_7", "tr_5_7", "tr_5_7", "tr_5_7", "tr_71_3", 
"tr_71_3", "tr_71_3", "tr_71_3", "tr_71_3", "tr_71_3", "tr_71_3", 
"tr_71_3", "tr_71_3", "tr_71_3", "tr_71_3", "tr_71_3", "tr_71_3", 
"tr_71_3", "tr_71_3", "tr_71_3"), time = c(32542, 32542, 32563, 
32584, 32584, 32594, 32594, 32604, 32604, 32613, 32613, 32617, 
32617, 32648, 32648, 32691, 32691, 32706, 32706, 32721, 32721, 
32736, 32736, 32737, 32737, 32744, 32744, 32745, 32745, 32760, 
32760, 32778, 32778, 32793, 32793, 31927, 31927, 31929, 31929, 
31930, 31935, 31936, 31936, 31977, 31977, 31994, 31994, 31996, 
31996, 31997, 32000), link = c(200016105L, NA, NA, 200016105L, 
61056124L, 61056124L, 61246144L, 61246144L, 61446158L, NA, NA, 
61446158L, 61589049L, 61589049L, 90496198L, 90496198L, 61986249L, 
NA, NA, 61986249L, 62496295L, NA, NA, 62496295L, 62956316L, NA, 
NA, 62956316L, 63166350L, NA, NA, 63166350L, 63506404L, 63506404L, 
64046472L, 61176131L, 613120013L, 613120013L, 200136131L, NA, 
NA, 200136131L, 61316194L, 61316194L, 61946230L, 61946230L, 623020014L, 
623020014L, 200146230L, NA, NA), facility = c(NA, 108536L, 108536L, 
NA, NA, NA, NA, NA, NA, 100905L, 100905L, NA, NA, NA, NA, NA, 
NA, 100979L, 100979L, NA, NA, 101017L, 101017L, NA, NA, 101075L, 
101075L, NA, NA, 101098L, 101098L, NA, NA, NA, NA, NA, NA, NA, 
NA, 108557L, 108557L, NA, NA, NA, NA, NA, NA, NA, NA, 108558L, 
108558L), acttype = structure(c(3L, 4L, 5L, 2L, 1L, 2L, 1L, 2L, 
1L, 4L, 5L, 2L, 1L, 2L, 1L, 2L, 1L, 4L, 5L, 2L, 1L, 4L, 5L, 2L, 
1L, 4L, 5L, 2L, 1L, 4L, 5L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 4L, 
5L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 4L, 5L), .Label = c("entered link", 
"left link", "vehicle enters traffic", "VehicleArrivesAtFacility", 
"VehicleDepartsAtFacility"), class = "factor")), .Names = c("id", 
"vehicle", "time", "link", "facility", "acttype"), row.names = c(NA, 
-51L), class = "data.frame")

I need to subset transit.service1 based on some criteria from entext.new. For this, I wrote a small function like below. But, this function is operating pretty slow.

func_subset1 <- function(x,y,z,k) {
  list(cbind(x, transit.service1[vehicle==y & time>=z & time<=k]))
}
list1 <- mapply(func_subset1, entext.new$person, entext.new$vehicle, entext.new$time1, entext.new$time2)
df.final <- do.call(rbind.data.frame, list1)

I have tried to profile the code like so (although I do not fully understand it):

> summaryRprof(tmp)
$by.self
      self.time self.pct total.time total.pct
"cat"      0.02      100       0.02       100

$by.total
                    total.time total.pct self.time self.pct
"cat"                     0.02       100      0.02      100
".rs.valueContents"       0.02       100      0.00        0
".rs.valueFromStr"        0.02       100      0.00        0
".rs.withTimeLimit"       0.02       100      0.00        0
"<Anonymous>"             0.02       100      0.00        0
"capture.output"          0.02       100      0.00        0
"do.call"                 0.02       100      0.00        0
"doTryCatch"              0.02       100      0.00        0
"eval"                    0.02       100      0.00        0
"evalVis"                 0.02       100      0.00        0
"NextMethod"              0.02       100      0.00        0
"str"                     0.02       100      0.00        0
"str.data.frame"          0.02       100      0.00        0
"str.default"             0.02       100      0.00        0
"strSub"                  0.02       100      0.00        0
"try"                     0.02       100      0.00        0
"tryCatch"                0.02       100      0.00        0
"tryCatchList"            0.02       100      0.00        0
"tryCatchOne"             0.02       100      0.00        0
"withVisible"             0.02       100      0.00        0

$sample.interval
[1] 0.02

$sampling.time
[1] 0.02

Do you have any suggestions to speed up this subsetting function?

Please provide a reproducible example in the question so people dont need to download stuff from your dropbox — talat, Jun 08 '17 at 12:25
see second answer here but it seems you won't get much better : https://stackoverflow.com/questions/19053177/r-data-table-join-with-inequality-conditions — moodymudskipper, Jun 08 '17 at 14:56
maybe looping through restrictions if you don't have so many distinct values ? Here you're filtering each time, you probably don't have a thousand vehicles for example, so you could subset vehicle by vehicle — moodymudskipper, Jun 08 '17 at 14:59
@docendodiscimus Thanks. I updated the question with a reproducible example. — dataanalyst, Jun 08 '17 at 19:47
@Moody_Mudskipper About your first comment, are you referring to the use of setkey? If yes, I have already keyed the transit.services1 data table. After your comment, I benchmarked two functions one working on keyed table and another on unkeyed table. The function with unkeyed table appears to be performing very slightly better compared to the one with keyed table. About your second comment, I will try to implement that and see how it works. — dataanalyst, Jun 08 '17 at 20:31
@Moody_Mudskipper I tested with less restrictions (by pre-filtering vehicle). This seems to improve the performance a little bit. But, unfortunately I have about 2200 vehicles. So, I probably do not want to go that route. — dataanalyst, Jun 09 '17 at 09:05
maybe sort your data.frames by time and time1, then remove obsolete rows in transit.service as you progress, the script will always find the relevant rows in the top, thus it may be faster — moodymudskipper, Jun 09 '17 at 09:14

R faster function to repeatedly subset data

0 Answers0