3

In a dataset in which I have an activity, start and stop times, and an id, I want to merge values of two rows in the same column and update other columns if several conditions apply. First a data example:

library(data.table)
DT <- data.table(person=c(1,1,1,1,2,2,2,2,2,3,3,3,3),
             activity=c("grab", "walk", "remove", "delete", "run", "talk", "walk", "remove",
                        "grab", "walk", "delete", "talk", "remove"),
             start_time=c(0,1,3,6,0,2,2,3,3,3,6,6,7), stop_time=c(1,3,5,7,1,4,4,8,4,5,7,7,8))
 DT

I want to update start and stop times and merge column 'activity' for each person if:

  • activities are conducted in parallel. Specifically, if the start_time of the following activity is before the stop_time of the preceeding activity activity for that person. Or:
  • If either the start or stop times of activities by one person are identical.

The updated row should reflect the start and stop times of the combined activity, and all rows, except the one updated should be removed. Below is the goal I want to achieve with the data sample I provided:

DT.goal <- data.table(person=c(1,1,2,2,3,3),
                  activity=c("grab + walk + remove", "delete", "run", "talk + walk + grab + remove",
                             "walk", "delete + talk + remove"),
                  start_time=c(0,6,0,2,3,6), stop_time=c(5,7,1,8,5,8))
DT.goal

So far I've come up with the following incomplete attempt:

DT.test <- DT[start_time <= shift(stop_time, 1L, type="lag"), 
          cond := T, by=person]
DT.test <- DT.test[cond==T, 
          new_activity := paste(activity, shift(activity, 1L, type="lag")), by=person]
DT.test <- DT.test[, new_start := start_time, by=person][cond==T, new_start := min(start_time), by=person]
DT.test <- DT.test[, new_stop := stop_time, by=person][cond==T, new_stop := max(stop_time), by=person]

However, using the shift(, type="lag) is not very useful for the first row for each person, as it has no previous row to look at. Also, paste() pastes NA if the condition does not evaluate to TRUE.

Can anyone help me on my way?

mivandev
  • 321
  • 2
  • 14

4 Answers4

2

Please check the following:

library(data.table)
DT <- data.table(person=c(1,1,1,1,2,2,2,2,2,3,3,3,3),
                 activity=c("grab", "walk", "remove", "delete", "run", "talk", "walk", "remove",
                            "grab", "walk", "delete", "talk", "remove"),
                 start_time=c(0,1,3,6,0,2,2,3,3,3,6,6,7), stop_time=c(1,3,5,7,1,4,4,8,4,5,7,7,8))

setorder(DT, person, start_time)
DT[, concatenate := start_time %in% stop_time | stop_time %in% start_time | duplicated(start_time) | duplicated(start_time, fromLast=TRUE), by = "person"]
DT[, concatenate_grp := rleid(concatenate), by = "person"]
DT[, paste(activity, collapse = " + "), by = c("person", "concatenate_grp")]
DT.goal <- DT[, .(activity = paste(activity, collapse = " + "), start_time = min(start_time), stop_time = max(stop_time)), by = c("person", "concatenate_grp")][, concatenate_grp := NULL]

Which results in:

   person                    activity start_time stop_time
1:      1        grab + walk + remove          0         5
2:      1                      delete          6         7
3:      2                         run          0         1
4:      2 talk + walk + remove + grab          2         8
5:      3                        walk          3         5
6:      3      delete + talk + remove          6         8
ismirsehregal
  • 30,045
  • 5
  • 31
  • 78
2
setorder(DT, person, stop_time)
DT[, 
   break_here := start_time > shift(stop_time, 1, stop_time[1]) , 
   by = person
   ][, 
     .(activity = paste(activity, collapse = " + "), start_time = start_time[1], stop_time = stop_time[.N]), 
     keyby = .(person, helper_var = cumsum(break_here))
     ][, !"helper_var"]


   person                    activity start_time stop_time
1:      1        grab + walk + remove          0         5
2:      1                      delete          6         7
3:      2                         run          0         1
4:      2 talk + walk + grab + remove          2         8
5:      3                        walk          3         5
6:      3      delete + talk + remove          6         8
s_baldur
  • 29,441
  • 4
  • 36
  • 69
2

Another option by borrowing the idea from David Aurenburg's solution from here

setorder(DT, person, start_time, stop_time)
DT[, g := c(0L, cumsum(shift(start_time, -1L) > cummax(stop_time))[-.N]), person]
DT[, .(activity=paste(activity, collapse=" + "), 
        start_time=min(start_time), stop_time=max(stop_time)), 
    .(person, g)]

output:

   person g                    activity start_time stop_time
1:      1 0        grab + walk + remove          0         5
2:      1 1                      delete          6         7
3:      2 0                         run          0         1
4:      2 1 talk + walk + grab + remove          2         8
5:      3 0                        walk          3         5
6:      3 1      delete + talk + remove          6         8
chinsoon12
  • 25,005
  • 4
  • 25
  • 35
  • This works in the example data I provided, but given that my actual start and stop times are POSIXt, the following error is shown: ```'cummax' not defined for "POSIXt" objects``` – mivandev Oct 22 '19 at 07:44
  • 1
    maybe `as.numeric` might help? – chinsoon12 Oct 22 '19 at 07:46
1

Using dplyr, we arrange the data by person and start and stop time. We create a group where the time overlaps for each person and select the first start_time and last stop_time in each group and concatenate all the activities in each group.

library(dplyr)

DT %>%
  arrange(person, start_time, stop_time) %>%
  group_by(person, group = cumsum(start_time > 
                lag(stop_time, default = first(stop_time)))) %>%
  summarise(start_time = first(start_time), 
            stop_time = last(stop_time), 
            activity = paste(activity, collapse = " + ")) %>%
  select(-group)

#  person start_time stop_time activity                   
#   <dbl>      <dbl>     <dbl> <chr>                      
#1      1          0         5 grab + walk + remove       
#2      1          6         7 delete                     
#3      2          0         1 run                        
#4      2          2         8 talk + walk + grab + remove
#5      3          3         5 walk                       
#6      3          6         8 delete + talk + remove     
Ronak Shah
  • 377,200
  • 20
  • 156
  • 213