5

How does one extract the minutes from a time interval so that the number of minutes from each date is returned?

I have a dataframe with the case ID, start & stop times, and the total elapsed time interval in minutes. I need to figure out how many minutes were on each date for each case.

This is somewhat similar to this question but I was unable to transfer that approach to this situation.

The original data set is structured like this, however the final solution will need to scale up to allow many more cases (see alternative dataset below).

original data

>so
  Case               Start                Stop   Minutes
1    A 2013-01-10 18:00:00 2013-01-10 20:30:00  150 mins
2    B 2013-01-15 22:45:00 2013-01-16 00:15:00   90 mins
3    C 2013-01-20 22:00:00 2013-01-22 02:00:00 1680 mins
4    D 2013-01-27 12:00:00 2013-01-28 06:00:00 1080 mins
5    E 2013-01-01 00:00:00 2013-01-01 02:00:00  120 mins
6    F 2013-01-02 08:00:00 2013-01-03 07:00:00 1380 mins

desired output

  Case       Date Minutes
1    A 2013-01-10     150
2    B 2013-01-15      75
3    B 2013-01-16      15
4    C 2013-01-20     120
5    C 2013-01-21    1440
6    C 2013-01-22     120
7    D 2013-01-27     720
8    D 2013-01-28     360
9    E 2013-01-01     120
10   F 2013-01-02     960 
11   F 2013-01-03     420

DPUT of original data

> dput(so)  
structure(list(  
Case = structure(1:5, .Label = c("A", "B", "C", "D", "E"), class = "factor"), 
Start = structure(c(2L, 3L, 4L, 5L, 1L), .Label = c("2013-01-01 00:00:00", "2013-01-10 18:00:00", "2013-01-15 22:45:00", "2013-01-20 22:00:00", "2013-01-27 12:00:00"), class = "factor"),  
Stop = structure(c(2L, 3L, 4L, 5L, 1L), .Label = c("2013-01-01 02:00:00", "2013-01-10 20:30:00", "2013-01-16 00:15:00", "2013-01-22 02:00:00", "2013-01-28 06:00:00"), class = "factor"), 
Minutes = structure(c(150, 90, 1680, 1080, 120), tzone = "", units = "mins", class = "difftime")
), 
.Names = c("Case","Start", "Stop", "Minutes"), 
row.names = c(NA, -5L), 
class = "data.frame")  

DPUT of alternative larger data set

> dput(so_alt)
structure(list(Case = structure(c(1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 1L, 2L), .Label = c("A", "B", "C", "D", "E", "F", "G", "H"), class = "factor"),     Start = structure(c(12L, 13L, 4L, 5L, 19L, 20L, 21L, 30L, 22L, 23L, 24L, 25L, 26L, 27L, 28L, 29L, 9L, 6L, 7L, 8L, 3L, 10L, 11L, 14L, 15L, 16L), .Label = c("2012-10-25 18:10:00", "2012-11-01 20:18:00", "2012-11-16 17:45:00", "2012-12-06 01:00:00", "2012-12-06 08:00:00", "2012-12-26 13:15:00", "2012-12-29 19:50:00", "2012-12-30 00:00:00", "2013-01-01 01:46:00", "2013-01-10 20:15:00", "2013-01-11 09:00:00", "2013-01-29 17:00:00", "2013-02-05 21:30:00", "2013-02-21 01:50:00", "2013-02-21 09:25:00", "2013-02-21 12:20:00", "2013-02-22 21:45:00", "2013-02-24 15:30:00", "2013-03-01 10:10:00", "2013-03-06 16:15:00", "2013-03-07 20:00:00", "2013-03-12 21:00:00", "2013-03-13 05:15:00", "2013-03-14 00:45:00", "2013-03-14 11:30:00", "2013-03-15 21:00:00", "2013-03-16 08:15:00", "2013-03-17 06:45:00", "2013-03-18 18:04:00", "2013-03-21 21:40:00"), class = "factor"), Stop = structure(c(12L, 13L, 4L, 5L, 19L, 20L, 21L, 30L, 22L, 23L, 24L, 25L, 26L, 27L, 28L, 29L, 9L, 6L, 7L, 8L, 3L, 10L, 11L, 14L, 15L, 16L), .Label = c("2012-10-25 21:30:00", "2012-11-03 08:00:00", "2012-11-17 08:45:00", "2012-12-06 05:00:00", "2012-12-06 09:30:00", "2012-12-29 19:50:00", "2012-12-30 00:00:00", "2012-12-31 20:00:00", "2013-01-02 06:00:00", "2013-01-10 22:13:00", "2013-01-12 03:30:00", "2013-01-30 07:45:00", "2013-02-06 07:30:00", "2013-02-21 09:25:00", "2013-02-21 12:20:00", "2013-02-22 11:30:00", "2013-02-24 10:15:00", "2013-02-24 16:45:00", "2013-03-06 07:45:00", "2013-03-07 07:30:00", "2013-03-08 08:00:00", "2013-03-13 05:00:00", "2013-03-14 00:45:00", "2013-03-14 10:15:00", "2013-03-14 19:00:00", "2013-03-15 22:36:00", "2013-03-16 19:00:00", "2013-03-18 15:15:00", "2013-03-18 22:00:00", "2013-03-22 08:30:00"), class = "factor"), Minutes = structure(c(885, 600, 240, 90, 7055, 915, 720, 650, 480, 1170, 570, 450, 96, 645, 1950, 236, 1694, 4715, 250, 2640, 900, 118, 1110, 455, 175, 1390), class = "difftime", units = "mins")), .Names = c("Case", "Start", "Stop", "Minutes"), row.names = c(NA, 26L), class = "data.frame")  
>  
Community
  • 1
  • 1
Rollie
  • 145
  • 1
  • 5

2 Answers2

2

Here is a way.

library(plyr)

ddply(ddply(so, 1, function(x) {
    d1 <- as.Date(x$Start)
    d2 <- as.Date(x$Stop)
    if (d1 == d2) data.frame(date = d1, min = x$Minutes)
    else {
        ret <- rbind(data.frame(date=d1, min = difftime(as.POSIXct(paste(d1+1, "00:00:00")), as.POSIXct(x$Start), units = "mins")), 
        data.frame(date=d2, min=difftime(as.POSIXct(x$Stop), as.POSIXct(paste(d2, "00:00:00")), units = "mins")))
        if (d2-d1>1) {
            ret <- rbind(ret, data.frame(date = seq(d1+1, d2-1, by = "day"), min = 60*24))
        }
        ret
    }
}), .(Case, date), summarise, min = sum(min))

Output is:

  Case       date       min
1    A 2013-01-10  150 mins
2    B 2013-01-15   75 mins
3    B 2013-01-16   15 mins
4    C 2013-01-20  120 mins
5    C 2013-01-21 1440 mins
6    C 2013-01-22  120 mins

The final version here:

ddply(ddply(data.frame(.id = 1:nrow(so_alt), so_alt), .(.id, Case), function(x) {
    d1 <- as.Date(x$Start)
    d2 <- as.Date(x$Stop)
    if (d1 == d2) data.frame(date = d1, min = x$Minutes)
    else {
        ret <- rbind(data.frame(date=d1, min = difftime(as.POSIXct(paste(d1+1, "00:00:00")), as.POSIXct(x$Start), units = "mins")), 
        data.frame(date=d2, min=difftime(as.POSIXct(x$Stop), as.POSIXct(paste(d2, "00:00:00")), units = "mins")))
        if (d2-d1>1) {
            ret <- rbind(ret, data.frame(date = seq(d1+1, d2-1, by = "day"), min = 60*24))
        }
        ret
    }
}), .(Case, date), summarise, min = sum(min))

and output:

   Case       date       min
1     A 2013-01-01 1334 mins
2     A 2013-01-02  360 mins
3     A 2013-01-29  420 mins
4     A 2013-01-30  465 mins
5     A 2013-02-21  175 mins
6     A 2013-03-12  180 mins
7     A 2013-03-13  300 mins
8     B 2012-12-26  645 mins
9     B 2012-12-27 1440 mins
10    B 2012-12-28 1440 mins
11    B 2012-12-29 1190 mins
12    B 2013-02-05  150 mins
13    B 2013-02-06  450 mins
14    B 2013-02-21  700 mins
15    B 2013-02-22  690 mins
16    B 2013-03-13 1125 mins
17    B 2013-03-14   45 mins
18    C 2012-12-06  240 mins
19    C 2012-12-29  250 mins
20    C 2012-12-30    0 mins
21    C 2013-03-14  570 mins
22    D 2012-12-06   90 mins
23    D 2012-12-30 1440 mins
24    D 2012-12-31 1200 mins
25    D 2013-03-14  450 mins
26    E 2012-11-16  375 mins
27    E 2012-11-17  525 mins
28    E 2013-03-01  830 mins
29    E 2013-03-02 1440 mins
30    E 2013-03-03 1440 mins
31    E 2013-03-04 1440 mins
32    E 2013-03-05 1440 mins
33    E 2013-03-06  465 mins
34    E 2013-03-15   96 mins
35    F 2013-01-10  118 mins
36    F 2013-03-06  465 mins
37    F 2013-03-07  450 mins
38    F 2013-03-16  645 mins
39    G 2013-01-11  900 mins
40    G 2013-01-12  210 mins
41    G 2013-03-07  240 mins
42    G 2013-03-08  480 mins
43    G 2013-03-17 1035 mins
44    G 2013-03-18  915 mins
45    H 2013-02-21  455 mins
46    H 2013-03-18  236 mins
47    H 2013-03-21  140 mins
48    H 2013-03-22  510 mins
kohske
  • 65,572
  • 8
  • 165
  • 155
  • This works great. However, this version does not work when there are more than 4 cases in the 'so' dataframe. Can you please revise this to scale up to allow more cases? – Rollie Jun 30 '13 at 16:39
  • Thanks. That helped but it still does not work for larger data sets. I've added an alternative larger dataset for an example. – Rollie Jul 01 '13 at 04:12
  • @Rollie It looks working with `so_alt`. Can you put the output or error messages? – kohske Jul 01 '13 at 04:17
  • The issue is that my real-world dataset has more events per unique case than the examples I posted here. I've updated the alternative so_alt data set, which produces this error: `Error in seq.Date(d1 + 1, d2 - 1, by = "day") : 'from' must be of length 1 In addition: Warning messages: ......` – Rollie Jul 02 '13 at 20:57
  • Perfect. Thanks for your persistence and patience. I really appreciate the expertise. – Rollie Jul 03 '13 at 02:23
1

Here a data.table solution. The idea is quite simple(maybe not very efficient): Create a vector of minutes for each Case, transform it to a vector of days and compute the frequency using table.

DT <- as.data.table(dat)
DT[,list(seqD= seq(as.Date(Start),as.Date(Stop),'day'),
         duration= { 
           seq.minus <- seq(as.POSIXct(Start),
                                   as.POSIXct(Stop),'min')[-1]
           as.data.frame(table(format(seq.minus,'%Y-%m-%d')))$Freq
           }),
   by = list(Case)]

   Case       seqD duration
1:    A 2013-01-10      150
2:    B 2013-01-15       74
3:    B 2013-01-16       16
4:    C 2013-01-20      119
5:    C 2013-01-21     1440
6:    C 2013-01-22      121
agstudy
  • 119,832
  • 17
  • 199
  • 261