1

I have the following data in a CSV:

OBJECTID,District,Zone,year_value,FROM_DATE,SUM_Crime
1,Northwestern,Not in Zone,2019,2/6/2019,4
2,Northwestern,Zone 30,2019,2/7/2019,6
3,Northwestern,Zone 40,2019,2/8/2019,5
4,Northwestern,Zone 30,2019,2/9/2019,2
5,Northwestern,Not in Zone,2019,2/10/2019,4
6,Northwestern,Zone 40,2019,2/11/2019,4
7,Northwestern,Zone 30,2019,2/12/2019,0

How would I group this every 3 days, starting on the 6th in R? The end result would look like the following (the values are the Sum_Crime over the 3 day period):

Zone,2/6/2019 - 2/8/2019,2/9/2019 - 2/11/2019,2/12/2019 - 2/13/2019

Not in Zone,4,4,
Zone 30,6,2,0
Zone 40,5,4,

However this is for only one District (northwestern), ideally it would do this by each district I have.

Thank you.

Sathish
  • 12,453
  • 3
  • 41
  • 59
user13203033
  • 123
  • 7
  • (1) You need to convert to the `Date` class in R, perhaps `as.Date(FROM_DATE, format="%m/%d/%Y")` (I'm assuming format, verify). (2) You can use something akin to `seq.Date(Sys.Date(), Sys.Date()+100, by = 3)` (using your preferred start date) to produce a vector of boundaries, then `cut` to get what you need. – r2evans Apr 02 '20 at 19:53
  • (You can control `cut(..., labels=...)` in ways suggested in the comments under https://stackoverflow.com/q/60894989) – r2evans Apr 02 '20 at 19:55

3 Answers3

1

Several ways to attack this.

First, we need to identify a 3-day period for each. To do so, I'll create a datevec which is a sequence of days.

dat$RealDate <- as.Date(dat$FROM_DATE, format = "%m/%d/%Y")
datevec <- seq(min(dat$RealDate), max(dat$RealDate) + 3, by = 3)
dat$Period1 <- cut(dat$RealDate, datevec,
                   labels = paste(datevec[-length(datevec)], datevec[-1], sep = " - "))
dat
#   OBJECTID     District        Zone year_value FROM_DATE SUM_Crime   RealDate                 Period1
# 1        1 Northwestern Not in Zone       2019  2/6/2019         4 2019-02-06 2019-02-06 - 2019-02-09
# 2        2 Northwestern     Zone 30       2019  2/7/2019         6 2019-02-07 2019-02-06 - 2019-02-09
# 3        3 Northwestern     Zone 40       2019  2/8/2019         5 2019-02-08 2019-02-06 - 2019-02-09
# 4        4 Northwestern     Zone 30       2019  2/9/2019         2 2019-02-09 2019-02-09 - 2019-02-12
# 5        5 Northwestern Not in Zone       2019 2/10/2019         4 2019-02-10 2019-02-09 - 2019-02-12
# 6        6 Northwestern     Zone 40       2019 2/11/2019         4 2019-02-11 2019-02-09 - 2019-02-12
# 7        7 Northwestern     Zone 30       2019 2/12/2019         0 2019-02-12 2019-02-12 - 2019-02-15

If you want to keep the dates formatted as you have them (which are not real Dates in R), then we can add a second column for that format:

datevec2 <- format(datevec, format = "%m/%d/%Y")
dat$Period2 <- cut(dat$RealDate, datevec,
                   labels = paste(datevec2[-length(datevec2)], datevec2[-1], sep = " - "))
dat
#   OBJECTID     District        Zone year_value FROM_DATE SUM_Crime   RealDate                 Period1                 Period2
# 1        1 Northwestern Not in Zone       2019  2/6/2019         4 2019-02-06 2019-02-06 - 2019-02-09 02/06/2019 - 02/09/2019
# 2        2 Northwestern     Zone 30       2019  2/7/2019         6 2019-02-07 2019-02-06 - 2019-02-09 02/06/2019 - 02/09/2019
# 3        3 Northwestern     Zone 40       2019  2/8/2019         5 2019-02-08 2019-02-06 - 2019-02-09 02/06/2019 - 02/09/2019
# 4        4 Northwestern     Zone 30       2019  2/9/2019         2 2019-02-09 2019-02-09 - 2019-02-12 02/09/2019 - 02/12/2019
# 5        5 Northwestern Not in Zone       2019 2/10/2019         4 2019-02-10 2019-02-09 - 2019-02-12 02/09/2019 - 02/12/2019
# 6        6 Northwestern     Zone 40       2019 2/11/2019         4 2019-02-11 2019-02-09 - 2019-02-12 02/09/2019 - 02/12/2019
# 7        7 Northwestern     Zone 30       2019 2/12/2019         0 2019-02-12 2019-02-12 - 2019-02-15 02/12/2019 - 02/15/2019

From here, base R aggregation:

ag <- aggregate(SUM_Crime ~ Zone + Period2, data = dat, FUN = sum)
ag
#          Zone                 Period2 SUM_Crime
# 1 Not in Zone 02/06/2019 - 02/09/2019         4
# 2     Zone 30 02/06/2019 - 02/09/2019         6
# 3     Zone 40 02/06/2019 - 02/09/2019         5
# 4 Not in Zone 02/09/2019 - 02/12/2019         4
# 5     Zone 30 02/09/2019 - 02/12/2019         2
# 6     Zone 40 02/09/2019 - 02/12/2019         4
# 7     Zone 30 02/12/2019 - 02/15/2019         0
xtabs( SUM_Crime ~ Zone + Period2, data = ag)
#              Period2
# Zone          02/06/2019 - 02/09/2019 02/09/2019 - 02/12/2019 02/12/2019 - 02/15/2019
#   Not in Zone                       4                       4                       0
#   Zone 30                           6                       2                       0
#   Zone 40                           5                       4                       0

You can also put this into a dplyr pipe if you'd like:

library(dplyr)
library(tidyr) # pivot_wider
dat2 <- dat %>%
  mutate(RealDate = as.Date(FROM_DATE, format = "%m/%d/%Y"))
datevec <- seq(min(dat2$RealDate), max(dat2$RealDate) + 3, by = 3)
dat2 %>%
  mutate(
    Period1 = cut(RealDate, datevec,
                  labels = paste(datevec[-length(datevec)], datevec[-1], sep = " - "))
  ) %>%
  group_by(Zone, Period1) %>%
  summarize(SUM_Crime = sum(SUM_Crime)) %>%
  ungroup() %>%
  filter(SUM_Crime > 0) %>%  
  pivot_wider(., "Zone", names_from = "Period1", values_from = "SUM_Crime")
# # A tibble: 3 x 3
#   Zone        `2019-02-06 - 2019-02-09` `2019-02-09 - 2019-02-12`
#   <chr>                           <int>                     <int>
# 1 Not in Zone                         4                         4
# 2 Zone 30                             6                         2
# 3 Zone 40                             5                         4

Data

dat <- read.csv(header = TRUE, stringsAsFactors = FALSE, text = "
OBJECTID,District,Zone,year_value,FROM_DATE,SUM_Crime
1,Northwestern,Not in Zone,2019,2/6/2019,4
2,Northwestern,Zone 30,2019,2/7/2019,6
3,Northwestern,Zone 40,2019,2/8/2019,5
4,Northwestern,Zone 30,2019,2/9/2019,2
5,Northwestern,Not in Zone,2019,2/10/2019,4
6,Northwestern,Zone 40,2019,2/11/2019,4
7,Northwestern,Zone 30,2019,2/12/2019,0")
r2evans
  • 141,215
  • 6
  • 77
  • 149
  • Thank you! This is great. Now what if to the ag <- aggregate(SUM_Crime ~ Zone + Period2, data = dat, FUN = sum) code, I wanted to break it down not just by zone, but also District? So it would have District then Zone? Additionally, how would I add another summed variable along with sum_crime? Say if I also had sum_nonCrime? – user13203033 Apr 02 '20 at 21:23
  • 1
    Have you tried `SUM_Crime ~ District + Zone + Period2`? – r2evans Apr 02 '20 at 21:54
  • does this have overlap? I want the date groupings to be mutually exclusive. – user13203033 Apr 08 '20 at 16:53
  • 1
    `cut` does not overlap. – r2evans Apr 08 '20 at 16:57
  • how would I adjust this procedure so that the Period buckets are labeled to be mutually exclusive, and also how would I flip the procedure so that the first date in Period group does not decide where RealDate is placed, but the last one does. I would like the final date in the grouping to be where the final RealDate values for that Period fall into. – user13203033 Apr 08 '20 at 17:08
  • The periods are mutually exclusive. `cut` defaults to left-open, which means that the left date in the range is not included in that range. Since the dates are effectively integral, you could change the first component of labels to be `datevec2[-length(datevec2)]-1`. I suggest you look at [`?cut`](https://stat.ethz.ch/R-manual/R-devel/library/base/html/cut.html) for ways to control left-open, right-open, and related topics. – r2evans Apr 08 '20 at 17:14
0

One approach might be to calculate Period of 3 days starting with the first date. A Date_Range can be calculated between the first and last of these 3 days. Then pivot_wider would put data into long form.

library(tidyverse)

df2 <- df %>%
  group_by(Period = rep(row_number(), length.out = n(), each = 3)) %>%
  mutate(Date_Range = paste(first(FROM_DATE), "-", last(FROM_DATE))) %>%
  pivot_wider(id_cols = c(District, Zone), names_from = Date_Range, values_from = SUM_Crime) 

Then to put into comma separated format, you can add:

cat(format_csv(df2))

Output

District,Zone,2/6/2019 - 2/8/2019,2/9/2019 - 2/11/2019,2/12/2019 - 2/12/2019
Northwestern,Not in Zone,4,4,NA
Northwestern,Zone 30,6,2,0
Northwestern,Zone 40,5,4,NA
Ben
  • 28,684
  • 5
  • 23
  • 45
0

Code:

library('data.table')
setDT(df)[, FROM_DATE := as.Date(FROM_DATE, "%m/%d/%y")]
date_seq <- seq(from = as.Date("2020-02-06"), to =max(df$FROM_DATE), by = 3)

df1 <- df[FROM_DATE >= as.Date("2020-02-06"), .(Zone, SUM_Crime, 
                                                DATE_INTERVAL = paste0(date_seq[findInterval(x = FROM_DATE, vec = date_seq)], " - ",
                                                                       date_seq[findInterval(x = FROM_DATE, vec = date_seq)]+2)), 
          by = .(District)]
dcast(df1, Zone + District ~ DATE_INTERVAL, value.var = 'SUM_Crime', fill = NA)
#           Zone     District 2020-02-06 - 2020-02-08 2020-02-09 - 2020-02-11 2020-02-12 - 2020-02-14
# 1: Not in Zone Northwestern                       4                       4                      NA
# 2:     Zone 30 Northwestern                       6                       2                       0
# 3:     Zone 40 Northwestern                       5                       4                      NA

Data:

df <- read.table(text='OBJECTID,District,Zone,year_value,FROM_DATE,SUM_Crime
1,Northwestern,Not in Zone,2019,2/6/2019,4
                 2,Northwestern,Zone 30,2019,2/7/2019,6
                 3,Northwestern,Zone 40,2019,2/8/2019,5
                 4,Northwestern,Zone 30,2019,2/9/2019,2
                 5,Northwestern,Not in Zone,2019,2/10/2019,4
                 6,Northwestern,Zone 40,2019,2/11/2019,4
                 7,Northwestern,Zone 30,2019,2/12/2019,0', header = TRUE, stringsAsFactors = FALSE, sep = ",")
Sathish
  • 12,453
  • 3
  • 41
  • 59