Group by Identifier and then calculating averages based on a start and end date

Question

I have a time series data set that looks something like this:

I would like to group the ID and calculate the average value before the start date, during the time period from the start date to the end date and then after the end date.

Any help would be greatly appreciated Thank you!

"here is some code that creates an example data set in R" would be much more helpful than "my data looks something like this." Helpful hints here: https://stackoverflow.com/questions/5963269/how-to-make-a-great-r-reproducible-example — Jon Spring, Mar 15 '23 at 17:20
Can you please specify the logic for `before the start date`? For ID 45, all the rows have 'Start Date' as '2017-02-01' which implies there are no rows to average? — akrun, Mar 15 '23 at 17:20
@akrun so all of the IDs have a row from 2016-08 to some date in 2023. I would like to average the values that are specific to the rows in between the start date and end date — Al. So, Mar 15 '23 at 17:23

dufei · Accepted Answer · 2023-03-15T23:22:00.680

library(tidyverse)
library(lubridate)

# define sample data
df <- tibble::tribble(
    ~id,    ~month,       ~start,         ~end, ~value,
   "45", "2016-08", "2017-02-01", "2019-03-07",    300,
   "45", "2016-09", "2017-02-01", "2019-03-07",    345,
   "45", "2018-09", "2017-02-01", "2019-03-07",    888,
   "45", "2018-10", "2017-02-01", "2019-03-07",    232,
   "45", "2023-09", "2017-02-01", "2019-03-07",    456,
  "113", "2016-08", "2018-05-01", "2020-07-15",   1253,
  "113", "2016-09", "2018-05-01", "2020-07-15",   3423,
  "113", "2019-01", "2018-05-01", "2020-07-15",   1353,
  "113", "2019-02", "2018-05-01", "2020-07-15",   2362,
  "113", "2023-09", "2018-05-01", "2020-07-15",   2315
  )

# convert to dates
df <- df |> 
  mutate(month = ym(month),
         start = ymd(start),
         end = ymd(end))

# compute means for each period
df |> 
  mutate(period = case_when(
    month %within% interval(start, end) ~ "during",
    month < start ~ "before",
    month > end ~ "after"
  )) |> 
  summarise(
    mean = mean(value),
    .by = c(id, period)
  )
#> # A tibble: 6 × 3
#>   id    period  mean
#>   <chr> <chr>  <dbl>
#> 1 45    before  322.
#> 2 45    during  560 
#> 3 45    after   456 
#> 4 113   before 2338 
#> 5 113   during 1858.
#> 6 113   after  2315

^{Created on 2023-03-16 with reprex v2.0.2}

Alternatively, start with tidy dataframes and join them:

library(tidyverse)
library(lubridate)

# define sample data
df1 <- tibble::tribble(
   ~id,       ~start,         ~end,
  "45", "2017-02-01", "2019-03-07",
 "113", "2018-05-01", "2020-07-15"
)

df2 <- tibble::tribble(
    ~id,    ~month, ~value,
   "45", "2016-08",    300,
   "45", "2016-09",    345,
   "45", "2018-09",    888,
   "45", "2018-10",    232,
   "45", "2023-09",    456,
  "113", "2016-08",   1253,
  "113", "2016-09",   3423,
  "113", "2019-01",   1353,
  "113", "2019-02",   2362,
  "113", "2023-09",   2315
)

# merge month to intervals
during <- df1 |> 
  inner_join(df2, join_by(id, between(y$month, x$start, x$end))) |> 
  summarise(during = mean(value), .by = id)

before <- df1 |> 
  inner_join(df2, join_by(id, start > month)) |> 
  summarise(before = mean(value), .by = id)

after <- df1 |> 
  inner_join(df2, join_by(id, end < month)) |> 
  summarise(after = mean(value), .by = id)

# bind results
before |> 
  inner_join(during, join_by(id)) |> 
  inner_join(after, join_by(id))
#> # A tibble: 2 × 4
#>   id    before during after
#>   <chr>  <dbl>  <dbl> <dbl>
#> 1 45      322.   560    456
#> 2 113    2338   1858.  2315

^{Created on 2023-03-16 with reprex v2.0.2}

Group by Identifier and then calculating averages based on a start and end date

1 Answers1