-3

I have a time series data set that looks something like this:

enter image description here

I would like to group the ID and calculate the average value before the start date, during the time period from the start date to the end date and then after the end date.

Any help would be greatly appreciated Thank you!

Al. So
  • 53
  • 5
  • 1
    "here is some code that creates an example data set in R" would be much more helpful than "my data looks something like this." Helpful hints here: https://stackoverflow.com/questions/5963269/how-to-make-a-great-r-reproducible-example – Jon Spring Mar 15 '23 at 17:20
  • 1
    Can you please specify the logic for `before the start date`? For ID 45, all the rows have 'Start Date' as '2017-02-01' which implies there are no rows to average? – akrun Mar 15 '23 at 17:20
  • @akrun so all of the IDs have a row from 2016-08 to some date in 2023. I would like to average the values that are specific to the rows in between the start date and end date – Al. So Mar 15 '23 at 17:23

1 Answers1

1
library(tidyverse)
library(lubridate)

# define sample data
df <- tibble::tribble(
    ~id,    ~month,       ~start,         ~end, ~value,
   "45", "2016-08", "2017-02-01", "2019-03-07",    300,
   "45", "2016-09", "2017-02-01", "2019-03-07",    345,
   "45", "2018-09", "2017-02-01", "2019-03-07",    888,
   "45", "2018-10", "2017-02-01", "2019-03-07",    232,
   "45", "2023-09", "2017-02-01", "2019-03-07",    456,
  "113", "2016-08", "2018-05-01", "2020-07-15",   1253,
  "113", "2016-09", "2018-05-01", "2020-07-15",   3423,
  "113", "2019-01", "2018-05-01", "2020-07-15",   1353,
  "113", "2019-02", "2018-05-01", "2020-07-15",   2362,
  "113", "2023-09", "2018-05-01", "2020-07-15",   2315
  )

# convert to dates
df <- df |> 
  mutate(month = ym(month),
         start = ymd(start),
         end = ymd(end))

# compute means for each period
df |> 
  mutate(period = case_when(
    month %within% interval(start, end) ~ "during",
    month < start ~ "before",
    month > end ~ "after"
  )) |> 
  summarise(
    mean = mean(value),
    .by = c(id, period)
  )
#> # A tibble: 6 × 3
#>   id    period  mean
#>   <chr> <chr>  <dbl>
#> 1 45    before  322.
#> 2 45    during  560 
#> 3 45    after   456 
#> 4 113   before 2338 
#> 5 113   during 1858.
#> 6 113   after  2315

Created on 2023-03-16 with reprex v2.0.2

Alternatively, start with tidy dataframes and join them:

library(tidyverse)
library(lubridate)

# define sample data
df1 <- tibble::tribble(
   ~id,       ~start,         ~end,
  "45", "2017-02-01", "2019-03-07",
 "113", "2018-05-01", "2020-07-15"
)

df2 <- tibble::tribble(
    ~id,    ~month, ~value,
   "45", "2016-08",    300,
   "45", "2016-09",    345,
   "45", "2018-09",    888,
   "45", "2018-10",    232,
   "45", "2023-09",    456,
  "113", "2016-08",   1253,
  "113", "2016-09",   3423,
  "113", "2019-01",   1353,
  "113", "2019-02",   2362,
  "113", "2023-09",   2315
)

# merge month to intervals
during <- df1 |> 
  inner_join(df2, join_by(id, between(y$month, x$start, x$end))) |> 
  summarise(during = mean(value), .by = id)

before <- df1 |> 
  inner_join(df2, join_by(id, start > month)) |> 
  summarise(before = mean(value), .by = id)

after <- df1 |> 
  inner_join(df2, join_by(id, end < month)) |> 
  summarise(after = mean(value), .by = id)

# bind results
before |> 
  inner_join(during, join_by(id)) |> 
  inner_join(after, join_by(id))
#> # A tibble: 2 × 4
#>   id    before during after
#>   <chr>  <dbl>  <dbl> <dbl>
#> 1 45      322.   560    456
#> 2 113    2338   1858.  2315

Created on 2023-03-16 with reprex v2.0.2

dufei
  • 2,166
  • 1
  • 7
  • 18