How to compact block-duplicates of variables from a date-ordered dataset into a wide-table in R?

Question

I have the following data

structure(list(station = c("61R002", "61R002", "61R002", "61R002", 
"61R002", "61R002", "61R002", "61R002", "61R002", "61R002", "61R002", 
"61R002", "61R002", "61R002", "61R002", "61R002", "61R002", "61R002", 
"61R002", "61R002", "61R002", "61R002", "61R002", "61R002", "61R002", 
"61R002", "61R002", "61R002", "61R002", "61R002", "61R002", "61R002", 
"61R002", "61R002", "61R002", "61R002", "61R002", "61R002", "61R002", 
"61R002", "61R002", "61R002", "61R002", "61R002", "61R002", "61R002", 
"61R002", "61R002", "61R002", "61R002", "61R002", "61R002", "61R002", 
"61R002", "61R002", "61R002", "61R002", "61R002", "61R002", "61R002", 
"61R002", "61R002", "61R002", "61R002", "61R002", "61R002", "61R002", 
"61R002", "61R002", "61R002", "61R002", "61R002", "61R002", "61R002", 
"61R002", "61R002", "61R002", "61R002", "61R002", "61R002", "61R002", 
"61R002", "61R002", "61R002", "61R002", "61R002", "61R002", "61R002", 
"61R002", "61R002", "61R002", "61R002", "61R002", "61R002", "61R002", 
"61R002", "61R002", "61R002"), pollutant = c(17201L, 17201L, 
17201L, 17201L, 17201L, 17201L, 17201L, 17201L, 17201L, 17201L, 
17201L, 17201L, 17201L, 17201L, 17201L, 17201L, 17201L, 17201L, 
17201L, 17201L, 17201L, 17201L, 17201L, 17201L, 17201L, 17201L, 
17201L, 17201L, 17201L, 17201L, 17201L, 17201L, 17201L, 17201L, 
17201L, 17201L, 17201L, 17201L, 17201L, 17201L, 17201L, 17201L, 
17201L, 17201L, 17201L, 17201L, 17201L, 17201L, 17201L, 17204L, 
17204L, 17204L, 17204L, 17204L, 17204L, 17204L, 17204L, 17204L, 
17204L, 17204L, 17204L, 17204L, 17204L, 17204L, 17204L, 17204L, 
17204L, 17204L, 17204L, 17204L, 17204L, 17204L, 17204L, 17204L, 
17204L, 17204L, 17204L, 17204L, 17204L, 17204L, 17204L, 17204L, 
17204L, 17204L, 17204L, 17204L, 17204L, 17204L, 17204L, 17204L, 
17204L, 17204L, 17204L, 17204L, 17204L, 17204L, 17204L, 17204L
), tag = c("002", "002", "002", "002", "002", "002", "002", "002", 
"002", "002", "002", "002", "002", "002", "002", "002", "002", 
"002", "002", "002", "002", "002", "002", "002", "002", "002", 
"002", "002", "002", "002", "002", "002", "002", "002", "002", 
"002", "002", "002", "002", "002", "002", "002", "002", "002", 
"002", "002", "002", "002", "002", "002", "002", "002", "002", 
"002", "002", "002", "002", "002", "002", "002", "002", "002", 
"002", "002", "002", "002", "002", "002", "002", "002", "002", 
"002", "002", "002", "002", "002", "002", "002", "002", "002", 
"002", "002", "002", "002", "002", "002", "002", "002", "002", 
"002", "002", "002", "002", "002", "002", "002", "002", "002"
), concentration = c(NA, 0.41, 0.41, 0.41, 0.41, 0.41, 0.41, 
0.42, 0.42, 0.42, 0.42, 0.42, 0.42, 0.42, 0.39, 0.39, 0.39, 0.39, 
0.39, 0.39, 0.39, 0.46, 0.46, 0.46, 0.46, 0.46, 0.46, 0.46, 0.33, 
0.33, 0.33, 0.33, 0.33, 0.33, 0.33, 0.4, 0.4, 0.4, 0.4, 0.4, 
0.4, 0.4, 0.38, 0.38, 0.38, 0.38, 0.38, 0.38, 0.38, NA, 0.38, 
0.38, 0.38, 0.38, 0.38, 0.38, 0.38, 0.38, 0.38, 0.38, 0.38, 0.38, 
0.38, 0.37, 0.37, 0.37, 0.37, 0.37, 0.37, 0.37, 0.38, 0.38, 0.38, 
0.38, 0.38, 0.38, 0.38, 0.31, 0.31, 0.31, 0.31, 0.31, 0.31, 0.31, 
0.33, 0.33, 0.33, 0.33, 0.33, 0.33, 0.33, 0.38, 0.38, 0.38, 0.38, 
0.38, 0.38, 0.38), date = structure(c(1514764800, 1514851200, 
1514937600, 1515024000, 1515110400, 1515196800, 1515283200, 1515369600, 
1515456000, 1515542400, 1515628800, 1515715200, 1515801600, 1515888000, 
1515974400, 1516060800, 1516147200, 1516233600, 1516320000, 1516406400, 
1516492800, 1516579200, 1516665600, 1516752000, 1516838400, 1516924800, 
1517011200, 1517097600, 1517184000, 1517270400, 1517356800, 1517443200, 
1517529600, 1517616000, 1517702400, 1517788800, 1517875200, 1517961600, 
1518048000, 1518134400, 1518220800, 1518307200, 1518393600, 1518480000, 
1518566400, 1518652800, 1518739200, 1518825600, 1518912000, 1514764800, 
1514851200, 1514937600, 1515024000, 1515110400, 1515196800, 1515283200, 
1515369600, 1515456000, 1515542400, 1515628800, 1515715200, 1515801600, 
1515888000, 1515974400, 1516060800, 1516147200, 1516233600, 1516320000, 
1516406400, 1516492800, 1516579200, 1516665600, 1516752000, 1516838400, 
1516924800, 1517011200, 1517097600, 1517184000, 1517270400, 1517356800, 
1517443200, 1517529600, 1517616000, 1517702400, 1517788800, 1517875200, 
1517961600, 1518048000, 1518134400, 1518220800, 1518307200, 1518393600, 
1518480000, 1518566400, 1518652800, 1518739200, 1518825600, 1518912000
), tzone = "UTC", class = c("POSIXct", "POSIXt"))), row.names = c(NA, 
-98L), class = c("tbl_df", "tbl", "data.frame"))

I would like to transform it as

tag   station start_date end_date   `17201` `17204`
<chr> <chr>   <date>     <date>       <dbl>   <dbl>
002   61R002  2018-01-02 2018-01-07    0.41    0.38
002   61R002  2018-01-08 2018-01-14    0.42    0.38
002   61R002  2018-01-15 2018-01-21    0.39    0.37
002   61R002  2018-01-22 2018-01-28    0.46    0.38
002   61R002  2018-01-29 2018-02-04    0.33    0.31
002   61R002  2018-02-05 2018-02-11    0.4     0.33
002   61R002  2018-02-12 2018-02-18    0.38    0.38

i.e. into non-overlapping date intervals per station.

How can I achieve this (with dplyr & pipe operator, for example) ?

Note that station & pollutant variables can take more values and interval between start_date & end_date are not fixed.

A previous question (and answer) allowed me to reach

station pollutant tag   concentration start_date          end_date           
<chr>       <int> <chr>         <dbl> <dttm>              <dttm>             
61R002      17201 002            0.41 2018-01-02 00:00:00 2018-01-07 00:00:00
61R002      17201 002            0.42 2018-01-08 00:00:00 2018-01-14 00:00:00
61R002      17201 002            0.39 2018-01-15 00:00:00 2018-01-21 00:00:00
61R002      17201 002            0.46 2018-01-22 00:00:00 2018-01-28 00:00:00
61R002      17201 002            0.33 2018-01-29 00:00:00 2018-02-04 00:00:00
61R002      17201 002            0.4  2018-02-05 00:00:00 2018-02-11 00:00:00
61R002      17201 002            0.38 2018-02-12 00:00:00 2018-02-18 00:00:00
61R002      17204 002            0.38 2018-01-02 00:00:00 2018-01-14 00:00:00
61R002      17204 002            0.37 2018-01-15 00:00:00 2018-01-21 00:00:00
61R002      17204 002            0.38 2018-01-22 00:00:00 2018-01-28 00:00:00
61R002      17204 002            0.31 2018-01-29 00:00:00 2018-02-04 00:00:00
61R002      17204 002            0.33 2018-02-05 00:00:00 2018-02-11 00:00:00
61R002      17204 002            0.38 2018-02-12 00:00:00 2018-02-18 00:00:00

Many thanks.

What is the logic of breaking `2018-01-02` to `2018-01-14` into 2 rows (`2018-01-02` to `2018-01-07` and `2018-01-08` to `2018-01-14` ? In general, when should a row be divided into more number of rows and how should it be divided? — Ronak Shah, Jul 29 '21 at 01:44
It comes from the fact that the data is based on more or less weekly measurements (but the period is not fixed). My reduced dataset only considers 2 pollutants, but the real case scenario has 8. So the breaking of the rows should be made as the shortest time periods found in the data. I do not know if I am clear... — Alessandro, Jul 29 '21 at 12:30

Alessandro · Accepted Answer · 2021-08-04T10:06:22.470

The solution is inspired by combining this answer and this answer.

data_new <- data %>%
  arrange(station, pollutant, date) %>%
  group_by(tag, station, pollutant, grp = rleid(concentration)) %>%
  summarise(concentration = first(concentration), start_date = min(date), end_date = max(date), .groups = 'drop') %>%
  select(-grp) %>%
  mutate(date_range = interval(start_date, end_date))
  xls <- NULL
for (station in unique(data$station))
{
  tmp <- data_new %>%
    filter(station == !!station)

  all_dates <- tmp %>%
    select(start_date, end_date, concentration) %>%
    pivot_longer(!concentration, names_to = "date_type", values_to="date") %>%
    arrange(date) %>%
    select(date) %>%
    distinct()

  tmp2 <- tmp %>%
    rowwise() %>%
    mutate(bounded_dates = list(filter(all_dates, all_dates$date %within% date_range) %>% pull(date)),
           bounded_intervals = list(int_diff(bounded_dates))) %>%
    select(tag, station, pollutant, concentration, bounded_intervals) %>%
    unnest(bounded_intervals) %>%
    filter(bounded_intervals != as.interval(days(1), start = int_start(bounded_intervals))) %>%
    mutate(start_date = as_date(int_start(bounded_intervals)),
           end_date = as_date(int_end(bounded_intervals))) %>%
    pivot_wider(id_cols = -bounded_intervals, names_from = "pollutant", values_from = "concentration")

  xls <- bind_rows(xls, tmp2)
}

gives me the desired output

tag   station start_date end_date   `17201` `17204`
<chr> <chr>   <date>     <date>       <dbl>   <dbl>
002   61R002  2018-01-02 2018-01-07    0.41    0.38
002   61R002  2018-01-08 2018-01-14    0.42    0.38
002   61R002  2018-01-15 2018-01-21    0.39    0.37
002   61R002  2018-01-22 2018-01-28    0.46    0.38
002   61R002  2018-01-29 2018-02-04    0.33    0.31
002   61R002  2018-02-05 2018-02-11    0.4     0.33
002   61R002  2018-02-12 2018-02-18    0.38    0.38

How to compact block-duplicates of variables from a date-ordered dataset into a wide-table in R?

1 Answers1

Linked