2

I want to get the missing part of several date's intervals in 2017.

here for example, each "id" of following dataframe:

df <- data.frame(id=c(rep("a",3),rep("b",2)),
                 start=c("2017-01-01","2017-01-10","2017-02-10","2017-03-01","2017-04-20"),
                 end=c("2017-01-15","2017-01-20","2017-02-20","2017-03-28","2017-04-29"))

id    start        end 
a     2017-01-01   2017-01-15 
a     2017-01-10   2017-01-20
a     2017-02-10   2017-02-20
b     2017-03-01   2017-03-28
b     2017-04-20   2017-04-29

I want to get:

df_final <- data.frame(id=c(rep("a",2),rep("b",3)),
                       start=c("2017-01-21","2017-02-21","2017-01-01","2017-03-29","2017-04-30"),
                       end=c("2017-02-09","2017-12-31","2017-02-28","2017-04-19","2017-12-31"))

id    start        end
a     2017-01-21   2017-02-09
a     2017-02-21   2017-12-31
b     2017-01-01   2017-02-28
b     2017-03-29   2017-04-19
b     2017-04-30   2017-12-31

Thank you!

Darren Tsai
  • 32,117
  • 5
  • 21
  • 51
DD chen
  • 169
  • 11

3 Answers3

2

First, confirm whether start and end are Date class.

df$start <- as.Date(df$start)
df$end <- as.Date(df$end)

Use by() to split the data into a list of two data frames according to the ids.

library(purrr)

by(df, df$id, function(x){
  year <- seq(as.Date("2017-01-01"), as.Date("2017-12-31"), 1)
  ind <- map2(x$start, x$end, function(start, end){
      which(year < start | year > end)
  }) %>% reduce(intersect)
  gap <- which(diff(ind) > 1)
  head <- ind[c(1, gap + 1)] ; tail <- ind[c(gap, length(ind))]
  return(data.frame(id = unique(x$id), start = year[head], end = year[tail]))
}) %>% reduce(rbind)

Description:

  • year : All days in 2017.
  • ind : Get rid of the dates between start and end along the rows and the outcome represents the indices of missing dates.
  • gap : The discontinuous indices.

Output:

#   id      start        end
# 1  a 2017-01-21 2017-02-09
# 2  a 2017-02-21 2017-12-31
# 3  b 2017-01-01 2017-02-28
# 4  b 2017-03-29 2017-04-19
# 5  b 2017-04-30 2017-12-31

I think my solution is still cumbersome. Hope to help you.

Darren Tsai
  • 32,117
  • 5
  • 21
  • 51
0

I encountered a similar problem recently, and I found that expanding the table to get one row for each relevant date, and then collapsing back down to ranges, was easier than trying to work out the correct logic from the range endpoints alone.

Here's how that approach would work. Alternatively, it might be possible to do something like this or this, but those approaches don't have the "not in range" issue you're dealing with.

library(dplyr)
library(fuzzyjoin)
library(lubridate)

df <- data.frame(id=c(rep("a",3),rep("b",2)),
                 start=c("2017-01-01","2017-01-10","2017-02-10","2017-03-01","2017-04-20"),
                 end=c("2017-01-15","2017-01-20","2017-02-20","2017-03-28","2017-04-29"))

# All the dates in 2017.
all.2017.dates = data.frame(date = seq.Date(as.Date("2017-01-01"), as.Date("2017-12-31"), by = "day"))

# Start by expanding the original dataframe so that we get one record for each
# id for each date in any of that id's ranges.
df.expanded = df %>%
  # Convert the strings to real dates.
  mutate(start.date = as.Date(start),
         end.date = as.Date(end)) %>%
  # Left join to 2017 dates on dates that are in the range of this record.
  fuzzy_left_join(all.2017.dates,
                  by = c("start.date" = "date", "end.date" = "date"),
                  match_fun = list(`<=`, `>=`)) %>%
  # Filter to distinct ids/dates.
  select(id, date) %>%
  distinct()

# Now, do an anti-join that gets dates NOT in an id's ranges, and collapse back
# down to ranges.
df.final = expand.grid(id = unique(df$id),
                       date = all.2017.dates$date) %>%
  # Anti-join on id and date.
  anti_join(df.expanded,
            by = c("id", "date")) %>%
  # Sort by id, then date, so that the lead/lag functions behave as expected.
  arrange(id, date) %>%
  # Check whether this record is an endpoint (i.e., is it adjacent to the
  # previous/next record?).
  mutate(prev.day.included = coalesce(date == lag(date) + 1 &
                                        id == lag(id), F),
         next.day.included = coalesce(date == lead(date) - 1 &
                                        id == lag(id), F)) %>%
  # Filter to just endpoint records.
  filter(!prev.day.included | !next.day.included) %>%
  # Fill in both start and end dates on "start" records.  The start date is the
  # date in the record; the end date is the date of the next record.
  mutate(start.date = as.Date(ifelse(!prev.day.included, date, NA),
                              origin = lubridate::origin),
         end.date = as.Date(ifelse(!prev.day.included, lead(date), NA),
                            origin = lubridate::origin)) %>%
  filter(!is.na(start.date))
A. S. K.
  • 2,504
  • 13
  • 22
0

Here's my solution:

library(tidyverse)
library(lubridate)
library(wrapr)

df %>%
  mutate_at(2:3, ymd) %>%
  group_by(id) %>%
  gather('start_end', 'date', start:end) %>%
  mutate(date = if_else(start_end == 'start', min(date), max(date))) %>%
  unique() %>%
  mutate(
    start = if_else(
      start_end == 'start',
      date %>% min() %>% year() %>% paste0('-01-01') %>% ymd(),
      date
    ),
    end = if_else(
      start_end == 'end',
      date %>% max() %>% year() %>% paste0('-12-31') %>% ymd(),
      date
  )) %>%
  filter(start != end) %>%
  select(id, start, end) %>%
  mutate(supp = TRUE) %>%
  bind_rows(mutate(df, supp = FALSE) %>% mutate_at(2:3, ymd)) %>%
  arrange(id, start) %>%
  mutate(rn = row_number()) %.>%
  left_join(., mutate(., rn = rn - 1), by = c('id', 'rn')) %>%
  na.omit() %>%
  mutate(
    start = case_when(
      (start.y >= end.x) & !supp.x ~ end.x + 1,
      (start.y >= end.x) &  supp.x ~ start.x,
      TRUE ~ as.Date(NA)
    ),
    end = case_when(
      (start.y >= end.x) &  supp.y ~ end.y,
      (start.y >= end.x) & !supp.y ~ start.y - 1,
      TRUE ~ as.Date(NA)
    )
  ) %>%
  select(id, start, end) %>%
  na.omit()
Paweł Chabros
  • 2,349
  • 1
  • 9
  • 12