0

I have a data frame (dfA) showing for each country when a specific status occurred for the first time.

What I would like to get is a data frame (dfGoal) which shows for every day from January 1, 2020 until today which status was in place at this specific day. In addition to that I also would like to see since when the status is in place.

Here you can see a minimal example of my data and how my data frame should look like at the end.

pacman::p_load(lubridate)

# data frame which shows which status occurred when for the first time
dfA <- data.frame(country = c("Poland", "Poland", "Poland",
                          "Spain", "Spain",
                          "Italy", "Italy", "Italy", "Italy"),
              status = c(0, 1, 2,
                         0, 2,
                         0, 1, 2, 0),
              since = c(20200101, 20200228, 20200312,
                       20200118, 20200301,
                       20200101, 20200212, 20200304, 20200401))

date_seq <- seq(ymd('2020-01-01'),ymd('2020-04-18'), by = '1 day')

# target data frame
dfGoal <- data.frame(curr_date = c(date_seq, date_seq, date_seq),
                 country = c(replicate(109, "Poland"), 
                             replicate(109, "Spain"),
                             replicate(109, "Italy")),
                 status = c(replicate(58, 0),
                            replicate(13, 1),
                            replicate(38, 2),
                            replicate(17, NA),
                            replicate(43, 0),
                            replicate(49, 2),
                            replicate(42, 0),
                            replicate(21, 1),
                            replicate(28, 2),
                            replicate(18, 0)),
                 since = c(replicate(58, 20200101),
                          replicate(13, 20200228),
                          replicate(38, 20200312),
                          replicate(17, NA),
                          replicate(43, 20200118),
                          replicate(49, 20200301),
                          replicate(42, 20200101),
                          replicate(21, 20200212),
                          replicate(28, 20200304),
                          replicate(18, 20200401)))

I tried to apply the suggestions made here, however I was not able to solve it.

Does anyone know how to do this with dplyr?

marc_s
  • 732,580
  • 175
  • 1,330
  • 1,459

2 Answers2

0

A data.table solution

library(data.table)
library(lubridate)
#> 
#> Attaching package: 'lubridate'
#> The following objects are masked from 'package:data.table':
#> 
#>     hour, isoweek, mday, minute, month, quarter, second, wday, week,
#>     yday, year
#> The following objects are masked from 'package:base':
#> 
#>     date, intersect, setdiff, union

dfA <- data.frame(country = c("Poland", "Poland", "Poland",
                              "Spain", "Spain",
                              "Italy", "Italy", "Italy", "Italy"),
                  status = c(0, 1, 2,
                             0, 2,
                             0, 1, 2, 0),
                  since = c(20200101, 20200228, 20200312,
                            20200118, 20200301,
                            20200101, 20200212, 20200304, 20200401))
setDT(dfA)


dfA[,since:=ymd(since)]

dfA[,end:={x <- shift(since,-1)
          x <- x-1
                x[is.na(x)] <- ymd(20200418)
                x},by=.(country)]
dfA
#>    country status      since        end
#> 1:  Poland      0 2020-01-01 2020-02-27
#> 2:  Poland      1 2020-02-28 2020-03-11
#> 3:  Poland      2 2020-03-12 2020-04-18
#> 4:   Spain      0 2020-01-18 2020-02-29
#> 5:   Spain      2 2020-03-01 2020-04-18
#> 6:   Italy      0 2020-01-01 2020-02-11
#> 7:   Italy      1 2020-02-12 2020-03-03
#> 8:   Italy      2 2020-03-04 2020-03-31
#> 9:   Italy      0 2020-04-01 2020-04-18


dfA[,.(country=country,status=status,since=since,
       curr_date=seq(from=since,to=end,by="day")),by=1:nrow(dfA)]
#>      nrow country status      since  curr_date
#>   1:    1  Poland      0 2020-01-01 2020-01-01
#>   2:    1  Poland      0 2020-01-01 2020-01-02
#>   3:    1  Poland      0 2020-01-01 2020-01-03
#>   4:    1  Poland      0 2020-01-01 2020-01-04
#>   5:    1  Poland      0 2020-01-01 2020-01-05
#>  ---                                          
#> 306:    9   Italy      0 2020-04-01 2020-04-14
#> 307:    9   Italy      0 2020-04-01 2020-04-15
#> 308:    9   Italy      0 2020-04-01 2020-04-16
#> 309:    9   Italy      0 2020-04-01 2020-04-17
#> 310:    9   Italy      0 2020-04-01 2020-04-18

Created on 2020-04-19 by the reprex package (v0.3.0)

Frank Zhang
  • 1,670
  • 7
  • 14
0

Here is a solution with tidyverse, using lubridate to convert dates from numeric.

You can use complete to add rows of dates between Jan 1 and today for each country, and then fill status and since columns down from the last value.

library(tidyverse)
library(lubridate)

dfA %>%
  mutate(since = ymd(since),
         curr_date = ymd(since)) %>%
  group_by(country) %>%
  complete(curr_date = seq(ymd(20200101), ymd(20200418), by = "1 day")) %>%
  fill(status, since, .direction = "down")

Output

# A tibble: 327 x 4
# Groups:   country [3]
   country curr_date  status since     
   <fct>   <date>      <dbl> <date>    
 1 Italy   2020-01-01      0 2020-01-01
 2 Italy   2020-01-02      0 2020-01-01
 3 Italy   2020-01-03      0 2020-01-01
 4 Italy   2020-01-04      0 2020-01-01
 5 Italy   2020-01-05      0 2020-01-01
 6 Italy   2020-01-06      0 2020-01-01
 7 Italy   2020-01-07      0 2020-01-01
 8 Italy   2020-01-08      0 2020-01-01
 9 Italy   2020-01-09      0 2020-01-01
10 Italy   2020-01-10      0 2020-01-01
# … with 317 more rows
Ben
  • 28,684
  • 5
  • 23
  • 45