0

I have this curve (COVID-19 cases per 100,000 inhabitants in California between 2020-09-01 and 2021-03-01):

enter image description here

It's clear that the dip at the end of December 2020 is an artifact of testing's having gone down during the winter holidays (the nadir occurs exactly on Christmas Day) rather than a true decline in cases.

What I would like to do is impute values via some sort of quadratic or parabolic interpolation to come up with plausible values for the real case rate (per 100k) between 2020-12-12 and 2021-01-13. How can I do this?

Here's the code I used to generate the plot:

x <- seq.Date(as.Date("2020-09-01"), as.Date("2021-03-01"), by=1)
y <- c(9.36,9.16,9.05,8.88,8.76,8.65,7.94,7.81,7.65,7.5,7.47,7.5,7.52,8.19,
       8.03,8.1,8.12,8.14,8.19,8.24,8.21,8.19,8.19,8.22,8.24,8.2,8.16,8.14,
       8.16,8.14,8.25,8.19,8.3,8.36,8.45,8.43,8.42,8.44,8.51,8.63,8.62,8.63,
       8.66,8.69,8.73,8.81,8.79,8.9,9.15,9.46,9.67,9.78,10.07,10.19,10.32,10.48,
       10.52,10.69,10.93,11.27,11.68,12.4,13.45,14.66,15.92,17.09,18.15,18.85,
       19.04,19.98,20.93,21.69,22.89,24.28,25.52,26.78,29.08,31.29,33.62,35.34,
       37.11,37.95,38.35,39.59,40.82,42.06,39.44,39.63,41.69,43.73,47.78,52.64,
       57.16,65.24,70.15,72.29,73.01,76.01,78.53,81.46,84.64,87.58,89.86,90.79,
       93.81,96.47,98.05,99.48,100.07,99.73,99.65,99.36,99.52,99.32,92.84,82.53,
       84.34,86.33,89.6,92.99,96.15,99.42,101.56,102.45,102.72,103.63,102.26,101,
       104.48,112.58,109.57,106.79,100.29,94.47,88.73,83.79,79.33,76.19,74.25,
       67.69,63.86,60.59,57.27,54.07,51.8,50.69,49.49,46.01,42.54,39.79,37.28,
       36.11,35.29,33.53,31.66,30.16,28.58,27.37,26.13,25.13,23.06,21.33,19.92,
       18.65,17.51,16.71,16.13,14.63,13.89,13.03,12.27,11.56,11.1,10.79,10.63,
       10.07,9.63,9.28,8.98,8.77,8.61,8.25)

df <- data.frame(x,y)

p <- ggplot(data=df) +
  geom_line(aes(x=as.Date(x, origin=as.Date("1970-01-01")),y=y))
p

I'm not really sure where to begin, so I'd appreciate it if someone tossed me a bone, here. Thanks! :)

dbcrow
  • 45
  • 1
  • 5
  • 1
    I would say this is a duplicate: https://stackoverflow.com/questions/3822535/fitting-polynomial-model-to-data-in-r – bricx May 17 '21 at 18:24
  • This is not a duplicate, unless I'm missing something. The link given (https://stackoverflow.com/questions/3822535/fitting-polynomial-model-to-data-in-r) deals with fitting polynomial curves to existing data. I'm asking about interpolating (i.e., imputing) data for a portion of the curve that is missing. – dbcrow May 17 '21 at 22:13
  • 1
    Ok, I meant what you are probably looking for is to fit a polynomial and then interpolate that polynomial. You will always need to fit a model of some kind to be able to interpolate. – bricx May 18 '21 at 16:14

1 Answers1

0

A colleague supplied this code:

    #Quadratic Interpolation 

library(magrittr)
library(dplyr)
library(ggplot2)
library(deSolve)
ca_pop <- 40129160L

x <- seq.Date(as.Date("2020-09-01"), as.Date("2021-03-01"), by=1)
y <- c(9.36,9.16,9.05,8.88,8.76,8.65,7.94,7.81,7.65,7.5,7.47,7.5,7.52,8.19,
       8.03,8.1,8.12,8.14,8.19,8.24,8.21,8.19,8.19,8.22,8.24,8.2,8.16,8.14,
       8.16,8.14,8.25,8.19,8.3,8.36,8.45,8.43,8.42,8.44,8.51,8.63,8.62,8.63,
       8.66,8.69,8.73,8.81,8.79,8.9,9.15,9.46,9.67,9.78,10.07,10.19,10.32,10.48,
       10.52,10.69,10.93,11.27,11.68,12.4,13.45,14.66,15.92,17.09,18.15,18.85,
       19.04,19.98,20.93,21.69,22.89,24.28,25.52,26.78,29.08,31.29,33.62,35.34,
       37.11,37.95,38.35,39.59,40.82,42.06,39.44,39.63,41.69,43.73,47.78,52.64,
       57.16,65.24,70.15,72.29,73.01,76.01,78.53,81.46,84.64,87.58,89.86,90.79,
       93.81,96.47,98.05,99.48,100.07,99.73,99.65,99.36,99.52,99.32,92.84,82.53,
       84.34,86.33,89.6,92.99,96.15,99.42,101.56,102.45,102.72,103.63,102.26,101,
       104.48,112.58,109.57,106.79,100.29,94.47,88.73,83.79,79.33,76.19,74.25,
       67.69,63.86,60.59,57.27,54.07,51.8,50.69,49.49,46.01,42.54,39.79,37.28,
       36.11,35.29,33.53,31.66,30.16,28.58,27.37,26.13,25.13,23.06,21.33,19.92,
       18.65,17.51,16.71,16.13,14.63,13.89,13.03,12.27,11.56,11.1,10.79,10.63,
       10.07,9.63,9.28,8.98,8.77,8.61,8.25)

df <- data.frame(x,y, day_num = 1:length(y))
leave_out <- which(df$x > "2020-12-18" & df$x < "2021-01-08")

#Plot curve with missing points
df[-leave_out,] %>% filter(x > "2020-10-15") %>%
  ggplot() +
  geom_point(aes(x=x,y=y), shape=1, alpha=.7, size=.6) + 
  theme_bw()

df_fit <- df %>%#[-leave_out,] %>%
  filter(x > "2020-10-15") %>%
  mutate(transform_day = 1 / day_num) 

#Plot df_fit
#ggplot(data=df_fit, aes(x=x, y=transform_day)) + geom_line()
  
#quad_model <- lm(y ~ (poly(transform_day, 2)), data = df_fit)
#y_fit <- predict(quad_model)
#model_fit <- data.frame(x = df_fit$x,y_fit)
#model_fit %>% filter(x > "2020-10-15") %>%
#  ggplot() +
#  geom_line(aes(x = x, y = y_fit)) +
#  geom_point(data = df_fit, aes(x = x, y =y)) + theme_bw()
halfway_ind <- round(mean(order(abs(y - 30))[1:2]))
halfway_ind #116
halfway_ind60 <- round(mean(order(abs(y - 60))[c(1,3)]))
halfway_ind60 #118
##Let's say 117 for the peak
df_fit$day_adj <- df_fit$day_num - 117
df_fit$model <- 150*375/ (df_fit$day_adj^2 + 375)
df_fit$cases <- df_fit$y * ca_pop / 1e5
df_fit %>% 
  ggplot() +
  geom_line(aes(x = day_adj, y = model)) +
  geom_point(aes(x = day_adj, y =y)) + theme_bw()
dbcrow
  • 45
  • 1
  • 5