2

I have a large data set, a sample is given below:

df <- data.frame(stringsAsFactors=FALSE,
                 Date = c("2015-10-26", "2015-10-26", "2015-10-26", "2015-10-26",
                          "2015-10-27", "2015-10-27", "2015-10-27"),
                 Ticker = c("ANZ", "CBA", "NAB", "WBC", "ANZ", "CBA", "WBC"),
                 Open = c(29.11, 77.89, 32.69, 31.87, 29.05, 77.61, 31.84),
                 High = c(29.17, 77.93, 32.76, 31.92, 29.08, 78.1, 31.95),
                 Low = c(28.89, 77.37, 32.42, 31.71, 28.9, 77.54, 31.65),
                 Close = c(28.9, 77.5, 32.42, 31.84, 28.94, 77.74, 31.77),
                 Volume = c(6350170L, 2251288L, 3804239L, 5597684L, 5925519L, 2424679L,
                            5448863L)
)
  • The problem I am trying to solve is the missing data for NAB on 27-10-2015
  • I want the last value to repeat itself for the missing dates:

        Date Ticker  Open  High   Low Close  Volume
    
    2 2015-10-27    NAB 32.69 32.76 32.42 32.42 3804239
    

Any ideas on how to do this?

I have already unsuccessfully tried gather + spread

cephalopod
  • 1,826
  • 22
  • 31

3 Answers3

4

What if you tried something like this?

library(zoo)
res <- expand.grid(Date = unique(df$Date), Ticker = unique(df$Ticker))
res2 <- merge(res, df, all.x = TRUE)
res2 <- res2[order(res2$Ticker, res2$Date),]
res3 <- na.locf(res2)

res3[order(res3$Date, res3$Ticker),]
#        Date Ticker  Open  High   Low Close  Volume
#1 2015-10-26    ANZ 29.11 29.17 28.89 28.90 6350170
#3 2015-10-26    CBA 77.89 77.93 77.37 77.50 2251288
#5 2015-10-26    NAB 32.69 32.76 32.42 32.42 3804239
#6 2015-10-26    WBC 31.87 31.92 31.71 31.84 5597684
#2 2015-10-27    ANZ 29.05 29.08 28.90 28.94 5925519
#4 2015-10-27    CBA 77.61 78.10 77.54 77.74 2424679
#8 2015-10-27    NAB 32.69 32.76 32.42 32.42 3804239
#7 2015-10-27    WBC 31.84 31.95 31.65 31.77 5448863

I'm assuming that if a Ticker/Day combo does not exist, you want to create one and LOCF it. This is what the expand.grid does.

Mike H.
  • 13,960
  • 2
  • 29
  • 39
  • Answers should include any packages you are using. I've never heard of `na.locf` and I get an error when I try to run it. – ngm Mar 15 '18 at 20:18
  • @ngm. Updated. This is from the `zoo` package a common R package – Mike H. Mar 15 '18 at 20:20
  • If it's possible that there are leading `NA` values then use `na.locf(res2, fill = NA)` to cause them to be left alone rather than dropped. – G. Grothendieck Mar 16 '18 at 14:53
3

tidyr::complete and tidyr::fill are built just for this situation:

library(tidyverse)
df %>%
  complete(Date,Ticker) %>%
  arrange(Ticker) %>%
  fill(names(.)) %>%
  arrange(Date)
# 
# # A tibble: 8 x 7
#         Date Ticker  Open  High   Low Close  Volume
#        <chr>  <chr> <dbl> <dbl> <dbl> <dbl>   <int>
# 1 2015-10-26    ANZ 29.11 29.17 28.89 28.90 6350170
# 2 2015-10-26    CBA 77.89 77.93 77.37 77.50 2251288
# 3 2015-10-26    NAB 32.69 32.76 32.42 32.42 3804239
# 4 2015-10-26    WBC 31.87 31.92 31.71 31.84 5597684
# 5 2015-10-27    ANZ 29.05 29.08 28.90 28.94 5925519
# 6 2015-10-27    CBA 77.61 78.10 77.54 77.74 2424679
# 7 2015-10-27    NAB 32.69 32.76 32.42 32.42 3804239
# 8 2015-10-27    WBC 31.84 31.95 31.65 31.77 5448863
moodymudskipper
  • 46,417
  • 11
  • 121
  • 167
1

Another potential solution (Note: I had to convert your date vector to Date format, but this could be reversed in the final output):

library(tidyr)
library(dplyr)

df <- data.frame(stringsAsFactors=FALSE,
                 Date = as.Date(c("2015-10-26", "2015-10-26", "2015-10-26", "2015-10-26",
                          "2015-10-27", "2015-10-27", "2015-10-27")),
                 Ticker = c("ANZ", "CBA", "NAB", "WBC", "ANZ", "CBA", "WBC"),
                 Open = c(29.11, 77.89, 32.69, 31.87, 29.05, 77.61, 31.84),
                 High = c(29.17, 77.93, 32.76, 31.92, 29.08, 78.1, 31.95),
                 Low = c(28.89, 77.37, 32.42, 31.71, 28.9, 77.54, 31.65),
                 Close = c(28.9, 77.5, 32.42, 31.84, 28.94, 77.74, 31.77),
                 Volume = c(6350170L, 2251288L, 3804239L, 5597684L, 5925519L, 2424679L,
                            5448863L))
tickers<- unique(df$Ticker)               
dates<- as.Date(df$Date)

possibilities<- as.data.frame(unique(expand.grid(dates,tickers)))
colnames(possibilities)<- c('Date','Ticker')

missing<- anti_join(possibilities,df[,c('Date','Ticker')])

missing_filled<- if(nrow(missing)>0){
replacement<-   cbind(missing,filter(df,Date==missing$Date-1,Ticker==missing$Ticker)[,3:7])
}

final<- arrange(rbind(df,replacement),Date)
Duke
  • 163
  • 8