22

I have two set of samples that are time independent. I would like to merge them and calculate the missing values for the times where I do not have values of both. Simplified example:

A <- cbind(time=c(10, 20, 30, 40, 50, 60, 70, 80, 90, 100),
           Avalue=c(1, 2, 3, 2, 1, 2, 3, 2, 1, 2))
B <- cbind(time=c(15, 30, 45, 60), Bvalue=c(100, 200, 300, 400))
C <- merge(A,B, all=TRUE)

   time Avalue Bvalue
1    10      1     NA
2    15     NA    100
3    20      2     NA
4    30      3    200
5    40      2     NA
6    45     NA    300
7    50      1     NA
8    60      2    400
9    70      3     NA
10   80      2     NA
11   90      1     NA
12  100      2     NA

By assuming linear change between each sample, it is possible to calculate the missing NA values. Intuitively it is easy to see that the A value at time 15 and 45 should be 1.5. But a proper calculation for B for instance at time 20 would be

100 + (20 - 15) * (200 - 100) / (30 - 15)

which equals 133.33333. The first parenthesis being the time between estimate time and the last sample available. The second parenthesis being the difference between the nearest samples. The third parenthesis being the time between the nearest samples.

How can I use R to calculate the NA values?

hlovdal
  • 26,565
  • 10
  • 94
  • 165

3 Answers3

24

Using the zoo package:

library(zoo)
Cz <- zoo(C)
index(Cz) <- Cz[,1]
Cz_approx <- na.approx(Cz)
Anatoliy
  • 1,350
  • 9
  • 9
  • Fantastic. I do not quite understand what the `index(Cz) <- Cz[,1] ` statement is doing, care to explain? – hlovdal Aug 25 '11 at 11:17
  • By default, the na.approx() function uses the index(obj) as points between which to interpolate each column of the dataframe. Default index is 1:12, so I replaced it with actual time measurements using index(). However, if you would like to preserve the default index, you can invoke `na.approx(Cz, x=Cz$time)`. – Anatoliy Aug 25 '11 at 11:26
  • library(zoo); ?index "Description: Generic functions for extracting the index of an object and replacing it." You're manipulating parts of a zoo object. Always a good idea to RTFM before asking questions. – Carl Witthoft Aug 25 '11 at 11:27
  • 8
    Note that converting the data frame to zoo could also be written as `Cz <- read.zoo(C)` which automatically assumes the first column holds the times. Also zoo's `na.approx` has a default method that works on ordinary vectors so even without converting `C` to zoo we could do this: `C$Bvalue <- na.approx(C$Bvalue, C$time, na.rm = FALSE)`. – G. Grothendieck Aug 25 '11 at 12:00
  • 2
    Might consider adding a `na.fill(na.approx(Cz), "extend")` around that command too, so leading and trailing NAs wouldn't cause extra difficulties. – puslet88 Mar 16 '15 at 16:48
3

The proper way to do this statistically and still get valid confidence intervals is to use Multiple Imputation. See Rubin's classic book, and there's an excellent R package for this (mi).

Ari B. Friedman
  • 71,271
  • 35
  • 175
  • 235
0

An ugly and probably inefficient Base R solution:

# Data provided:
A <- cbind(time=c(10, 20, 30, 40, 50, 60, 70, 80, 90, 100),
           Avalue=c(1, 2, 3, 2, 1, 2, 3, 2, 1, 2))
B <- cbind(time=c(15, 30, 45, 60), Bvalue=c(100, 200, 300, 400))
C <- merge(A,B, all=TRUE)

# Scalar valued at the minimum time difference: -> min_time_diff

min_time_diff <- min(diff(C$time))

# Adjust frequency of the series to hold all steps in range: -> df

df <- merge(C, 
            data.frame(time = seq(min_time_diff, 
                                 max(C$time), 
                                 by = min_time_diff)),
           by = "time",
           all = TRUE)



# Linear interpolation function handling ties,
# returns interpolated vector the same length 
# a the input vector: -> vector

l_interp_vec <- function(na_vec){

  approx(x = na_vec,

         method = "linear",

         ties = "constant",

         n = length(na_vec))$y

}

# Applied to a dataframe, replacing NA values
# in each of the numeric vectors, 
# with interpolated values. 
# input is dataframe: -> dataframe()

interped_df <- data.frame(lapply(df, function(x){

      if(is.numeric(x)){

        # Store a scalar of min row where x isn't NA: -> min_non_na

        min_non_na <- min(which(!(is.na(x))))

        # Store a scalar of max row where x isn't NA: -> max_non_na

        max_non_na <- max(which(!(is.na(x))))

        # Store scalar of the number of rows needed to impute prior 
        # to first NA value: -> ru_lower

        ru_lower <- ifelse(min_non_na > 1, min_non_na - 1, min_non_na)

        # Store scalar of the number of rows needed to impute after
        # the last non-NA value: -> ru_lower

        ru_upper <- ifelse(max_non_na == length(x), 

                           length(x) - 1, 

                           (length(x) - (max_non_na + 1)))

        # Store a vector of the ramp to function: -> l_ramp_up: 

        ramp_up <- as.numeric(
          cumsum(rep(x[min_non_na]/(min_non_na), ru_lower))
          )

        # Apply the interpolation function on vector "x": -> y

        y <- as.numeric(l_interp_vec(as.numeric(x[min_non_na:max_non_na])))

        # Create a vector that combines the ramp_up vector 
        # and y if the first NA is at row 1: -> z

        if(length(ramp_up) > 1 & max_non_na != length(x)){

          # Create a vector interpolations if there are 
          # multiple NA values after the last value: -> lower_l_int

          lower_l_int <- as.numeric(cumsum(rep(mean(diff(c(ramp_up, y))),
                                               ru_upper+1)) +
                                  as.numeric(x[max_non_na]))

          # Store the linear interpolations in  a vector: -> z

          z <- as.numeric(c(ramp_up, y, lower_l_int))

        }else if(length(ramp_up) > 1 & max_non_na == length(x)){

          # Store the linear interpolations in  a vector: -> z

          z <- as.numeric(c(ramp_up, y))

        }else if(min_non_na == 1 & max_non_na != length(x)){

          # Create a vector interpolations if there are 
          # multiple NA values after the last value: -> lower_l_int

          lower_l_int <- as.numeric(cumsum(rep(mean(diff(c(ramp_up, y))),
                                               ru_upper+1)) +
                                  as.numeric(x[max_non_na]))


          # Store the linear interpolations in  a vector: -> z

          z <- as.numeric(c(y, lower_l_int))

        }else{

          # Store the linear interpolations in  a vector: -> z

          z <- as.numeric(y)

        }

        # Interpolate between points in x, return new x:

        return(as.numeric(ifelse(is.na(x), z, x)))

      }else{

        x

      }

    }

  )

)

# Subset interped df to only contain 
# the time values in C, store a data frame: -> int_df_subset

int_df_subset <- interped_df[interped_df$time %in% C$time,]
hello_friend
  • 5,682
  • 1
  • 11
  • 15