0

I'd like to modify the effects of the maxgaps argument of the .fill_short_gaps function in the R zoo library (used in na.locf and na.approx), as described in uday's comment here.

The following example illustrates existing behavior in the context of na.locf.

x <- c(rep(NA, 2), 1:4, rep(NA, 4), 7:8, rep(NA, 2), 9:10)
y <- na.locf(x, na.rm=FALSE, maxgap=2)
rbind(x, y)

Which results in

   [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] [,11] [,12] [,13] [,14] [,15] [,16]

x    NA   NA    1    2    3    4   NA   NA   NA    NA     7     8    NA    NA     9    10

y    NA   NA    1    2    3    4   NA   NA   NA    NA     7     8     8     8     9    10

However, I'd like the group of four internal NAs in 7:10 to be filled forward with maxgap values, with the rest as NAs. E.g.:

    [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] [,11] [,12] [,13] [,14] [,15] [,16]
x   NA   NA    1    2    3    4   NA   NA   NA    NA     7     8    NA    NA     9    10
z   NA   NA    1    2    3    4    4    4   NA    NA     7     8     8     8     9    10

For reference, here is the .fill_short_gaps function from the zoo package:

## x = series with gaps
## fill = same series with filled gaps
.fill_short_gaps <- function(x, fill, maxgap) {
    if (maxgap <= 0)
        return(x)
    if (maxgap >= length(x))
        return(fill)
    naruns <- rle(is.na(x))

    # This is the part I want to modify.  Currently sets all runs > maxgap
    # to FALSE (meaning don't fill)
    naruns$values[naruns$lengths > maxgap] <- FALSE

    naok <- inverse.rle(naruns)
    ifelse(naok, fill, x)
}

Using x as the example, naruns looks like this:

naruns

Run Length Encoding
    lengths: int [1:6] 2 4 4 2 2 2
    values : logi [1:6] TRUE FALSE TRUE FALSE TRUE FALSE

One approach to solving my problem would involve inserting values in the appropriate places in the naruns vectors so that naok can be created correctly. This would look like:

Run Length Encoding
    lengths: int [1:7] 2 4 2 2 2 2 2
    values : logi [1:7] TRUE FALSE TRUE FALSE FALSE TRUE FALSE 

That is, the 4 (TRUE) in position 3 would be split into 2 (TRUE) and 2 (FALSE), that is, at positions identified by which(naruns$values & naruns$lengths > maxgap) but I'm not sure of a good way to insert values in the correct positions.

I've considered several clumsy approaches to doing this, but they've ended in dead ends. I know from looking at the answers to other (unrelated) questions on SO that many can come up with something much more robust and scalable than anything I am likely to emit in a reasonable timespan. Thanks for any help.

Community
  • 1
  • 1
Inhabitant
  • 859
  • 6
  • 6

1 Answers1

0

Turns out I was able to answer my own question, once I had identified the problem more narrowly. They key problem in the approach I described above was to insert values into a vector at specified positions. I found a good way to do that at How to insert elements into a vector?. Using the accepted answer there, I created this function that fills runs of NAs up to a specified maximum number and leaves the rest as NAs.

It's based on zoo::na.locf except that I replaced the .fill_short_gaps with the version shown below:

fillNa <- function (object, na.rm = TRUE, fromLast, rev, maxfill = Inf, 
                    rule = 2, ...) {

    ## x = series with gaps
    ## fill = same series with filled gaps
    .fill_short_gaps <- function(x, fill, maxfill) {
        if (maxfill <= 0)
            return(x)
        if (maxfill >= length(x))
            return(fill)

        naruns <- rle(is.na(x))
        len <- length(naruns$lengths)

        # Identify which runs are greater than maxfill
        fill.idx <- which(naruns$values & naruns$lengths > maxfill)

        # Create a new naruns
        naruns2 <- vector('list', 2)
        attr(naruns2, 'class') <- 'rle'

        idx <- c(seq_along(naruns$values), fill.idx + 0.5)
        naruns2$values <- c(naruns$values, rep(FALSE, length(fill.idx)))
        naruns2$values[fill.idx] <- TRUE
        naruns2$values <- naruns2$values[order(idx)]

        fill.lngth <- naruns$lengths[fill.idx]
        naruns2$lengths <- c(naruns$lengths, (fill.lngth - maxfill))
        naruns2$lengths[fill.idx] <- maxfill
        naruns2$lengths <- naruns2$lengths[order(idx)]

        naok <- rep(NA, length(x))

        naok <- inverse.rle(naruns2)
        ifelse(naok, fill, x)
    }

    L <- list(...)
    if ("x" %in% names(L) || "xout" %in% names(L)) {
        if (!missing(fromLast)) {
            stop("fromLast not supported if x or xout is specified")
        }
        return(na.approx(object, na.rm = na.rm, maxfill = maxfill, 
                         method = "constant", rule = rule, ...))
    }
    na.locf.0 <- function(x) {
        L <- !is.na(x)
        idx <- if (fromLast) 
                   rev(c(NA, rev(which(L)))[cumsum(rev(L)) + 1])
               else c(NA, which(L))[cumsum(L) + 1]
        na.index <- function(x, i) {
            L <- !is.na(i)
            x[!L] <- NA
            x[L] <- coredata(x)[i[L]]
            x
        }
        xf <- na.index(x, idx)
        .fill_short_gaps(x, xf, maxfill = maxfill)
    }

    if (!missing(rev)) {
        warning("na.locf.default: rev= deprecated. Use fromLast= instead.")
        if (missing(fromLast)) 
            fromLast <- rev
    }
    else if (missing(fromLast)) 
        fromLast <- FALSE
    rev <- base::rev
    object[] <- if (length(dim(object)) == 0) 
                    na.locf.0(object)
                else apply(object, length(dim(object)), na.locf.0)
    if (na.rm) 
        na.trim(object, is.na = "all")
    else object
}
Community
  • 1
  • 1
Inhabitant
  • 859
  • 6
  • 6