9

I would like to replace up to n consecutive NA values in a vector with the latest non-NA value.

For example, if:

# Example input: non-NA values followed by runs of NA of varying length.
a <- c(1,NA,NA,NA,NA,NA,2,NA,1,NA,NA,NA)
# Maximum number of consecutive NAs that may be filled after each non-NA value.
n <- 2

I would like to obtain:

c(1,1,1,NA,NA,NA,2,2,1,1,1,NA)

(n is the maximum number of NA values that can be replaced by a given element.)

I know the na.locf() function, but I don't know how to set the limit n. Is it possible to do this?

Henrik
  • 65,555
  • 14
  • 143
  • 159
quarandoo
  • 369
  • 4
  • 9
  • Some additional alternatives here: [Fill NA in a time series only to a limited number](https://stackoverflow.com/questions/25940241/fill-na-in-a-time-series-only-to-a-limited-number) – Henrik Mar 07 '21 at 15:55

5 Answers

10

Here's an option using na.locf and rle

library(zoo)
# Run-length encode the NA mask so each NA can be located within its own run.
r <- rle(is.na(a))
# Carry the last observation forward. na.rm = FALSE keeps leading NAs in place,
# so the filled vector has the same length as the mask built from `r` below
# (the default would drop them and misalign the two).
a <- na.locf(a, na.rm = FALSE)
# Inside every NA run, positions beyond the n-th are turned back into NA.
is.na(a) <- sequence(r$lengths) > n & rep(r$values, r$lengths)
a
# [1]  1  1  1 NA NA NA  2  2  1  1  1 NA

So here I first computed the run lengths of elements in a (including the NA entries), then replaced all NA's using na.locf and finally turned those elements back to NA's where the run lengths were greater than n and the elements were NA.

talat
  • 68,970
  • 21
  • 126
  • 157
  • This answers this question, but note that it doesn't work if the first value in a is NA. In that case, you need to have `a <- na.locf(a, na.rm = FALSE)` instead of `a <- na.locf(a)`. – Joe Silverstein Mar 03 '22 at 10:15
3

As another idea, we can find the last indices of "a" without NAs:

# Position of every element of "a".
is <- seq_along(a)
# Positions holding NA are zeroed out; the running maximum then yields, for
# each element, the index of the most recent non-NA value (0 if none so far).
i <- cummax(is * (!is.na(a)))
i
# [1] 1 1 1 1 1 1 7 7 9 9 9 9

Replace the last non-NA index with the current index if last non-NA is more than "n" steps away:

# Point an element back at itself when its last non-NA value is more than n
# steps away, or when no non-NA value precedes it at all (i == 0). Without the
# second condition a leading NA keeps index 0, and subsetting with a[i] would
# silently drop that element and shorten the result.
wh <- (is - i) > n | i == 0
i[wh] <- is[wh]
i
# [1]  1  1  1  4  5  6  7  7  9  9  9 12

And subset "a":

# Index "a" by i: filled positions take the last non-NA value, the remaining
# positions point at themselves and therefore keep their own value.
a[i]
# [1]  1  1  1 NA NA NA  2  2  1  1  1 NA
alexis_laz
  • 12,884
  • 4
  • 27
  • 37
  • @989 : I might be missing something, but shouldn't the result be `c(1, 1, 1, 1, 1, NA, 2, 2, 1, 1, 1, NA)` in that case? I get this result after `a[i]` from above. – alexis_laz Apr 05 '17 at 09:14
2

You could do this using split and replace in base R

# Fill at most `n` consecutive NAs after each non-NA value of `a` with that
# value (a limited "last observation carried forward").
#
# a: vector that may contain NA.
# n: maximum number of NAs to fill after each non-NA value.
# Returns a vector of the same length and type as `a`.
f <- function(a, n) {
  # Nothing to fill. Returning early keeps the type of an empty input;
  # unlist() of an empty list would give NULL instead.
  if (length(a) == 0L) {
    return(a)
  }
  # Every non-NA value starts a new run, so each piece looks like
  # "value, NA, NA, ...". A leading block of NAs forms run 0 and stays NA.
  run_id <- cumsum(!is.na(a))
  pieces <- split(a, run_id)
  fill_head <- function(r) {
    # The first value plus up to n following NAs, capped at the piece length.
    # seq_len() avoids the 1:0 trap of `1:(n + 1)`.
    k <- min(length(r), max(0, floor(n) + 1))
    r[seq_len(k)] <- r[1]
    r
  }
  unlist(lapply(pieces, fill_head), use.names = FALSE)
}

# n = 2: at most two NAs are filled after each non-NA value.
f(a, n = 2)
#[1]  1  1  1 NA NA NA  2  2  1  1  1 NA

# n = 3: one more NA per run is filled.
f(a, n = 3)
#[1]  1  1  1  1 NA NA  2  2  1  1  1  1

Benchmarking (randomly generated vector of size 7467)

library(microbenchmark)
library(dplyr)
library(zoo)
# Fixed seed so the generated vector is reproducible.
set.seed(123)
# 1000 chunks, each made of two values sampled from 1:10 followed by 1 to 10 NAs.
a <- unlist(replicate(1000, c(sample(10, 2), rep(NA, sample.int(10, 1)))))
length(a)
# [1] 7467
# Fill limit passed to every benchmarked function.
n <- 3
# split()/lapply() implementation: fill at most `n` consecutive NAs after each
# non-NA value of `a` with that value.
#
# a: vector that may contain NA.
# n: maximum number of NAs to fill after each non-NA value.
# Returns a vector of the same length and type as `a`.
f_989 <- function(a, n) {
  # Nothing to fill. Returning early keeps the type of an empty input;
  # unlist() of an empty list would give NULL instead.
  if (length(a) == 0L) {
    return(a)
  }
  # Every non-NA value starts a new run, so each piece looks like
  # "value, NA, NA, ...". A leading block of NAs forms run 0 and stays NA.
  run_id <- cumsum(!is.na(a))
  pieces <- split(a, run_id)
  fill_head <- function(r) {
    # The first value plus up to n following NAs, capped at the piece length.
    # seq_len() avoids the 1:0 trap of `1:(n + 1)`.
    k <- min(length(r), max(0, floor(n) + 1))
    r[seq_len(k)] <- r[1]
    r
  }
  unlist(lapply(pieces, fill_head), use.names = FALSE)
}
# dplyr/zoo implementation: fill at most `n` consecutive NAs after each
# non-NA value of `a` with that value.
#
# a: vector that may contain NA.
# n: maximum number of NAs to fill after each non-NA value.
# Returns the filled vector.
f_zx8754 <- function(a, n) {
  data.frame(a) %>%
    # Every non-NA value opens a new group; leading NAs fall into group 0.
    mutate(gr = cumsum(!is.na(a))) %>%
    group_by(gr) %>%
    # Only the first n + 1 rows of a group (the value and n NAs) are filled.
    # na.rm = FALSE keeps na.locf() length-stable for the all-NA group 0;
    # the default would return a zero-length vector and make if_else() fail.
    mutate(
      res = if_else(row_number() <= n + 1, na.locf(a, na.rm = FALSE), a)
    ) %>%
    ungroup() %>%
    pull(res)
}
# rle()/na.locf() implementation: fill at most `n` consecutive NAs after each
# non-NA value of `a` with that value.
#
# a: vector that may contain NA.
# n: maximum number of NAs to fill after each non-NA value.
# Returns a vector of the same length as `a`.
f_docendo_discimus <- function(a, n) {
  # Run-length encode the NA mask so each NA can be located within its run.
  r <- rle(is.na(a))
  # na.rm = FALSE keeps leading NAs, so the filled vector stays aligned with
  # the mask derived from `r` (the default drops them and shortens the vector).
  a <- na.locf(a, na.rm = FALSE)
  # Inside every NA run, positions beyond the n-th are turned back into NA.
  is.na(a) <- sequence(r$lengths) > n & rep(r$values, r$lengths)
  a
}
# ave() implementation: within each group opened by a non-NA value, overwrite
# the first n + 1 positions (capped at the group length) with the group's
# first element.
f_akrun <- function(a, n) {
  run_id <- cumsum(!is.na(a))
  fill_head <- function(x) {
    # pmin() clamps indices past the end of a short group to its last position.
    head_idx <- pmin(length(x), seq(n + 1))
    replace(x, head_idx, x[1])
  }
  ave(a, run_id, FUN = fill_head)
}

# Index-based implementation: fill at most `n` consecutive NAs after each
# non-NA value of `a` with that value.
#
# a: vector that may contain NA.
# n: maximum number of NAs to fill after each non-NA value.
# Returns a vector of the same length as `a`.
f_alexis_laz <- function(a, n) {
  pos <- seq_along(a)
  # Index of the most recent non-NA value at each position (0 if none yet).
  last_seen <- cummax((!is.na(a)) * pos)
  # An element keeps its own value when the last non-NA is more than n steps
  # back, or when nothing precedes it. The second condition matters: an index
  # of 0 would make a[...] drop the element and shorten the result.
  keep_own <- (pos - last_seen) > n | last_seen == 0L
  last_seen[keep_own] <- pos[keep_own]
  a[last_seen]
}
# Reference result; each of the other implementations is compared against it.
r <- f_989(a, n)
identical(r, f_zx8754(a, n))
# [1] TRUE
identical(r, f_docendo_discimus(a, n))
# [1] TRUE
identical(r, f_akrun(a, n))
# [1] TRUE
identical(r, f_alexis_laz(a, n))
# [1] TRUE
# Time all five implementations on the same input.
res <- microbenchmark(
  f1 = f_989(a, n),
  f2 = f_zx8754(a, n),
  f3 = f_docendo_discimus(a, n),
  f4 = f_akrun(a, n),
  f5 = f_alexis_laz(a, n)
)

print(res, order = "mean")

# Unit: microseconds
 # expr        min         lq       mean      median          uq        max neval
   # f5    129.804    137.014    161.106    141.6715    151.7375   1031.511   100
   # f3   1249.351   1354.215   1471.478   1392.9750   1482.2140   2553.086   100
   # f1   4736.895   5093.852   5630.367   5345.3450   6069.9260   8848.513   100
   # f4  22165.601  23936.866  24660.990  24485.6725  24883.6440  29453.177   100
   # f2 205854.339 215582.174 221524.448 218643.9540 224211.0435 261512.922   100
989
  • 12,579
  • 5
  • 31
  • 53
1

We can use a base R approach by creating a grouping variable with cumsum and diff, then using the grouping variable in ave we replace the NA values based on the condition given by 'n'

# A new group starts wherever an NA is followed by a non-NA value, so each
# group is a run of non-NA values followed by a run of NAs.
ave(a, cumsum(c(TRUE, diff(is.na(a)) < 0)),
    FUN = function(x) {
      # Number of non-NA values at the head of the group; the NA run starts
      # right after them, so the fill window and the fill value are taken
      # relative to the last non-NA value rather than the group's first element.
      k <- sum(!is.na(x))
      replace(x, is.na(x) & seq_along(x) <= k + n, x[max(k, 1L)])
    })
#[1]  1  1  1 NA NA NA  2  2  1  1  1 NA

Or a more compact option

# Each non-NA value opens its own group; the first n + 1 positions of a group
# (clamped to the group length by pmin) are overwritten with its first element.
ave(
  a,
  cumsum(!is.na(a)),
  FUN = function(x) {
    head_idx <- pmin(length(x), seq(n + 1))
    replace(x, head_idx, x[1])
  }
)
#[1]  1  1  1 NA NA NA  2  2  1  1  1 NA
akrun
  • 874,273
  • 37
  • 540
  • 662
  • 1
    I don't seem to get the correct result when I use `a = c(1,1,1,NA,NA,NA,2,NA,1,NA,NA,NA)` with your first approach (I get `[1] 1 1 1 NA NA NA 2 2 1 1 1 NA`) – talat Apr 04 '17 at 13:29
  • @docendodiscimus I think it was based on the grouping variable. My initial thought is that the OP's example will be having a single non-NA element before each block of NAs – akrun Apr 04 '17 at 15:08
0

Using dplyr::group_by and zoo::na.locf:

library(dplyr)
library(zoo)

data.frame(a) %>%
  # Every non-NA value opens a new group; leading NAs fall into group 0.
  mutate(gr = cumsum(!is.na(a))) %>%
  group_by(gr) %>%
  # Only the first n + 1 rows of a group (the value and n NAs) are filled.
  # na.rm = FALSE keeps na.locf() length-stable for an all-NA leading group;
  # the default would return a zero-length vector and make if_else() fail.
  mutate(
    res = if_else(row_number() <= n + 1, na.locf(a, na.rm = FALSE), a)
  ) %>%
  ungroup() %>%
  pull(res)

# [1]  1  1  1 NA NA NA  2  2  1  1  1 NA
zx8754
  • 52,746
  • 12
  • 114
  • 209