Remove leading NAs to align data

Question

I have a large data.frame with 'staggered' data and would like to align it. What I mean is I would like to take something like

and remove the leading (top) NAs from all columns to get

I know about the na.trim function from the zoo package, but this didn't work on either the initial data.frame presented above or its transpose. For this I used, with transposed dataframe t.df,

t.df <- na.trim(t.df, sides = 'left')

This only returned an empty data.frame, and wouldn't work the way I wanted anyway since it would create vectors of different lengths. Can anyone point me to a package or function that might be more helpful?

Here is the code for my example used above:

# example of what I have

var1 <- c(1,2,3,4,5,6,7,8,9,10)
var2 <- c(6,2,4,7,3,NA,NA,NA,NA,NA)
var3 <- c(NA,NA,8,6,3,7,NA,NA,NA,NA)
var4 <- c(NA,NA,NA,NA,5,NA,2,6,2,9)

df <- data.frame(var1, var2, var3, var4)


# transpose and (unsuccessful) attempt to remove leading NAs

t.df <- t(df)

t.df <-  na.trim(t.df, sides = 'left')

Eventually you can use `rle(is.na(var4))`. – jogo Apr 15 '16 at 06:27 — jogo, Apr 15 '16 at 06:27

akrun · Accepted Answer · 2016-04-15T11:56:40.830

We can loop over the columns (lapply(..) and apply na.trim. Then, pad NAs at the end of the each of the list elements by assigning length as the maximum length from the list elements.

library(zoo)
lst <- lapply(df, na.trim)
df[] <- lapply(lst, `length<-`, max(lengths(lst)))
df
#   var1 var2 var3 var4
#1     1    6    8    5
#2     2    2    6   NA
##     3    4    3    2
#4     4    7    7    6
#5     5    3   NA    2
#6     6   NA   NA    9
#7     7   NA   NA   NA
#8     8   NA   NA   NA
#9     9   NA   NA   NA
#10   10   NA   NA   NA

Or as @G.Grothendieck mentioned in the comments

replace(df, TRUE, do.call("merge", lapply(lst, zoo)))

This would also work in place of the last line: `replace(df, TRUE, do.call("merge", lapply(lst, zoo)))` — G. Grothendieck, Apr 15 '16 at 11:42

jogo · Answer 2 · 2016-04-15T06:44:58.080

You can do with base functions:

my.na.trim <- function(x) {
  r <- rle(is.na(x))
  if (!r$value[1]) return(x)
  x[c(((r$length[1]+1):length(x)), 1:r$length[1])]
}

df[,] <- lapply(df, my.na.trim)
df
#    var1 var2 var3 var4
# 1     1    6    8    5
# 2     2    2    6   NA
# 3     3    4    3    2
# 4     4    7    7    6
# 5     5    3   NA    2
# 6     6   NA   NA    9
# 7     7   NA   NA   NA
# 8     8   NA   NA   NA
# 9     9   NA   NA   NA
# 10   10   NA   NA   NA

alternative coding for the function:

my.na.trim <- function(x) {
  r <- rle(is.na(x))
  if (!r$value[1]) return(x)
  r1 <- r$length[1]
  c(tail(x, -r1), head(x, r1))
}

RHertel · Answer 3 · 2016-04-15T08:40:53.353

We can use the cbind.na() function from the qpcR package and combine it with the na.trim() function from the zoo package:

do.call(qpcR:::cbind.na, lapply(df, zoo::na.trim))
#      var1 var2 var3 var4
# [1,]    1    6    8    5
# [2,]    2    2    6   NA
# [3,]    3    4    3    2
# [4,]    4    7    7    6
# [5,]    5    3   NA    2
# [6,]    6   NA   NA    9
# [7,]    7   NA   NA   NA
# [8,]    8   NA   NA   NA
# [9,]    9   NA   NA   NA
#[10,]   10   NA   NA   NA

score 3 · Answer 4 · answered Apr 15 '16 at 10:39

If speed is a matter you can use this data.table solution.

library(data.table)

dt_foo <- function(dt) {
  shift_v <- sapply(dt, function(col) min(which(+(is.na(col)) == 0))-1)
  shift_expr <- parse(text = paste0("list(", paste("shift(", names(shift_v), ", n = ", shift_v, ", type = 'lead')", collapse = ", "), ")"))
  dt[, names(shift_v) := eval(shift_expr), with = F]
  dt[]
}

Some benchmarking follows.

library(zoo)
library(microbenchmark)

set.seed(1)
DT <- as.data.table(matrix(sample(c(0:9L, NA), 1e8, T, prob = c(rep(.01, 10), .9)), ncol = 1000))

zoo_foo <- function(df) {
  lst <- lapply(df, na.trim)
  df[] <- lapply(lst, `length<-`, max(lengths(lst)))
  df
}

my.na.trim <- function(x) {
  r <- rle(is.na(x))
  if (!r$value[1]) return(x)
  x[c(((r$length[1]+1):length(x)), 1:r$length[1])]
}

microbenchmark(dt_foo(copy(DT)), zoo_foo(DT),
  as.data.frame(lapply(DT, my.na.trim)), times = 10)

Unit: seconds
                                  expr      min       lq     mean   median       uq      max neval cld
                      dt_foo(copy(DT)) 1.468749 1.618289 1.690293 1.699926 1.725534 1.893018    10 a  
                           zoo_foo(DT) 6.493227 6.516247 6.834768 6.779045 7.190705 7.319058    10   c
 as.data.frame(lapply(DT, my.na.trim)) 4.988514 5.013340 5.384399 5.385273 5.508889 6.517748    10  b

Remove leading NAs to align data

4 Answers4

Linked