13

I have a large data.frame with 'staggered' data and would like to align it. What I mean is I would like to take something like

enter image description here

and remove the leading (top) NAs from all columns to get

enter image description here

I know about the na.trim function from the zoo package, but this didn't work on either the initial data.frame presented above or its transpose. For this I used, with transposed dataframe t.df,

t.df <- na.trim(t.df, sides = 'left')

This only returned an empty data.frame, and wouldn't work the way I wanted anyway since it would create vectors of different lengths. Can anyone point me to a package or function that might be more helpful?

Here is the code for my example used above:

# example of what I have

var1 <- c(1,2,3,4,5,6,7,8,9,10)
var2 <- c(6,2,4,7,3,NA,NA,NA,NA,NA)
var3 <- c(NA,NA,8,6,3,7,NA,NA,NA,NA)
var4 <- c(NA,NA,NA,NA,5,NA,2,6,2,9)

df <- data.frame(var1, var2, var3, var4)


# transpose and (unsuccessful) attempt to remove leading NAs

t.df <- t(df)

t.df <-  na.trim(t.df, sides = 'left')
zx8754
  • 52,746
  • 12
  • 114
  • 209
ndem763
  • 320
  • 1
  • 11

4 Answers4

11

We can loop over the columns (lapply(..) and apply na.trim. Then, pad NAs at the end of the each of the list elements by assigning length as the maximum length from the list elements.

library(zoo)
lst <- lapply(df, na.trim)
df[] <- lapply(lst, `length<-`, max(lengths(lst)))
df
#   var1 var2 var3 var4
#1     1    6    8    5
#2     2    2    6   NA
##     3    4    3    2
#4     4    7    7    6
#5     5    3   NA    2
#6     6   NA   NA    9
#7     7   NA   NA   NA
#8     8   NA   NA   NA
#9     9   NA   NA   NA
#10   10   NA   NA   NA

Or as @G.Grothendieck mentioned in the comments

replace(df, TRUE, do.call("merge", lapply(lst, zoo)))
akrun
  • 874,273
  • 37
  • 540
  • 662
4

You can do with base functions:

my.na.trim <- function(x) {
  r <- rle(is.na(x))
  if (!r$value[1]) return(x)
  x[c(((r$length[1]+1):length(x)), 1:r$length[1])]
}

df[,] <- lapply(df, my.na.trim)
df
#    var1 var2 var3 var4
# 1     1    6    8    5
# 2     2    2    6   NA
# 3     3    4    3    2
# 4     4    7    7    6
# 5     5    3   NA    2
# 6     6   NA   NA    9
# 7     7   NA   NA   NA
# 8     8   NA   NA   NA
# 9     9   NA   NA   NA
# 10   10   NA   NA   NA

alternative coding for the function:

my.na.trim <- function(x) {
  r <- rle(is.na(x))
  if (!r$value[1]) return(x)
  r1 <- r$length[1]
  c(tail(x, -r1), head(x, r1))
}
jogo
  • 12,469
  • 11
  • 37
  • 42
3

We can use the cbind.na() function from the qpcR package and combine it with the na.trim() function from the zoo package:

do.call(qpcR:::cbind.na, lapply(df, zoo::na.trim))
#      var1 var2 var3 var4
# [1,]    1    6    8    5
# [2,]    2    2    6   NA
# [3,]    3    4    3    2
# [4,]    4    7    7    6
# [5,]    5    3   NA    2
# [6,]    6   NA   NA    9
# [7,]    7   NA   NA   NA
# [8,]    8   NA   NA   NA
# [9,]    9   NA   NA   NA
#[10,]   10   NA   NA   NA
RHertel
  • 23,412
  • 5
  • 38
  • 64
3

If speed is a matter you can use this data.table solution.

library(data.table)

dt_foo <- function(dt) {
  shift_v <- sapply(dt, function(col) min(which(+(is.na(col)) == 0))-1)
  shift_expr <- parse(text = paste0("list(", paste("shift(", names(shift_v), ", n = ", shift_v, ", type = 'lead')", collapse = ", "), ")"))
  dt[, names(shift_v) := eval(shift_expr), with = F]
  dt[]
}

Some benchmarking follows.

library(zoo)
library(microbenchmark)

set.seed(1)
DT <- as.data.table(matrix(sample(c(0:9L, NA), 1e8, T, prob = c(rep(.01, 10), .9)), ncol = 1000))

zoo_foo <- function(df) {
  lst <- lapply(df, na.trim)
  df[] <- lapply(lst, `length<-`, max(lengths(lst)))
  df
}

my.na.trim <- function(x) {
  r <- rle(is.na(x))
  if (!r$value[1]) return(x)
  x[c(((r$length[1]+1):length(x)), 1:r$length[1])]
}

microbenchmark(dt_foo(copy(DT)), zoo_foo(DT),
  as.data.frame(lapply(DT, my.na.trim)), times = 10)

Unit: seconds
                                  expr      min       lq     mean   median       uq      max neval cld
                      dt_foo(copy(DT)) 1.468749 1.618289 1.690293 1.699926 1.725534 1.893018    10 a  
                           zoo_foo(DT) 6.493227 6.516247 6.834768 6.779045 7.190705 7.319058    10   c
 as.data.frame(lapply(DT, my.na.trim)) 4.988514 5.013340 5.384399 5.385273 5.508889 6.517748    10  b 
inscaven
  • 2,514
  • 19
  • 29