1

I have a large list of 30000+ elements. They are vectors of different lengths, and I want to convert the list into a data frame in which each vector is one row and its values are spread across multiple columns. Here is a mock example of the list:

# Mock input: three numeric vectors of lengths 5, 3 and 2 (the name "c" is used twice).
lst <- list(a = c(1,2,4,5,6), c = c(7,8,9), c = c(10,11))

My desired output looks like this:

#  [,1] [,2] [,3] [,4] [,5]
#a    1    2    4    5    6
#b    7    8    9   NA   NA
#c   10   11   NA   NA   NA
Vojtěch Kania
  • 143
  • 1
  • 9
  • Nice to see a short and to the point reproducible example with desired output! – asachet Oct 16 '19 at 14:18
  • I feel like you really want a matrix rather than a data.frame - hard to tell for sure without knowing more about what you're doing, but keep in mind that, even in R, tabular data does not have to be in a data frame if it is not column-oriented. – asachet Oct 16 '19 at 14:21
  • I added timings – slava-kohut Oct 16 '19 at 14:35

4 Answers

2

You could do:

# Pad every vector with NA up to the length of the longest one, build a data
# frame with one column per vector, then transpose so each vector is a row.
longest <- max(lengths(lst))
padded <- lapply(lst, `length<-`, longest)
t(as.data.frame(padded))

#    [,1] [,2] [,3] [,4] [,5]
#a      1    2    4    5    6
#c      7    8    9   NA   NA
#c.1   10   11   NA   NA   NA

Or as @Andrew pointed out, you can do:

# Pad every vector with NA up to the length of the longest one; sapply() binds
# the padded vectors column-wise, and t() turns each vector into a row.
max_len <- max(lengths(lst))
padded <- sapply(lst, "length<-", max_len)
# When every vector has length 1, sapply() simplifies to a plain vector rather
# than a 1-row matrix; rebuild that matrix so t() still returns one row per
# list element instead of a single row.
if (max_len == 1L && !is.matrix(padded)) {
  padded <- matrix(padded, nrow = 1L, dimnames = list(NULL, names(padded)))
}
t(padded)

#  [,1] [,2] [,3] [,4] [,5]
#a    1    2    4    5    6
#c    7    8    9   NA   NA
#c   10   11   NA   NA   NA
Matt
  • 2,947
  • 1
  • 9
  • 21
  • No need to wrap in as.data.frame if you are transposing it after. You can use `sapply` considering `t` will convert to a matrix anyway. – Andrew Oct 16 '19 at 14:25
  • returns error: `Error in (function (..., row.names = NULL, check.rows = FALSE, check.names = TRUE, : arguments imply differing number of rows: 1, 0` – PM0087 Apr 22 '20 at 15:09
2

Here is one base R option:

# Create a vector for number of times an NA needs to be padded
# Number of NAs each vector needs to reach the length of the longest vector
na_nums <- max(lengths(lst)) - lengths(lst)

# Append the NA padding to each vector, then stack the padded vectors as rows.
# lapply() and SIMPLIFY = FALSE keep the intermediate results as lists, so the
# shape of the result does not depend on how sapply()/mapply() would simplify
# them (e.g. when the longest vector has length 1, mapply() would otherwise
# return a plain vector and the result would have a single row).
padded <- mapply(c, lst, lapply(na_nums, rep, x = NA), SIMPLIFY = FALSE)
do.call(rbind, padded)
  [,1] [,2] [,3] [,4] [,5]
a    1    2    4    5    6
c    7    8    9   NA   NA
c   10   11   NA   NA   NA
Andrew
  • 5,028
  • 2
  • 11
  • 21
2

This was my first impulse.

# Length of the longest vector in the list
max_len <- max(vapply(lst, length, numeric(1)))

# Right-pad each vector with NA up to max_len (this overwrites lst)
lst <- lapply(lst, function(v) {
  n_missing <- max_len - length(v)
  c(v, rep(NA, n_missing))
})

# Form a matrix: one row per padded vector
do.call(rbind, lst)

It's a bit verbose, and some of the other solutions are rather elegant. Since you say your list is in excess of 30,000 elements, I was curious how these would perform on a list of length 30,000.

If this is something you need to do often, you may want to adopt andrew's approach.

lst <- list(a = c(1,2,4,5,6), c = c(7,8,9), c = c(10,11))
# build out a list of 30,000 elements.
lst <- lst[sample(1:3, 30000, replace = TRUE)]

library(microbenchmark)
# Each named expression below is one of the approaches from the answers.
# times = 5 matches the neval = 5 reported in the timing table; the default of
# 100 runs would take very long for the Reduce-based approach.
microbenchmark(
  benjamin = {
    max_len <- max(vapply(lst, 
                          FUN = length, 
                          FUN.VALUE = numeric(1)))

    # Store the padded list under a new name: microbenchmark evaluates the
    # expressions in the calling environment, so assigning to `lst` here would
    # replace the input that every other expression (and later runs) is timed on.
    padded <- lapply(lst, 
                     function(x, max_len) c(x, rep(NA, max_len - length(x))), 
                     max_len)

    # Form a matrix
    do.call("rbind", padded)
  }, 
  slava = {
    Reduce(function(x,y){
      n <- max(length(x), length(y))
      length(x) <- n
      length(y) <- n
      rbind(x,y,deparse.level = 0)
    },
    lst)
  }, 
  andrew = {
    na_nums <- max(lengths(lst)) - lengths(lst)

    # Transpose results after padding NAs using mapply
    t(mapply(c, lst, sapply(na_nums, rep, x = NA)))
  }, 
  matt = {
    t(as.data.frame(lapply(lst, "length<-", max(lengths(lst)))))
  },
  times = 5
)

Unit: milliseconds
     expr         min          lq       mean      median          uq        max neval
 benjamin    77.08337    91.42793   117.9376   106.97656   122.53898   191.6612     5
    slava 32383.10840 32962.57589 32976.6662 33071.40314 33180.70634 33285.5372     5
   andrew    60.91803    66.74401    87.1645    71.92043    77.78805   158.4520     5
     matt  1685.09158  1702.19796  1759.2741  1737.01949  1760.86237  1911.1993     5
Benjamin
  • 16,897
  • 6
  • 45
  • 65
1

The trick is to make the vectors equal in length. Also, it seems like you want to have a matrix as output.

lst <- list(a = c(1,2,4,5,6), c = c(7,8,9), c = c(10,11))

# Pad every vector with NA to the length of the longest one, then bind all of
# them as rows in a single rbind() call. Padding to the overall maximum (rather
# than pairwise inside Reduce()) keeps the result correct when a longer vector
# appears after shorter ones, and avoids re-copying the growing matrix on each
# step. unname() drops the list names so the rows are unnamed.
max_len <- max(lengths(lst))
mat <- do.call(rbind, lapply(unname(lst), "length<-", max_len))
mat

Output

#      [,1] [,2] [,3] [,4] [,5]
# [1,]    1    2    4    5    6
# [2,]    7    8    9   NA   NA
# [3,]   10   11   NA   NA   NA

You can reset the row names at this point.

UPDATE: Timings for those who are interested:

# Small three-vector example list passed to the timed functions below.
lst <- list(a = c(1,2,4,5,6), c = c(7,8,9), c = c(10,11))

#' Convert a list of vectors into a matrix with one row per vector.
#'
#' Every vector is padded with NA up to the length of the longest vector and
#' the padded vectors are bound as rows in a single rbind() call. Padding to
#' the overall maximum (instead of pairwise inside Reduce()) keeps the result
#' correct when a longer vector follows shorter ones, and avoids rebuilding the
#' growing matrix once per element.
#'
#' @param lst A list of atomic vectors, possibly of different lengths.
#' @return A matrix with length(lst) rows and max(lengths(lst)) columns and no
#'   row names (a single-element list gives a 1-row matrix); NULL when lst is
#'   empty.
convert <- function(lst) {
  if (length(lst) == 0L) {
    return(NULL)
  }
  max_len <- max(lengths(lst))
  # unname() so that rbind() does not turn the list names into row names.
  padded <- lapply(unname(lst), "length<-", max_len)
  do.call(rbind, padded)
}

#' Convert a list of vectors into a matrix with one row per vector (sapply).
#'
#' Every vector is padded with NA up to the length of the longest vector;
#' sapply() binds the padded vectors column-wise and t() flips the result so
#' that each vector becomes a row.
#'
#' @param lst A list of atomic vectors, possibly of different lengths.
#' @return A matrix with length(lst) rows and max(lengths(lst)) columns; row
#'   names are the names of lst, if any.
convert2 <- function(lst) {
  max_len <- max(lengths(lst))
  padded <- sapply(lst, "length<-", max_len)
  # When every vector has length 1, sapply() simplifies to a plain vector
  # instead of a 1-row matrix; rebuild that matrix so t() still returns one
  # row per list element rather than a single row.
  if (max_len == 1L && !is.matrix(padded)) {
    padded <- matrix(padded, nrow = 1L, dimnames = list(NULL, names(padded)))
  }
  t(padded)
}

#' Convert a list of vectors into a matrix with one row per vector
#' (data-frame route).
#'
#' Each vector is padded with NA to the length of the longest one, the padded
#' list is turned into a data frame (one column per vector) and the data frame
#' is transposed.
#'
#' @param lst A list of atomic vectors, possibly of different lengths.
#' @return The transposed data frame as a matrix, one row per element of lst.
convert3 <- function(lst) {
  target_len <- max(lengths(lst))
  padded <- lapply(lst, `length<-`, target_len)
  as_columns <- as.data.frame(padded)
  t(as_columns)
}

# Time the three implementations on the small example list `lst`.
microbenchmark::microbenchmark(convert(lst),
                               convert2(lst),
                               convert3(lst))

#Unit: microseconds
#          expr     min       lq      mean   median      uq      max neval
#  convert(lst)  41.962  50.0725 106.47314  62.2375  68.408 4392.895   100
# convert2(lst)  28.209  33.6755  69.93855  40.7280  45.136 2298.002   100
# convert3(lst) 292.673 306.6005 381.59504 319.1180 333.399 2887.929   100
slava-kohut
  • 4,203
  • 1
  • 7
  • 24