4

I am interested in finding the fastest way to convert a matrix into a long-format data.frame.

I here compare three solutions to achieve this but I wonder if faster methods, using for example data.table, exist.

Here is the reproducible code of the three methods I compared:

# Generate matrix -------------------------------------------------------------
set.seed(1)
ex <- matrix(data = round(runif(100000), 1), nrow = 1000, ncol = 100)
rownames(ex) <- paste0("row", 1:nrow(ex))
colnames(ex) <- paste0("col", 1:ncol(ex))
ex[1:5, 1:5]

     col1 col2 col3 col4 col5
row1  0.3  0.5  0.9  0.8  0.2
row2  0.4  0.7  1.0  0.5  0.5
row3  0.6  0.4  0.9  0.2  0.0
row4  0.9  1.0  0.4  0.4  0.5
row5  0.2  0.1  0.2  0.8  0.9

# table solution --------------------------------------------------------------
df1 <- as.data.frame(as.table(ex))

# reshape solution ------------------------------------------------------------
df2 <- reshape2::melt(ex)

# dplyr solution --------------------------------------------------------------
library(dplyr)
library(tidyr)
df3 <- ex %>%
  as.data.frame() %>%
  tibble::rownames_to_column("Var1") %>%
  gather("Var2", "value", -Var1)

# check for equality ----------------------------------------------------------
colnames(df1)[colnames(df1) == "Freq"] <- "value"
head(df1)
  Var1 Var2 value
1 row1 col1   0.3
2 row2 col1   0.4
3 row3 col1   0.6
4 row4 col1   0.9
5 row5 col1   0.2
6 row6 col1   0.9

df1$Var1 <- as.character(df1$Var1)
df1$Var2 <- as.character(df1$Var2)
df2$Var1 <- as.character(df2$Var1)
df2$Var2 <- as.character(df2$Var2)

identical(df1, df2); identical(df1, df3)
TRUE

# Microbenchmark --------------------------------------------------------------
library(microbenchmark)
comp <- microbenchmark(
  table = {
    df1 <- as.data.frame(as.table(ex))
  },

  reshape = {
    df2 <- reshape2::melt(ex)
  },

  dplyr = {
    df3 <- ex %>%
      as.data.frame() %>%
      tibble::rownames_to_column("Var1") %>%
      gather("Var2", "value", -Var1)
  }
)

library(ggplot2)
autoplot(comp)

enter image description here

The reshape approach is so far the fastest.

P. Denelle
  • 790
  • 10
  • 24
  • You can try `melt(as.data.table(ex)[, rn := seq_len(.N)], id.var = 'rn')` – akrun Feb 06 '20 at 19:07
  • Or `dt = melt(data.table(ex, keep.rownames = TRUE) , id.vars = c("rn"))` – Kipras Kančys Feb 06 '20 at 19:15
  • Or `data.frame(Var1 = rownames(ex), Var2 = colnames(ex), value = c(ex))` perhaps – markus Feb 06 '20 at 19:18
  • Related: [Convert a matrix with dimnames into a long format data.frame](https://stackoverflow.com/questions/10234734/convert-a-matrix-with-dimnames-into-a-long-format-data-frame) – Henrik Feb 21 '21 at 12:05

1 Answers1

4

So many options:

library(dplyr)
library(tidyr)
library(data.table)

library(microbenchmark)
library(ggplot2)

set.seed(1)
ex <- matrix(data = round(runif(100000), 1), nrow = 1000, ncol = 100)
rownames(ex) <- paste0("row", 1:nrow(ex))
colnames(ex) <- paste0("col", 1:ncol(ex))


comp <- microbenchmark(
    table = {
        df1 <- as.data.frame(as.table(ex))
    },

    reshape = {
        df2 <- reshape2::melt(ex)
    },

    dplyr = {
        df3 <- ex %>%
            as.data.frame() %>%
            tibble::rownames_to_column("Var1") %>%
            gather("Var2", "value", -Var1)
    },

    data.table = {
        dt = melt(data.table(ex, keep.rownames = TRUE) , id.vars = c("rn"))
    },

    data.table2 = {
        melt(as.data.table(ex)[, rn := seq_len(.N)], id.var = 'rn')
    },

    data.table3 = {
        data.table(Var1 = rownames(ex), Var2 = colnames(ex), value = c(ex))
    }

)

autoplot(comp)

enter image description here

Kipras Kančys
  • 1,617
  • 1
  • 15
  • 20
  • Thanks for the comparison. `data.table3` solution seems to give different results than the other solutions. – P. Denelle Feb 06 '20 at 23:16