
Suppose I have a 3-dimensional array g with dimensions [x, y, z]. reshape2::melt(g) produces a data frame with one column per dimension giving the x, y, z indices, plus a value column containing the corresponding entry of the array.

Given that reshape2 is superseded, is there a "one function" alternative to reshape2::melt in base R, or in a more actively maintained tidyverse package, that I'm missing?

reshape2 recommends tidyr instead, but I can't find a tidyr solution for multi-dimensional arrays. There is cubelyr, but it doesn't seem very active these days either.

I can write a custom solution; I'm just looking for something stable with the same easy functionality as reshape2::melt for this kind of problem.

library(reshape2)

g_as_array <- array(rnorm(27), dim = c(3, 3, 3)) # create a 3D array with one random value per entry

g_as_data_frame <- reshape2::melt(g_as_array) # melt down to "tidy" format

head(g_as_data_frame)
#>   Var1 Var2 Var3      value
#> 1    1    1    1  1.4092362
#> 2    2    1    1 -2.1606972
#> 3    3    1    1  0.4334404
#> 4    1    2    1  0.2390544
#> 5    2    2    1 -0.9673617
#> 6    3    2    1  0.5668378

Created on 2022-08-25 by the reprex package (v2.0.1)

DanO
    Base approach and data.table answers here: https://stackoverflow.com/questions/63311405/seeking-r-function-to-melt-5-dimensional-array-like-pivot-longer `as.data.frame(ftable(g_as_array))` or `data.table::as.data.table(g_as_array)` – Jon Spring Aug 25 '22 at 17:10
  • That's great, thank you. For the base approach would just need to convert factor labels back to numeric indices then e.g. `as.data.frame(ftable(g_as_array)) %>% dplyr::mutate(dplyr::across(dplyr::starts_with("Var"), as.numeric))` – DanO Aug 26 '22 at 17:02
  • `pivot_longer` and `pivot_wider` from tidyr seem to be the main alternatives these days. – Ted M. Sep 15 '22 at 19:59
  • True in general but `pivot_longer` and `pivot_wider` only work for two-dimensional table-like data, not multi-dimensional arrays, unless I'm missing some functionality there? – DanO Sep 20 '22 at 21:36
  • An ideal answer would preserve dimnames if they existed – Ben Bolker May 31 '23 at 20:46
  • `as.data.frame.table(g_as_array)` will give the results but using `LETTERS` instead of numbers – Onyambu Jun 01 '23 at 19:30

4 Answers

a <- array(1:27, dim = c(3,3,3))

library(reshape2)
DF1 <- melt(a)

DF2 <- data.frame(
  expand.grid(lapply(dim(a), seq_len)), # all index combinations, first index varying fastest
  value = as.vector(a)                  # array entries in column-major order, matching expand.grid
)

identical(DF1, DF2)
#[1] TRUE

If the array has dimension names:

a <- array(letters[1:27], dim = c(3, 3, 3), dimnames = list(letters[1:3],
                                                            letters[4:6],
                                                            letters[7:9]))

library(reshape2)
DF1 <- melt(a)
    
DF2 <- data.frame(
  expand.grid(dimnames(a)),
  value = as.vector(a)
)

identical(DF1, DF2)
#[1] TRUE

If not all dimensions have names, you would need to fill in the missing names first, e.g.:

Map(\(x, y) if (is.null(x)) seq_len(y) else x , dimnames(a), dim(a))
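
A minimal sketch of how that plugs into the expand.grid() construction above (assuming a has a dimnames list in which some components are NULL; dn and DF3 are just names chosen here):

dn <- Map(\(x, y) if (is.null(x)) seq_len(y) else x, dimnames(a), dim(a))
DF3 <- data.frame(expand.grid(dn), value = as.vector(a))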
Roland

An option would be to use arrayInd.

A <- array(1:8, c(2,2,2))

data.frame(arrayInd(seq_along(A), dim(A)), value = as.vector(A))
#  X1 X2 X3 value
#1  1  1  1     1
#2  2  1  1     2
#3  1  2  1     3
#4  2  2  1     4
#5  1  1  2     5
#6  2  1  2     6
#7  1  2  2     7
#8  2  2  2     8

Or, quite similar to @ThomasIsCoding's approach, using which:

data.frame(which(array(TRUE, dim(A)), arr.ind = TRUE), value = as.vector(A))
#  dim1 dim2 dim3 value
#1    1    1    1     1
#2    2    1    1     2
#3    1    2    1     3
#4    2    2    1     4
#5    1    1    2     5
#6    2    1    2     6
#7    1    2    2     7
#8    2    2    2     8

If the array has dimension names:

A <- array(1:8, c(2,2,2), list(X=c("a","b"), Y=c("c","d"), Z=c("e","f")))

i <- arrayInd(seq_along(A), dim(A), dimnames(A), TRUE)
data.frame(mapply(`[`, dimnames(A), asplit(i, 2)), value = as.vector(A))
#  X Y Z value
#1 a c e     1
#2 b c e     2
#3 a d e     3
#4 b d e     4
#5 a c f     5
#6 b c f     6
#7 a d f     7
#8 b d f     8

But this can also be achieved, as shown in the comments, with as.data.frame(ftable(A)) (@Jon Spring) or as.data.frame.table(A) (@Onyambu).
If you look at the source of as.data.frame.table, you see that it uses expand.grid (a quick way to check this is shown after the output below).

as.data.frame.table(A)    #@Onyambu.
#as.data.frame(ftable(A)) #@Jon Spring
#  X Y Z Freq
#1 a c e    1
#2 b c e    2
#3 a d e    3
#4 b d e    4
#5 a c f    5
#6 b c f    6
#7 a d f    7
#8 b d f    8
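
To see this for yourself, you can print the registered S3 method with getS3method() from the utils package (just an inspection aid, not part of the solution):

getS3method("as.data.frame", "table")
# the printed body builds the index columns via expand.grid() on the
# array's (provided) dimnames before attaching the Freq column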

But if numeric indices are wanted, this can be used:

sapply(as.data.frame.table(A), unclass)
#     X Y Z Freq
#[1,] 1 1 1    1
#[2,] 2 1 1    2
#[3,] 1 2 1    3
#[4,] 2 2 1    4
#[5,] 1 1 2    5
#[6,] 2 1 2    6
#[7,] 1 2 2    7
#[8,] 2 2 2    8

Or, more robustly and returning a data.frame:

tt <- as.data.frame.table(A)
tt[-length(tt)] <- lapply(tt[-length(tt)], unclass)
tt
#  Var1 Var2 Var3 Freq
#1    1    1    1    1
#2    2    1    1    2
#3    1    2    1    3
#4    2    2    1    4
#5    1    1    2    5
#6    2    1    2    6
#7    1    2    2    7
#8    2    2    2    8

#or
list2DF(lapply(as.data.frame.table(A), unclass))

Or a variant (thanks to @Onyambu for the hint):

type.convert(as.data.frame.table(`dimnames<-`(A, NULL),
             base = list(as.character(seq_len(max(dim(A)))))), as.is = TRUE)
#  Var1 Var2 Var3 Freq
#1    1    1    1    1
#2    2    1    1    2
#3    1    2    1    3
#4    2    2    1    4
#5    1    1    2    5
#6    2    1    2    6
#7    1    2    2    7
#8    2    2    2    8

Another option is to calculate the indices "by hand" with %% and %/% (a worked check of the arithmetic follows the code below).

cbind(1 + mapply(`%%`,
    Reduce(`%/%`, dim(A)[-length(dim(A))], 0:(length(A)-1), accumulate = TRUE),
    dim(A)), Value=as.vector(A))
#           Value
#[1,] 1 1 1     1
#[2,] 2 1 1     2
#[3,] 1 2 1     3
#[4,] 2 2 1     4
#[5,] 1 1 2     5
#[6,] 2 1 2     6
#[7,] 1 2 2     7
#[8,] 2 2 2     8

#Alternative
. <- 0:(length(A)-1)
cbind(1 +
    t(t(cbind(., outer(., cumprod(dim(A)[-length(dim(A))]), `%/%`))) %% dim(A)),
    Value=A)
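
As a quick worked check of that arithmetic (my own illustration, reusing the 2 x 2 x 2 array A from above): for a 0-based flat index k, the index along a dimension is k integer-divided by the product of the preceding dimensions, taken modulo that dimension's length, plus 1.

k <- 6  # 0-based flat index of the 7th element, i.e. A[1, 2, 2]
c((k %/% 1) %% 2, (k %/% 2) %% 2, (k %/% 4) %% 2) + 1
#[1] 1 2 2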

Or using rep:

list2DF(c(Map(\(i, j, n) rep(rep(1:i, each=j), length.out=n),
    dim(A),
    c(1, cumprod(dim(A)[-length(dim(A))])),
    length(A)), Value=list(as.vector(A))))
#        Value
#1 1 1 1     1
#2 2 1 1     2
#3 1 2 1     3
#4 2 2 1     4
#5 1 1 2     5
#6 2 1 2     6
#7 1 2 2     7
#8 2 2 2     8

Or basically the same, but keeping names and making use of data.frame's automatic recycling.

d <- setNames(dim(A), names(dimnames(A)))
do.call(data.frame, c(
  Map(\(i,j) rep(1:i, each=j), d, c(1, cumprod(d[-length(d)]))),
  Value=list(as.vector(A) ), fix.empty.names = FALSE) )
#  X Y Z Value
#1 1 1 1     1
#2 2 1 1     2
#3 1 2 1     3
#4 2 2 1     4
#5 1 1 2     5
#6 2 1 2     6
#7 1 2 2     7
#8 2 2 2     8

Benchmark

A <- array(0, c(1e5, 12, 30), list(T=NULL, Month=NULL, Year=NULL))

bench::mark(check=FALSE,
reshape2 = reshape2::melt(A),
expand.grid = {data.frame(  #@Roland
  expand.grid(lapply(dim(A), seq_len)),
  value = as.vector(A)) },
data.frame.table = {tt <- as.data.frame.table(A)
  tt[-length(tt)] <- lapply(tt[-length(tt)], unclass)
  tt},
rep = {d <- setNames(dim(A), names(dimnames(A)))
do.call(data.frame, c(
  Map(\(i,j) rep(1:i, each=j), d, c(1, cumprod(d[-length(d)]))),
  Value=list(as.vector(A) ), fix.empty.names = FALSE) )} )
#  expression            min   median `itr/sec` mem_alloc `gc/sec` n_itr  n_gc
#  <bch:expr>       <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl> <int> <dbl>
#1 reshape2            812ms    812ms      1.23    1.21GB     1.23     1     1
#2 expand.grid         733ms    733ms      1.36    1.21GB     2.73     1     2
#3 data.frame.table    605ms    605ms      1.65    1.23GB     3.31     1     2
#4 rep                 293ms    331ms      3.02  691.99MB     1.51     2     1

In this case the rep variant is the fastest and allocates the least memory.
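
If this is needed repeatedly, the rep-based construction can be wrapped in a small helper. A minimal sketch (the name array_melt, its value.name argument and the Var1/Var2/... fallback names are made up here for illustration, not an existing API):

array_melt <- function(A, value.name = "Value") {
  d <- dim(A)
  nm <- names(dimnames(A))
  if (is.null(nm)) nm <- paste0("Var", seq_along(d))  # fallback column names
  idx <- Map(\(i, j) rep(seq_len(i), each = j), d, c(1, cumprod(d[-length(d)])))
  names(idx) <- nm
  # shorter index columns are recycled by data.frame() to the length of the array
  do.call(data.frame, c(idx, setNames(list(as.vector(A)), value.name)))
}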

GKi
  • also `data.frame(arrayInd(seq_along(A), dim(A)), value = c(A))` – Onyambu Jun 02 '23 at 04:21
  • Thanks! I had similar but instead of `c` with `as.vector`(see comment Roland in his answer) - see history of edits. But `arrayInd` "wants" a logical vector so I provide one. – GKi Jun 02 '23 at 04:25
  • Why would you say `arrayInd` wants a logical vector? `arrayInd` takes in an Integer valued vector. Not a logical vector. Please read the help page for array ind. – Onyambu Jun 02 '23 at 05:18
  • Yes you are right. Thanks! I have changed it back. – GKi Jun 02 '23 at 10:50
  • `sapply...unclass` is quite risky, I would suggest `type.convert(as.data.frame.table(A, base = list(as.character(1:20))), as.is = TRUE)` – Onyambu Jun 02 '23 at 19:56

Here are some base R alternatives using the which trick, which should work for general arrays (numeric and character):

  1. which(1^is.na(g) > 0, arr.ind = TRUE)
     cbind(as.data.frame(which(1^is.na(g) > 0, arr.ind = TRUE)), value = c(g))
  2. which(TRUE | is.na(g), arr.ind = TRUE)
     cbind(as.data.frame(which(TRUE | is.na(g), arr.ind = TRUE)), value = c(g))
  3. nchar(g, "width") > -1
     cbind(as.data.frame(which(nchar(g, "width") > -1, arr.ind = TRUE)), value = c(g))

and we will obtain

   dim1 dim2 dim3 value
1     1    1    1     a
2     2    1    1     b
3     3    1    1     c
4     1    2    1     d
5     2    2    1     e
6     3    2    1     f
7     1    3    1     g
8     2    3    1     h
9     3    3    1     i
10    1    1    2     j
11    2    1    2     k
12    3    1    2     l
13    1    2    2     m
14    2    2    2     n
15    3    2    2     o
16    1    3    2     p
17    2    3    2     q
18    3    3    2     r
19    1    1    3     s
20    2    1    3     t
21    3    1    3     u
22    1    2    3     v
23    2    2    3     w
24    3    2    3     x
25    1    3    3     y
26    2    3    3     z
27    3    3    3  <NA>

Dummy Data

> (g <- array(letters[1:27], dim = c(3, 3, 3)))
, , 1

     [,1] [,2] [,3]
[1,] "a"  "d"  "g"
[2,] "b"  "e"  "h"
[3,] "c"  "f"  "i"

, , 2

     [,1] [,2] [,3]
[1,] "j"  "m"  "p"
[2,] "k"  "n"  "q"
[3,] "l"  "o"  "r"

, , 3

     [,1] [,2] [,3]
[1,] "s"  "v"  "y"
[2,] "t"  "w"  "z"
[3,] "u"  "x"  NA
ThomasIsCoding

Benchmarking, Just for Fun

Here are some benchmarking observations for arrays of different dimensions (dimension names are ignored for simplicity), taking several of the existing solutions to the posted question into account.

Disclaimer: I DON'T declare any single approach the "best"; anyone who needs this kind of functionality (indexing multi-dimensional arrays), not just the OP, is free to decide which one best suits their purpose.


Below is the benchmarking function, parameterized by the dimensions of a random test array:

library(microbenchmark)
library(data.table)

fbench <- function(dims) {
    # dummy data for test
    set.seed(0)
    g <- array(sample(prod(dims)), dim = dims)

    # list of approaches
    expgrd <- function() {
        data.frame(expand.grid(lapply(dim(g), seq_len)), value = as.vector(g))
    }

    arrind <- function() {
        data.frame(arrayInd(seq_along(g), dim(g)), value = as.vector(g))
    }

    which0 <- function() {
        data.frame(which(array(TRUE, dim(g)), arr.ind = TRUE), value = as.vector(g))
    }

    which1 <- function() {
        cbind(as.data.frame(which(1^is.na(g) > 0, arr.ind = TRUE)), value = c(g))
    }

    which2 <- function() {
        cbind(as.data.frame(which(TRUE | is.na(g), arr.ind = TRUE)), value = c(g))
    }

    which3 <- function() {
        cbind(as.data.frame(which(nchar(g, "width") > -1, arr.ind = TRUE)), value = c(g))
    }

    dftable0 <- function() {
        list2DF(lapply(as.data.frame.table(g), unclass))
    }

    dftable1 <- function() {
        list2DF(lapply(as.data.frame(ftable(g)), unclass))
    }

    dttable <- function() {
        as.data.table(g, sorted = FALSE, na.rm = FALSE)
    }

    rem0 <- function() {
        as.data.frame(cbind(1 + mapply(
            `%%`,
            Reduce(`%/%`, dim(g)[-length(dim(g))], 0:(length(g) - 1), accumulate = TRUE),
            dim(g)
        ), Value = as.vector(g)))
    }

    rem1 <- function() {
        . <- 0:(length(g) - 1)
        as.data.frame(cbind(
            1 +
                t(t(cbind(., outer(., cumprod(dim(g)[-length(dim(g))]), `%/%`))) %% dim(g)),
            Value = g
        ))
    }

    reprep <- function() {
        list2DF(c(Map(
            \(i, j, n) rep(rep(1:i, each = j), length.out = n),
            dim(g),
            c(1, cumprod(dim(g)[-length(dim(g))])),
            length(g)
        ), Value = list(as.vector(g))))
    }

    # benchmarking module
    mbm <- microbenchmark(
        expgrd(),
        arrind(),
        which0(),
        which1(),
        which2(),
        which3(),
        dftable0(),
        dftable1(),
        dttable(),
        rem0(),
        rem1(),
        reprep(),
        times = 50L,
        check = "equivalent"
    )

    boxplot(mbm, main = sprintf("dim = [%s]", toString(dims)), las = 2)
}

  1. For dims <- rep(5, 3), we run fbench(dims) and obtain a boxplot of the timings (image not reproduced here).

  2. For dims <- rep(5, 4), we run fbench(dims) and obtain a boxplot of the timings (image not reproduced here).

  3. For dims <- rep(5, 5), we run fbench(dims) and obtain a boxplot of the timings (image not reproduced here).

  4. For dims <- rep(5, 6), we run fbench(dims) and obtain a boxplot of the timings (image not reproduced here).

  5. For dims <- rep(5, 7), we run fbench(dims) and obtain a boxplot of the timings (image not reproduced here).

  6. For dims <- rep(5, 8), we run fbench(dims) and obtain a boxplot of the timings (image not reproduced here).
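
To reproduce the whole sweep in one go, those calls can simply be looped (a minimal sketch):

for (k in 3:8) fbench(rep(5, k))  # one boxplot per array size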

ThomasIsCoding
  • My takeaway from this would be "`dftable0` is usually pretty good"... – Ben Bolker Jun 04 '23 at 21:05
  • @BenBolker well...I would say that the size of array matters to the performance, where `dftable0` is always the middle-class regardless of the size :) – ThomasIsCoding Jun 04 '23 at 21:15
  • I guess (1) I don't expect this component to be a significant performance bottleneck (2) `dftable0` never seems to be *terrible* and (3) I prefer the `list2DF` solutions on aesthetic grounds ... – Ben Bolker Jun 04 '23 at 21:20
  • Maybe you can add `reshape2::melt` and the variants using `%%` and `%/%` or `rep`? – GKi Jun 05 '23 at 04:31
  • @BenBolker Yes, that's fair enough :) – ThomasIsCoding Jun 05 '23 at 07:37
  • @GKi yes, added. Interesting that `rep` has such a strong performance! Cool! – ThomasIsCoding Jun 05 '23 at 08:17
  • Thanks! Performance will change by size and might not be that important. But anyway nice comparison. – GKi Jun 05 '23 at 08:50
  • Now I'm thinking about adding the `rep`-based solution to `gtools` ... – Ben Bolker Jun 05 '23 at 14:09
  • @BenBolker Fine to read that my code will maybe be used in `gtools`! – GKi Jun 05 '23 at 16:13
  • Hmm. I thought these would all preserve dimnames but apparently `reprep` doesn't ... ?? – Ben Bolker Jun 06 '23 at 17:14
  • @BenBolker Nope. none of the approaches for this benchmark will preserve the dimension names. – ThomasIsCoding Jun 06 '23 at 20:46
  • I think that's not true - `a <- array(1:8, dim = c(2,2,2), dimnames=list(d1 = letters[1:2], d2 = LETTERS[1:2], d3 = c("x", "y"))); as.data.frame.table(a)` is built-in, is never worst, and preserves dimnames and dimname-names ... – Ben Bolker Jun 06 '23 at 20:53
  • 1
    @BenBolker Yes, you are right on that point. However, in this benchmark, I enforce all approaches to have an uniform output for the fair comparison, i.e., integer indexing manner, instead of dimension names. That's why `as.data.frame.table(a)` is followed by `unclass` in my benchmarking script. – ThomasIsCoding Jun 06 '23 at 21:10