Aggregate a data frame based on unordered three columns

Question

I would like to do the exact same thing as posted here: Aggregate a data frame based on unordered pairs of columns But with 3 columns instead of two.

   X1   X2    X3   count
   A    B     C      1
   B    A     C      1
   D    N     A      1
   A    D     N      1
   C    B     A      1
   N    D     A      1 
   A    D     N      1

Ideally I'd like to have the following:

  X1    X2     X3    count
  A     B      C      3
  A     D      N      4

I've tried adapting the code but I cannot make it work and haven't found a workaround neither with dplyr nor data.table.

require(data.table)
DT <- data.table(DF)
DT.OUT <- DT[, list(size=sum(count)), 
    by=list(id1 = pmin(X1,X2,X3),id2 = c(X1,X2,X3)[!c(c(X1,X2,X3) %in% c(pmax(X1,X2,X3), pmin(X1,X2,X3)))] ,id3 = pmax(X1,X2,X3))]

May I ask how big is your dataset? – chinsoon12 Jun 30 '20 at 22:35 — chinsoon12, Jun 30 '20 at 22:35
10,865 rows and the same columns as shown (X1,X2,X3,count) – adrian1121 Jun 30 '20 at 22:41 — adrian1121, Jun 30 '20 at 22:41

chinsoon12 · Answer 1 · 2020-06-30T23:30:33.130

Here is another option:

DT[, (cols) := {
        m <- as.matrix(.SD)
        as.data.table(matrix(m[order(row(m), m)], nrow=.N, byrow=TRUE))
    }, .SDcols=cols][, 
        sum(count), cols]

sample data and timing code:

library(dplyr)
library(tidyr)
library(data.table)

set.seed(0L)
nr <- 10865L
nc <- 3L
cols <- paste0("V", 1L:nc)
DT <- as.data.table(matrix(sample(LETTERS, nr*length(cols), TRUE), nrow=nr))[, count := 1L]
    
microbenchmark::microbenchmark(times=1L,
    a0 = aggregate(DT$count, as.data.frame(t(apply(DT[,1:3], 1, sort))), sum),
    a1 = {
        DT %>%
            rowwise() %>%
            mutate(grp = toString(sort(c(V1, V2, V3)))) %>%
            ungroup() %>%
            group_by(grp) %>%
            summarize(count = sum(count)) %>%
            separate(grp, into = c("V1", "V2", "V3"))
    },
    a2 = {
        unique(melt(DT[, rn := .I], id.var = c('rn', 'count'))[, 
            grp :=  toString(sort(unique(value))), rn], by = c("rn", "grp"))[, 
                .(count = sum(count)), grp]
    },
    a3 = {
        DT[, c(sort(.SD), list(count = count)), 1:nrow(DT), 
            .SDcols = V1:V3][, .(count = sum(count)), .(V1, V2, V3)]
    },
    a4 = DT[, (cols) := {
        m <- as.matrix(.SD)
        as.data.table(matrix(m[order(row(m), m)], nrow=.N, byrow=TRUE))
        }, .SDcols=cols][, 
            sum(count), cols]
)

timings:

Unit: milliseconds
 expr       min        lq      mean    median        uq       max neval
   a0  295.3281  295.3281  295.3281  295.3281  295.3281  295.3281     1
   a1  281.8369  281.8369  281.8369  281.8369  281.8369  281.8369     1
   a2  284.0112  284.0112  284.0112  284.0112  284.0112  284.0112     1
   a3 4131.7221 4131.7221 4131.7221 4131.7221 4131.7221 4131.7221     1
   a4   11.7304   11.7304   11.7304   11.7304   11.7304   11.7304     1

Nice approach with the use of `matrix` conversion. Would that have any impact on memory in big datasets? — akrun, Jul 01 '20 at 00:02
@akrun Interesting question. I also think that there is an impact because of the conversion to matrix — chinsoon12, Jul 01 '20 at 00:27

akrun · Answer 2 · 2020-06-30T23:28:24.657

An option is to melt with data.table

library(data.table)
unique(melt(DT[, rn := .I], id.var = c('rn', 'count'))[, 
  grp :=  toString(sort(unique(value))), rn], by = c("rn", "grp"))[, 
    .(count = sum(count)), grp]
#       grp count
#1: A, B, C     3
#2: A, D, N     4

Or another option is to loop over the rows to do the sort and then do a group by sum

DT[, c(sort(.SD), list(count = count)), 1:nrow(DT), 
     .SDcols = X1:X3][, .(count = sum(count)), .(X1, X2, X3)]
#   X1 X2 X3 count
#1:  A  B  C     3
#2:  A  D  N     4

d.b · Accepted Answer · 2020-07-01T00:01:04.073

Use apply to sort the first three columns rowwise and then use aggregate to obtain the sum of count

aggregate(d$count, as.data.frame(t(apply(d[1:3], 1, sort))), sum)
#  V1 V2 V3 x
#1  A  B  C 3
#2  A  D  N 4

OR

d %>%
    rowwise() %>%
    mutate(grp = toString(sort(c(X1, X2, X3)))) %>%
    ungroup() %>%
    group_by(grp) %>%
    summarize(count = sum(count)) %>%
    separate(grp, into = c("x1", "x2", "x3"))
## A tibble: 2 x 4
#  x1    x2    x3    count
#  <chr> <chr> <chr> <int>
#1 A     B     C         3
#2 A     D     N         4

OR

f = as.character(sort(unique(unlist(d[1:3]))))
d %>%
    rowwise() %>%
    mutate(grp = toString(f[f %in% c(X1, X2, X3)])) %>%
    ungroup() %>%
    group_by(grp) %>%
    summarize(count = sum(count)) %>%
    separate(grp, into = c("x1", "x2", "x3"))
## A tibble: 2 x 4
#  x1    x2    x3    count
#  <chr> <chr> <chr> <int>
#1 A     B     C         3
#2 A     D     N         4

Excellent, very elegant solutions! Thanks so much! – adrian1121 Jul 01 '20 at 08:05 — adrian1121, Jul 01 '20 at 08:05

det · Answer 4 · 2020-07-01T08:24:33.257

df[, letters := pmap_chr(list(X1, X2, X3), ~c(...) %>% sort() %>% str_c(collapse = ""))]
df[, .N, by = letters]

gives:

   letters N
1:     ABC 3
2:     ADN 4

If you want X1, X2 and X3 columns then u can use one of:

extract(l, letters, into = c("X1", "X2", "X3"), "(.)(.)(.)")
separate(l, letters, into = c("X1", "X2", "X3"), sep = 1:3)

where l is data.frame containing variable letters and N. Then result is:

   X1 X2 X3 N
1:  A  B  C 3
2:  A  D  N 4

It's easy to adapt this algorithm to more than 3 columns.

Aggregate a data frame based on unordered three columns

4 Answers4