0

I have a data frame of term frequencies and some other random demographic variables. I want to utilize two grouping variables, drop the ones I do not need, and sum the frequencies based on the grouping variables.

Here is similar to what I have

df <- data.frame(user= c(1:9),
                 Group1 = c("a", "a", "a", "b", "b","b","c", "c", "c"),
                 Group2 = c("d", "e", "d", "e", "d", "e", "e", "e", "e"),
                 term1 = c(0, 1, 1, 0, 1, 1, 0, 0, 0),
                 term2 = c(1, 0, 1, 1, 0, 1, 0, 1, 1),
                 term3 = c(0, 1, 0, 0, 0, 0, 1, 1, 0))

and here is what I am trying to get.

desired <- data.frame(Group1 = c("a", "a", "b", "b", "c", "c"),
                      Group2 = c("d", "e", "d", "e", "d", "e"),
                      term1 = c(1, 1, 1, 1, 0, 0),
                      term2 = c(2, 0, 0, 2, 0, 2),
                      term3 = c(0, 1, 0, 0, 0, 2))

My real frame has about 4000 term columns, so naming each one individual in a dplyr function does not seem feasible.

Thank you!

OKLM
  • 85
  • 7
  • Related - [Aggregate / summarize multiple variables per group (e.g. sum, mean)](https://stackoverflow.com/questions/9723208/aggregate-summarize-multiple-variables-per-group-e-g-sum-mean) – Ronak Shah Jun 04 '21 at 03:49

3 Answers3

2

You can try aggregate + expand.grid + merge

merge(
  with(df, expand.grid(Group1 = unique(Group1), Group2 = unique(Group2))),
  aggregate(. ~ Group1 + Group2, df[-1], sum),
  all = TRUE
)

which gives

  Group1 Group2 term1 term2 term3
1      a      d     1     2     0
2      a      e     1     0     1
3      b      d     1     0     0
4      b      e     1     2     0
5      c      d    NA    NA    NA
6      c      e     0     2     2

If you want to have NAs as 0, you can try

> res <- merge(
  with(df, expand.grid(Group1 = unique(Group1), Group2 = unique(Group2))),
  aggregate(. ~ Group1 + Group2, df[-1], sum),
  all = TRUE
)

> replace(res, is.na(res), 0)
  Group1 Group2 term1 term2 term3
1      a      d     1     2     0
2      a      e     1     0     1
3      b      d     1     0     0
4      b      e     1     2     0
5      c      d     0     0     0
6      c      e     0     2     2
ThomasIsCoding
  • 96,636
  • 9
  • 24
  • 81
2

We can group by 'Group1, 'Group2', get the sum of 'term' columns in summarise and expand the data with complete for the missing combinations

library(dplyr)
library(tidyr)
df %>%
     group_by(Group1, Group2) %>% 
     summarise(across(starts_with('term'), sum), .groups = 'drop') %>%
     complete(Group1, Group2, fill = list(term1 = 0, term2 = 0, term3 = 0))

-output

# A tibble: 6 x 5
  Group1 Group2 term1 term2 term3
  <chr>  <chr>  <dbl> <dbl> <dbl>
1 a      d          1     2     0
2 a      e          1     0     1
3 b      d          1     0     0
4 b      e          1     2     0
5 c      d          0     0     0
6 c      e          0     2     2
akrun
  • 874,273
  • 37
  • 540
  • 662
1

If you don't need to compete all varible, setDT(df)[,lapply(.SD[,-1], sum),.(Group1,Group2)] is enough. Otherwise, you can use complete in package tidyr (as used in the first answer) to fill the lacking varible.

library(data.table)
library(tidyr)

setDT(df)[,lapply(.SD[,-1], sum),.(Group1,Group2)] %>%
    complete(Group1, Group2, fill = list(term1 = 0, term2 = 0, term3 = 0))
#> # A tibble: 6 x 5
#>   Group1 Group2 term1 term2 term3
#>   <chr>  <chr>  <dbl> <dbl> <dbl>
#> 1 a      d          1     2     0
#> 2 a      e          1     0     1
#> 3 b      d          1     0     0
#> 4 b      e          1     2     0
#> 5 c      d          0     0     0
#> 6 c      e          0     2     2
Peace Wang
  • 2,399
  • 1
  • 8
  • 15