3

I would like to assign the columns to a specific group, Cluster1, Cluster2 or Cluster3, depending on the highest values for the columns. Suggestion on functions (dplyr?) to look into is appreciated.

  Group                Sample1         Sample2        Sample3
1 Cluster1             0.1             0               0.1
2 Cluster2             0.4             0.3             0.01
3 Cluster3             0               0.9             0.92

Expected output

Sample1 Cluster2
Sample2 Cluster3
Sample3 Cluster3
df <- structure(list(Group = c("Cluster1", "Cluster2", "Cluster3"), 
  Sample1 = c(0.1, 0.4, 0), Sample2 = c(0, 0.3, 0.9), Sample3 = c(0.1, 
  0.01, 0.92)), class = "data.frame", row.names = c("1", "2", "3"))
Darren Tsai
  • 32,117
  • 5
  • 21
  • 51
user2300940
  • 2,355
  • 1
  • 22
  • 35

5 Answers5

3

With max.col + t, we can create a data.frame like

data.frame(
    Sample = names(df)[-1],
    Group = df$Group[max.col(t(df[-1]))]
)

which produces

   Sample    Group
1 Sample1 Cluster2
2 Sample2 Cluster3
3 Sample3 Cluster3
ThomasIsCoding
  • 96,636
  • 9
  • 24
  • 81
3

Another way using which.max in lapply to subset df$Group.

cbind(lapply(df[-1], \(x) df$Group[which.max(x)]))
#        [,1]      
#Sample1 "Cluster2"
#Sample2 "Cluster3"
#Sample3 "Cluster3"

Or using vapply.

cbind(vapply(df[-1], \(x) df$Group[which.max(x)], ""))

Or using the index and create a data.frame.

data.frame(Sample = names(df)[-1],
  Group = df$Group[vapply(df[-1], which.max, 0L)])
#   Sample    Group
#1 Sample1 Cluster2
#2 Sample2 Cluster3
#3 Sample3 Cluster3

Benchmark

library(dplyr)
library(tidyr)

bench::mark(check = FALSE,
"Darren Tsai" = {df %>%
  pivot_longer(-Group) %>%
    slice_max(value, by = name)},
Sotos = {df %>% 
 pivot_longer(-1) %>% 
 group_by(name) %>% 
   summarise(Group = Group[value == max(value)])},
GuedesBF = {df |> summarise(across(starts_with("sample"),
                       ~Group[which.max(.x)])) |> 
      pivot_longer(everything())},
ThomasIsCoding = data.frame(
    Sample = names(df)[-1],
    Group = df$Group[max.col(t(df[-1]))]
),
GKi1 = cbind(lapply(df[-1], \(x) df$Group[which.max(x)])),
GKi2 = cbind(vapply(df[-1], \(x) df$Group[which.max(x)], "")),
GKi3 = data.frame(Sample = names(df)[-1],
                  Group = df$Group[vapply(df[-1], which.max, 0L)])
)

Result

  expression          min   median `itr/sec` mem_alloc `gc/sec` n_itr  n_gc
  <bch:expr>     <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl> <int> <dbl>
1 Darren Tsai      5.66ms   5.74ms      173.    3.31MB     8.55    81     4
2 Sotos            6.42ms   6.56ms      152.    1.11MB    11.1     68     5
3 GuedesBF         5.81ms   5.91ms      169.  402.38KB     8.55    79     4
4 ThomasIsCoding 292.09µs 308.49µs     3204.   96.38KB    10.3   1563     5
5 GKi1             35.3µs  38.89µs    25198.    11.1KB    12.6   9995     5
6 GKi2            36.09µs  39.67µs    24528.    7.96KB    14.7   9994     6
7 GKi3           215.37µs 225.92µs     4348.        0B    10.2   2123     5
GKi
  • 37,245
  • 2
  • 26
  • 48
2

Convert to long first and aggregate, i.e.

library(dplyr)
library(tidyr)

df %>% 
 pivot_longer(-1) %>% 
 group_by(name) %>% 
 summarise(Group = Group[value == max(value)])

# A tibble: 3 × 2
  name    Group   
  <chr>   <chr>   
1 Sample1 Cluster2
2 Sample2 Cluster3
3 Sample3 Cluster3
Sotos
  • 51,121
  • 6
  • 32
  • 66
2

With tidyverse, you can stack those Sample columns and then slice_max() by groups.

library(dplyr)
library(tidyr)

df %>%
  pivot_longer(-Group) %>%
  slice_max(value, by = name)

# # A tibble: 3 × 3
#   Group    name    value
#   <chr>    <chr>   <dbl>
# 1 Cluster2 Sample1  0.4 
# 2 Cluster3 Sample2  0.9 
# 3 Cluster3 Sample3  0.92

You can determine whether ties should be kept or not by adjusting the argument with_ties(default to TRUE) in slice_max.

Darren Tsai
  • 32,117
  • 5
  • 21
  • 51
0

We cansummarise with which.max and finally use pivot_longer if needed. This way we do not have to group_by, which could be slower if we have too many sample columnns

library(dplyr)
library(tidyr)

df |> summarise(across(starts_with("sample"),
                       ~Group[which.max(.x)])) |> 
      pivot_longer(everything())

# A tibble: 3 × 2
  name    value   
  <chr>   <chr>   
1 Sample1 Cluster2
2 Sample2 Cluster3
3 Sample3 Cluster3
GuedesBF
  • 8,409
  • 5
  • 19
  • 37