1

I am currently working on a dataset and want to find out which regions share the same pattern of feature values. To illustrate, I have created an example input dataset and the output dataset I would like to obtain.

# Input data:
  Region F1 F2 F3
1      A  1  2  3
2      B  1  2  2
3      B  1  2  2
4      A  1  2  3
5      B  3  2  1
6      C  1  2  2
7      C  1  2  3
8      C  3  2  1
9      D  3  2  1

# Output data:
  F1 F2 F3 Number_Pattern Name_Region Total_Region
1  1  2  3              3        A, C            2
2  1  2  2              3        B, C            2
3  3  2  1              3     B, C, D            3

So far I have only managed to count how often each pattern occurs in the input dataset. I have not been able to add, for each pattern, the names of the regions in which it occurs and the number of distinct regions (as shown in the output data above).

library(dplyr)

# Input data
# Example observations: one row per record, holding a region label and the
# three feature values (F1, F2, F3) that together form that record's pattern.
input <- data.frame(
  Region = c("A", "B", "B", "A", "B", "C", "C", "C", "D"),
  F1 = c(1, 1, 1, 1, 3, 1, 1, 3, 3),
  F2 = rep(2, times = 9), # F2 is 2 in every row
  F3 = c(3, 2, 2, 3, 1, 2, 3, 1, 1)
)

# Count how many rows of `input` share each (F1, F2, F3) combination.
# Written as nested calls, innermost first: pick the columns, group them by
# the three pattern columns, then collapse every group to its row count.
output <- dplyr::summarise(
  group_by(
    select(input, Region, F1, F2, F3),
    F1, F2, F3
  ),
  Number_Pattern = n(),
  .groups = "drop" # return an ungrouped result
)

Guest987
  • 15
  • 4

3 Answers3

1

You can use `unique()` to get the distinct `Region` values within each group and `n_distinct()` to count them.

library(dplyr)

# Collapse `input` to one row per distinct (F1, F2, F3) pattern, reporting how
# often the pattern occurs and in which regions it is found.
input %>%
  group_by(F1, F2, F3) %>%
  # Or, if there are many pattern columns:
  # group_by(across(starts_with("F"))) %>%
  summarise(
    Number_Pattern = n(),                   # rows showing this pattern
    Name_Region = toString(unique(Region)), # distinct regions, comma-separated
    Total_Region = n_distinct(Region),      # number of distinct regions
    # Without this, summarise() only peels off the last grouping level, so the
    # result would stay grouped by F1 and F2 and a regrouping message is shown.
    .groups = "drop"
  )


#     F1    F2    F3 Number_Pattern Name_Region Total_Region
#  <dbl> <dbl> <dbl>          <int> <chr>              <int>
#1     1     2     2              3 B, C                   2
#2     1     2     3              3 A, C                   2
#3     3     2     1              3 B, C, D                3
Ronak Shah
  • 377,200
  • 20
  • 156
  • 213
0

Here is a data.table approach:

library( data.table )
# One row per (F1, F2, F3) pattern: occurrence count, the distinct regions as
# a comma-separated string, and the number of distinct regions.
# Note: setDT() turns `input` itself into a data.table (no copy is made).
setDT(input)[, list(
  Number_Pattern = .N,
  Name_Region = toString(unique(Region)),
  Total_Region = uniqueN(Region)
), by = list(F1, F2, F3)]


# Output data:
#   F1 F2 F3 Number_Pattern Name_Region Total_Region
# 1  1  2  3              3        A, C            2
# 2  1  2  2              3        B, C            2
# 3  3  2  1              3     B, C, D            3
Wimpel
  • 26,031
  • 1
  • 20
  • 37
0

Same logic in base R:

# Summarise `input` per (F1, F2, F3) pattern using base R only.
# The summary function returns three named values per group, which aggregate()
# stores as a single matrix column (the 4th column of `output`).
output <- aggregate(
  Region ~ F1 + F2 + F3,
  input,
  function(x) {
    y <- unique(x)
    c(
      Number_Pattern = length(x), # rows showing this pattern
      Name_Region = toString(y),  # distinct regions, comma-separated
      Total_Region = length(y)    # number of distinct regions
    )
  }
)

# Unpack the matrix column into ordinary columns. Because c() above mixes
# counts with a string, every value was coerced to character; convert the two
# count columns back to integers so they can be sorted and summed numerically.
stats <- data.frame(output[[4]], stringsAsFactors = FALSE)
count_cols <- c("Number_Pattern", "Total_Region")
stats[count_cols] <- lapply(stats[count_cols], as.integer)
cbind(output[paste0("F", 1:3)], stats)

#   F1 F2 F3 Number_Pattern Name_Region Total_Region
# 1  3  2  1              3     B, C, D            3
# 2  1  2  2              3        B, C            2
# 3  1  2  3              3        A, C            2
s_baldur
  • 29,441
  • 4
  • 36
  • 69