0

I want to create a summary of the proportions of 1's and 2's within each group of the following data. I'm referring to the column through the variable nme (which holds the column name as a string) because I will be doing this inside a loop.

Data:

# Example data: 100 random draws of 1 or 2 in `x`, split into five groups of
# 20 consecutive rows in `g`. No seed is set, so `x` differs between runs.
df <- data.frame(
  x = sample(1:2, 100, replace = TRUE),
  # c(1, ..., 5) rather than 1:5 keeps `g` a double, as in the printed output.
  g = rep(c(1, 2, 3, 4, 5), each = 20)
)

First attempt

# Attempt that uses the string in `nme` directly inside summarise().
loop_value <- 1
nme <- names(df)[loop_value]  # column name as a character string (here "x")
df %>% 
  group_by(g) %>%
  select(nme, g) %>%
  # Inside summarise() `nme` is still the string itself, not the column it
  # names: `nme==1` is a single FALSE, so s1 is 0, and length(nme) is 1,
  # which is why every group reports 0 for both s1 and p1.
  summarise(s1 = sum(nme==1), 
                p1 = sum(nme==1)/length(nme)
                )

Second attempt

# Attempt that indexes the original data frame with `nme` inside summarise().
loop_value <- 1
nme <- names(df)[loop_value]  # column name as a character string (here "x")
df %>% 
  group_by(g) %>%
  select(nme, g) %>%
  # `df[nme]` is the whole, ungrouped one-column data frame, so the count
  # covers all 100 rows and is identical for every group. length() of a data
  # frame is its number of columns (1 here), so p1 comes out equal to s1.
  summarise(s1 = sum(df[nme]==1), 
                p1 = sum(df[nme]==1)/length(df[nme]))

Output from first attempt

# A tibble: 5 x 3
      g    s1    p1
  <dbl> <int> <dbl>
1     1     0     0
2     2     0     0
3     3     0     0
4     4     0     0
5     5     0     0

And from second

# A tibble: 5 x 3
      g    s1    p1
  <dbl> <int> <dbl>
1     1    58    58
2     2    58    58
3     3    58    58
4     4    58    58
5     5    58    58

For p1 in group 1 I would expect 0.65 based on

> prop.table(table(df[df$g==1,]$x))

   1    2 
0.65 0.35 
baxx
  • 3,956
  • 6
  • 37
  • 75

2 Answers

2

Try using summarise_at and pass a list of functions to calculate the number of occurrences of 1 and the proportion of 1's in the data for each group.

library(dplyr)

# `nme` holds the name of the column to summarise as a string; summarise_at()
# accepts such a character vector of column names directly.
# The functions are named explicitly so the result columns are always called
# `sum` and `mean`, rather than relying on how dplyr auto-names unnamed
# lambdas. mean() of a logical vector is the proportion of TRUE values.
df %>%
  group_by(g) %>%
  summarise_at(nme, list(sum = ~sum(. == 1), mean = ~mean(. == 1)))

# A tibble: 5 x 3
#      g   sum  mean
#  <dbl> <int> <dbl>
#1     1     6  0.3 
#2     2     7  0.35
#3     3     9  0.45
#4     4    10  0.5 
#5     5    10  0.5 

data

# Reproducible example data: fixing the seed makes the random draws in `x`
# repeatable from run to run.
set.seed(1234)
df <- data.frame(
  x = sample(1:2, 100, replace = TRUE),
  # Five groups of 20 consecutive rows; c(1, ..., 5) keeps `g` a double.
  g = rep(c(1, 2, 3, 4, 5), each = 20)
)
Ronak Shah
  • 377,200
  • 20
  • 156
  • 213
1

When I need to loop over variables, I find it easier to put the data in long format first, and then summarize.

# The reshaping verbs come from tidyr, which has to be attached as well.
library(tidyr)

# Stack the variable(s) of interest into long format (one row per
# group/variable/value), flag the values equal to 1, then count and average
# the flags per group and variable. pivot_longer() is the successor of the
# superseded gather().
out1 <- df %>%
  pivot_longer(x, names_to = "variable", values_to = "value") %>%
  mutate(value = value == 1) %>%
  group_by(g, variable) %>%
  # "drop_last" is summarise()'s default regrouping (result stays grouped by
  # g); stating it explicitly avoids the informational message.
  summarize(s = sum(value), p = mean(value), .groups = "drop_last")
out1
# # A tibble: 5 x 4
# # Groups:   g [5]
#       g variable     s     p
#   <dbl> <chr>    <int> <dbl>
# 1     1 x            6  0.3 
# 2     2 x            7  0.35
# 3     3 x            9  0.45
# 4     4 x           10  0.5 
# 5     5 x           10  0.5 

I then will gather/spread again if I want just one row per group.

# Go from one row per group/variable to one row per group, with one column
# per variable/statistic pair (e.g. x_p, x_s). pivot_longer()/pivot_wider()
# are the successors of the superseded gather()/spread().
out1 %>%
  pivot_longer(c(s, p), names_to = "stat", values_to = "value") %>%
  mutate(X = paste0(variable, "_", stat)) %>%
  select(-variable, -stat) %>%
  # names_sort = TRUE orders the new columns alphabetically, as spread() did.
  pivot_wider(names_from = X, values_from = value, names_sort = TRUE)
# # A tibble: 5 x 3
# # Groups:   g [5]
#       g   x_p   x_s
#   <dbl> <dbl> <dbl>
# 1     1  0.3      6
# 2     2  0.35     7
# 3     3  0.45     9
# 4     4  0.5     10
# 5     5  0.5     10

This is a bit more roundabout than some other possible approaches, but it is easier for me to keep track of what's going on.

Aaron left Stack Overflow
  • 36,704
  • 7
  • 77
  • 142