Add group counts in dplyr R outputs wrong numbers?

Question

Here is my sample df:

# Sample data as a dput()-style dump: 50 rows, 11 columns.
# The object is already a grouped_df (vars = user_id, press_id) whose stored
# group_sizes are 17, 21 and 12.
# Note that it already contains a column named `n` (124 / 110 / 145), and those
# values do not match the stored group sizes.
df <- structure(list(user_id = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), obs_id = c(1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), timestamp = c(135560962809215, 
135560977720600, 135560994815985, 135561010710946, 135561027891754, 
135561044085292, 135561060963292, 135561079116292, 135561096005254, 
135561112681985, 135561129308946, 135561145911561, 135561162521485, 
135561179346754, 135561196266869, 135561207020177, 135561208190561, 
135561684298600, 135561696513330, 135561712869100, 135561729868407, 
135561746919830, 135561762999292, 135561779818446, 135561796528676, 
135561812920676, 135561830544369, 135561846396561, 135561863018138, 
135561886197176, 135561896428599, 135561913210561, 135561930595830, 
135561946400638, 135561962972830, 135561979715292, 135561991182253, 
135561992557715, 135562792606330, 135562813626137, 135562830549483, 
135562847068137, 135562863564560, 135562887464368, 135562896809753, 
135562913609945, 135562930201291, 135562946752868, 135562963382137, 
135562979985022), x = c(866.4551, 866.4551, 865.9743, 865.4467, 
861.27234, 858.7928, 860.4923, 860.8814, 863.5331, 868.2798, 
873.31616, 878.6383, 885.2693, 897.54736, 911.0174, 924.72656, 
924.72656, 852.2168, 852.2168, 851.1328, 849.8168, 843.0342, 
840.08405, 839.5462, 839.83057, 842.2078, 844.60345, 846.9745, 
850.0212, 853.7801, 858.75287, 865.1152, 872.62573, 880.48303, 
888.81494, 898.601, 908.6426, 908.6426, 870.1465, 870.1393, 869.60895, 
869.08875, 863.813, 861.976, 862.23035, 861.96906, 864.5737, 
868.3425, 875.642, 880.27234), y = c(1142.71, 1139.997, 1133.6244, 
1124.1506, 1075.3293, 1041.501, 1014.3225, 979.9761, 952.1288, 
922.7904, 892.8203, 863.80347, 830.62524, 789.3959, 756.1295, 
714.53613, 714.53613, 1182.2754, 1181.1726, 1175.6511, 1166.3911, 
1127.9597, 1093.8245, 1069.8573, 1045.0938, 1022.8574, 1002.0753, 
982.60486, 967.4147, 953.06824, 935.83545, 916.78284, 889.82056, 
867.2317, 843.1273, 820.83777, 789.87305, 789.87305, 1219.6729, 
1216.9186, 1210.3121, 1200.0981, 1146.955, 1113.3568, 1086.0355, 
1056.4296, 1028.7742, 997.1078, 964.5531, 936.01086), size = c(0.027450982, 
0.03137255, 0.03137255, 0.03137255, 0.03137255, 0.03529412, 0.03529412, 
0.039215688, 0.039215688, 0.043137256, 0.039215688, 0.043137256, 
0.043137256, 0.043137256, 0.050980397, 0.050980397, 0.050980397, 
0.03137255, 0.027450982, 0.03137255, 0.03529412, 0.03529412, 
0.03529412, 0.039215688, 0.039215688, 0.039215688, 0.043137256, 
0.039215688, 0.039215688, 0.039215688, 0.043137256, 0.043137256, 
0.039215688, 0.043137256, 0.039215688, 0.04705883, 0.050980397, 
0.050980397, 0.023529414, 0.023529414, 0.023529414, 0.027450982, 
0.03137255, 0.03529412, 0.03529412, 0.039215688, 0.03529412, 
0.039215688, 0.039215688, 0.043137256), pressure = c(1, 1, 1, 
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 
1, 1, 1, 1, 1), digit = c(2131165279, 2131165279, 2131165279, 
2131165279, 2131165279, 2131165279, 2131165279, 2131165279, 2131165279, 
2131165279, 2131165279, 2131165279, 2131165279, 2131165279, 2131165279, 
2131165279, 2131165279, 2131165279, 2131165279, 2131165279, 2131165279, 
2131165279, 2131165279, 2131165279, 2131165279, 2131165279, 2131165279, 
2131165279, 2131165279, 2131165279, 2131165279, 2131165279, 2131165279, 
2131165279, 2131165279, 2131165279, 2131165279, 2131165279, 2131165279, 
2131165279, 2131165279, 2131165279, 2131165279, 2131165279, 2131165279, 
2131165279, 2131165279, 2131165279, 2131165279, 2131165279), 
    state = c(1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
    -1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
    0, 0, -1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), press_id = c(1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 
    2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 
    3L, 3L, 3L, 3L), n = c(124L, 124L, 124L, 124L, 124L, 124L, 
    124L, 124L, 124L, 124L, 124L, 124L, 124L, 124L, 124L, 124L, 
    124L, 110L, 110L, 110L, 110L, 110L, 110L, 110L, 110L, 110L, 
    110L, 110L, 110L, 110L, 110L, 110L, 110L, 110L, 110L, 110L, 
    110L, 110L, 145L, 145L, 145L, 145L, 145L, 145L, 145L, 145L, 
    145L, 145L, 145L, 145L)), .Names = c("user_id", "obs_id", 
"timestamp", "x", "y", "size", "pressure", "digit", "state", 
"press_id", "n"), row.names = c(NA, -50L), class = c("grouped_df", 
"tbl_df", "tbl", "data.frame"), vars = c("user_id", "press_id"
), drop = TRUE, indices = list(0:16, 17:37, 38:49), group_sizes = c(17L, 
21L, 12L), biggest_group_size = 21L, labels = structure(list(
    user_id = c(1L, 1L, 1L), press_id = 1:3), row.names = c(NA, 
-3L), class = "data.frame", vars = c("user_id", "press_id"), drop = TRUE, .Names = c("user_id", 
"press_id")))

According to the recipe provided by Akrun here I have done the following:

# Add the row count of each (user_id, press_id) group to every row as column `c`.
df %>%
  group_by(user_id, press_id) %>%
  mutate(c = n())

But I get 124 for the first group, which is not the right number. It should be 17 for the group with user_id = 1 and press_id = 1.

Please advise what is wrong here; I have used the recipe by Akrun, and I ungrouped the data beforehand.

You need to check your outcome again. If I run this code, I get 17 for the `user_id = 1 + press_id = 1`, and 21 for the next group. Maybe restart your R session. — phiver, Sep 13 '18 at 11:59
I can't replicate your issue. If I do `df %>% group_by(user_id, press_id) %>% summarise(c = n())` I get `17` for first group. Also, `124` seems impossible given that you have only `50` rows! — AntoniosK, Sep 13 '18 at 11:59
What do you get if you run your code on the dataset you posted? If you apply your code to another dataset I have no idea if it's correct or not :) Maybe `124` is the correct count for your first group. — AntoniosK, Sep 13 '18 at 12:11
Did you maybe change anything in the column names? Could it be that your column named `n` was shifted into `c`, since `n` is 124? — TinglTanglBob, Sep 13 '18 at 12:12
No, what you see is the data after the calculation; that's why you see 124. Where can I put the whole 695 rows for you to check? — SteveS, Sep 13 '18 at 12:13
@AntoniosK Where can I share the dataset so that you can see it's wrong? — SteveS, Sep 13 '18 at 12:20
You can use `dput` for that dataset, or check yourself using `df %>% filter(user_id == 1 & press_id == 1)` and see how many rows it returns for your first group. — AntoniosK, Sep 13 '18 at 12:22

score 0 · Answer 1 · answered Sep 13 '18 at 12:00

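There is no error here: in the output below, the value 124 belongs to the column `n` that already exists in your data, while the newly computed column `c` shows the correct count of 17 for the first group.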

# Count the rows in each (user_id, press_id) group and store it in column `c`.
df %>%
  group_by(user_id, press_id) %>%
  mutate(c = n())
# # A tibble: 50 x 12
# # Groups:   user_id, press_id [3]
#   user_id obs_id timestamp     x     y   size pressure      digit state press_id     n     c
# <int>  <int>     <dbl> <dbl> <dbl>  <dbl>    <dbl>      <dbl> <dbl>    <int> <int> <int>
# 1       1      1   1.36e14  866. 1143. 0.0275        1 2131165279     1        1   124    17
# 2       1      1   1.36e14  866. 1140. 0.0314        1 2131165279     0        1   124    17
# 3       1      1   1.36e14  866. 1134. 0.0314        1 2131165279     0        1   124    17
# 4       1      1   1.36e14  865. 1124. 0.0314        1 2131165279     0        1   124    17
# 5       1      1   1.36e14  861. 1075. 0.0314        1 2131165279     0        1   124    17
# 6       1      1   1.36e14  859. 1042. 0.0353        1 2131165279     0        1   124    17
# 7       1      1   1.36e14  860. 1014. 0.0353        1 2131165279     0        1   124    17
# 8       1      1   1.36e14  861.  980. 0.0392        1 2131165279     0        1   124    17
# 9       1      1   1.36e14  864.  952. 0.0392        1 2131165279     0        1   124    17
# 10       1      1   1.36e14  868.  923. 0.0431        1 2131165279     0        1   124    17

Got it, it was obs_id that caused the mess. — SteveS, Sep 13 '18 at 12:23

Add group counts in dplyr R outputs wrong numbers?

1 Answer