when I try to summeraize my data I get some strange results. But using group_by and than summeraize gives me only one observation, even though I have more different groups. Adding an Argument to group_by only gives one more Variable in the result. However, like other post suggest (e.g. Why does summarize or mutate not work with group_by when I load `plyr` after `dplyr`?) it could be plyr. But actually I didn't load plyr and in my code I directly refer to dplyr. How to I get my expected result (one value each group)? In the following my dput of my 1) code, 2) my original tabel: comb_extract_all and 3) my resulting table.
comb_extract_all_agg <- comb_extract_all %>% dplyr::group_by("SurveyId", "hhid", "CLUSTER") %>%
dplyr::summarize(hc70 =mean(hc70, na.rm = TRUE)) %>%
ungroup()
dput(comb_extract_all[1:10,1:10])
structure(list(hhid = c(" 1 27", " 1 27", " 1 27",
" 1 27", " 1 67", " 1 67", " 1 67",
" 1 67", " 1 67", " 1225"), hv001 = c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), hv002 = c(27L, 27L, 27L,
27L, 67L, 67L, 67L, 67L, 67L, 225L), hv005 = c(1707326L, 1707326L,
1707326L, 1707326L, 1707326L, 1707326L, 1707326L, 1707326L, 1707326L,
1707326L), hv007 = c(2003L, 2003L, 2003L, 2003L, 2003L, 2003L,
2003L, 2003L, 2003L, 2003L), hv021 = c(1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L), hv023 = structure(c(6L, 6L, 6L, 6L, 6L, 6L,
6L, 6L, 6L, 6L), labels = structure(1:23, names = c("tigray urban",
"tigray rural", "affar urban", "affar rural", "amhara urban",
"amhara rural", "oromiya urban", "oromiya rural", "somali urban",
"somali rural", "benishangul-gumuz urban", "benishangul-gumuz rural",
"s.n.n.p. urban", "s.n.n.p. rural", "gambela urban", "gambela rural",
"harari urban", "harari rural", "addis ababa", "dire dawa urban",
"dire dawa rural", "somali oversample urban", "somali oversample rural"
)), label = "Stratification used in sample design", class = c("haven_labelled",
"vctrs_vctr", "integer")), hv024 = structure(c(3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L), labels = c(tigray = 1L, affar = 2L,
amhara = 3L, oromiya = 4L, somali = 5L, `benishangul-gumuz` = 6L,
snnp = 7L, gambela = 12L, harari = 13L, `addis ababa` = 14L,
`dire dawa` = 15L), label = "Region", class = c("haven_labelled",
"vctrs_vctr", "integer")), hv025 = structure(c(2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L), labels = structure(1:2, names = c("urban",
"rural")), label = "Type of place of residence", class = c("haven_labelled",
"vctrs_vctr", "integer")), hc70 = structure(c(NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_), labels = c(`height out of plausible limits` = 9996,
`age in days out of plausible limits` = 9997, `flagged cases` = 9998,
missing = 9999), label = "Height/Age standard deviation (new WHO)", class = c("haven_labelled",
"vctrs_vctr", "double"))), row.names = c("ETPR61FL.1", "ETPR61FL.2",
"ETPR61FL.3", "ETPR61FL.4", "ETPR61FL.5", "ETPR61FL.6", "ETPR61FL.7",
"ETPR61FL.8", "ETPR61FL.9", "ETPR61FL.10"), class = "data.frame")
dput(comb_extract_all_agg)
structure(list(`"SurveyId"` = "SurveyId", `"hhid"` = "hhid",
`"CLUSTER"` = "CLUSTER", hc70 = 683.255964376358), class = c("tbl_df",
"tbl", "data.frame"), row.names = c(NA, -1L))
Edit for comment:
comb_extract_all |> select(SurveyId, CLUSTER, hhid, hc70) |> slice_sample(n = 5) |> dput()
structure(list(SurveyId = c("ET2016DHS", "ET2016DHS", "ET2019DHS",
"ZW2010DHS", "ET2005DHS"), CLUSTER = structure(c(561L, 211L,
143L, 166L, 241L), label = "Cluster number"), hhid = structure(c(" 561 51",
" 211 301", " 01430032", " 166 25", " 241246"
), label = "Case Identification"), hc70 = structure(c(NA, NA,
NA, NA, 619), labels = c(`height out of plausible limits` = 9996,
`age in days out of plausible limits` = 9997, `flagged cases` = 9998,
missing = 9999), label = "Height/Age standard deviation (new WHO)", class = c("haven_labelled",
"vctrs_vctr", "double"))), row.names = c("ETPR71FL.64958", "ETPR71FL.24246",
"ETPR81FL.19619", "ZWPR62FL.17019", "147379"), class = "data.frame")