My dataframe contains:
- a column
deceased
on which I compute aggregated means later on (mortality ratios, by gender) - a weighting variable
n.group
- a categorical
sex
(1: female, 2: male)
I don't understand why the means and weighted-means m.mortf, w.mortf
are wrong when calculated below in one single mutate
/summarize
expression.
Dataframe:
red11 <- structure(list(hosptg = structure(c(3L, 3L, 1L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 1L, 3L, 3L, 3L, 3L, 2L, 3L, 3L, 3L, 3L, 1L, 1L, 3L,
3L, 3L, 3L, 1L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 2L, 3L, 3L,
3L, 3L, 2L, 3L, 1L, 3L, 1L, 3L, 3L, 1L, 3L, 3L, 3L, 3L, 3L, 1L,
3L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L), .Label = c("1",
"2", "3"), class = "factor"), quarter.adm = structure(c(4L, 11L,
3L, 12L, 7L, 8L, 12L, 9L, 1L, 11L, 7L, 1L, 2L, 2L, 10L, 10L,
8L, 11L, 6L, 1L, 4L, 6L, 10L, 10L, 6L, 11L, 11L, 7L, 3L, 6L,
10L, 12L, 7L, 6L, 6L, 3L, 6L, 12L, 4L, 4L, 12L, 1L, 6L, 5L, 11L,
9L, 4L, 4L, 3L, 10L, 4L, 8L, 10L, 3L, 7L, 1L, 12L, 5L, 4L, 6L,
6L, 3L, 9L, 7L, 8L, 3L, 7L, 8L, 7L, 6L, 5L, 11L, 9L, 11L, 1L,
4L, 6L, 5L, 5L, 6L, 5L, 5L, 11L, 3L, 4L, 12L, 12L, 1L, 9L, 9L,
6L, 9L, 1L, 4L, 8L, 1L, 5L, 2L, 9L, 11L), .Label = c("2011Q1",
"2011Q2", "2011Q3", "2011Q4", "2012Q1", "2012Q2", "2012Q3", "2012Q4",
"2013Q1", "2013Q2", "2013Q3", "2013Q4"), class = "factor"), g.mdc = c("08",
"05", "09", "08", "14", "15", "15", "11", "09", "01", "08", "11",
"16", "14", "08", "06", "08", "06", "06", "08", "15", "14", "14",
"08", "11", "09", "08", "08", "06", "06", "06", "08", "03", "05",
"05", "15", "02", "05", "08", "04", "04", "10", "06", "01", "08",
"05", "03", "06", "01", "01", "06", "08", "08", "04", "12", "05",
"01", "15", "08", "01", "08", "01", "05", "15", "15", "01", "06",
"15", "01", "08", "01", "05", "08", "02", "15", "03", "06", "05",
"05", "03", "09", "08", "11", "12", "06", "04", "08", "01", "06",
"01", "08", "06", "15", "05", "08", "07", "08", "13", "08", "08"
), sex = structure(c(2L, 2L, 2L, 2L, 2L, 1L, 2L, 1L, 2L, 2L,
2L, 1L, 2L, 2L, 1L, 2L, 1L, 1L, 2L, 2L, 1L, 2L, 2L, 2L, 1L, 2L,
2L, 2L, 1L, 1L, 2L, 1L, 1L, 2L, 1L, 2L, 2L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 2L, 1L, 2L, 1L, 1L, 1L, 2L, 2L, 2L, 1L, 2L, 1L, 1L,
2L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 1L, 2L, 2L, 1L, 2L, 2L, 1L,
1L, 1L, 2L, 1L, 2L, 1L, 1L, 2L, 1L, 1L, 2L, 1L, 1L, 2L, 1L, 2L,
2L, 2L, 2L, 1L, 1L, 1L, 1L, 2L, 2L, 2L), .Label = c("0", "1"), class = "factor"),
age = c(23L, 83L, 51L, 54L, 37L, 0L, 0L, 82L, 45L, 88L, 84L,
58L, 41L, 33L, 71L, 79L, 67L, 42L, 73L, 66L, 0L, 26L, 38L,
65L, 31L, 87L, 38L, 38L, 77L, 44L, 54L, 74L, 38L, 70L, 44L,
0L, 78L, 65L, 56L, 85L, 70L, 83L, 89L, 46L, 39L, 34L, 5L,
85L, 18L, 5L, 41L, 73L, 18L, 41L, 75L, 77L, 36L, 0L, 84L,
83L, 58L, 93L, 83L, 0L, 0L, 2L, 49L, 0L, 55L, 46L, 40L, 81L,
60L, 51L, 0L, 22L, 78L, 69L, 75L, 65L, 31L, 15L, 79L, 87L,
72L, 78L, 48L, 16L, 81L, 63L, 84L, 17L, 0L, 60L, 60L, 74L,
44L, 44L, 53L, 71L), deceased = structure(c(1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L), .Label = c("0", "1"), class = "factor"),
n.group = c(3L, 2L, 1L, 1L, 1L, 3L, 1L, 1L, 1L, 1L, 2L, 1L,
1L, 1L, 3L, 2L, 3L, 1L, 3L, 1L, 1L, 1L, 1L, 3L, 1L, 1L, 1L,
2L, 1L, 3L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 1L, 3L, 1L, 2L, 1L,
1L, 2L, 2L, 2L, 1L, 3L, 3L, 1L, 3L, 3L, 3L, 1L, 1L, 1L, 1L,
1L, 3L, 1L, 3L, 3L, 2L, 1L, 3L, 3L, 1L, 3L, 1L, 3L, 2L, 2L,
2L, 1L, 2L, 1L, 3L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 3L, 2L, 1L,
1L, 1L, 1L, 3L, 1L, 2L, 1L, 3L, 1L, 2L, 1L, 2L, 2L)), class = c("tbl_df",
"tbl", "data.frame"), .Names = c("hosptg", "quarter.adm", "g.mdc",
"sex", "age", "deceased", "n.group"), row.names = c(NA, -100L
))
Grouping - using mutate
:
red111 <- red11 %>%
group_by(hosptg, quarter.adm, g.mdc) %>%
mutate(n=n(),
female = mean(sex == '1', na.rm=T),
age = mean(age, na.rm=T),
m.mortf = mean(deceased == '1', na.rm=T),
w.mortf = weighted.mean(deceased == '1', n.group, na.rm=T))
Grouping - using summarize
(i.e. aggregation):
red211 <- red11 %>%
group_by(hosptg, quarter.adm, g.mdc) %>%
summarize(n=n(),
female = mean(sex == '1', na.rm=T),
age = mean(age, na.rm=T),
m.mortf = mean(deceased == '1', na.rm=T),
w.mortf = weighted.mean(deceased == '1', n.group, na.rm=T))
I would have expected the ratio being the same and foremost keeping the initial mean. I understand what the aggregation does this is also illustrated by the sum(redxx$n)
but I struggle comprehending the full background.
Initial data frame mean:
mean(red11$deceased == 1, na.rm=T) [1] 0.02
Mutate mean and sum:
sum(red211$n) [1] 170
> mean(red111$female) [1] 0.52
> mean(red111$w.mortf) [1] 0.02
> mean(red111$m.mortf) [1] 0.02
Summarized mean and sum:
sum(red211$n) [1] 100
mean(red211$female) [1] 0.4977169
mean(red211$w.mortf) [1] 0.02739726
mean(red211$m.mortf) [1] 0.02739726
What I would like to have is an aggregated data frame (i.e. reduced number of lines) maintaining the initial mean throughout. And, why does the weighting variable not compensate for it?
EDIT: My basic intention is that I am using a big data file where I have single entries where a case may be deceased. Then I calculate mortality ratios. But this can logically only be done at aggregated level. That is why I create a data frame like red211. Thereafter I base my regression models on it. But them again means are based on that second data frame and not the original values. Thus my results are distorted in size. That is why I am "desperately" looking for a solution that will get me closer to my original mean values. I hope this helps.
The model I use is a straight forward difference in difference:
lm(w.mortf ~ treatment * year, data = red)
where: treatment is the treatment group / year the intervention year / red the aggregated data frame
===========================================================
w.mortf m.mortf
-----------------------------------------------------------
(Intercept) 0.037 (0.001) *** 0.037 (0.001) ***
year 0.003 (0.001) * 0.003 (0.001) *
tg1 -0.003 (0.001) * -0.003 (0.001) *
year:tg1 -0.001 (0.002) -0.001 (0.002)
-----------------------------------------------------------
Adj. R^2 0.000 0.000
Num. obs. 126031 126031
RMSE 0.172 0.179
===========================================================
The original data frame mean is approx. 0.018 - thus I think to far away from being interpretable - or where I am misled?
The figure below illustrates the issue. Where 2012Q1 should be the reference value findable based on the above regression.