@Achim Zeileis this is a reprex for the behavior I'm seeing. The actual model is fit on ~15k obs, not 100, but I subsetted it down a lot for interpretability.
Here, the count prediction is less than the response prediction, and it doesn't make intuitive sense to me how that'd be the case. I'm 95% sure I'm just misunderstanding what the predict(..., type = 'response')
function is doing, and how it differs from just multiplying the predicted count by the probability of not clearing the hurdle stage. Any pointers would be much appreciated!
data <- structure(list(y = c(0, 5, 2, 2, 0, 0, 1, 1, 2, 0, 0, 1, 0, 0,
0, 3, 3, 0, 34, 1, 1, 0, 0, 83, 0, 5, 6, 6, 3, 0, 0, 1, 0, 1,
0, 17, 0, 0, 0, 1, 0, 0, 1, 0, 26, 19, 16, 3, 9, 0, 1, 0, 1,
11, 0, 42, 0, 2, 2, 3, 0, 0, 1, 0, 0, 3, 0, 4, 1, 0, 0, 0, 1,
0, 4, 0, 11, 2, 4, 5, 0, 0, 0, 1, 3, 38, 0, 0, 1, 0, 0, 8, 19,
12, 0, 0, 0, 0, 0, 1),
x = c(1, 21, 7, 3, 0, 0, 16, 19, 0, 4,
3, 5, 0, 0, 0, 4, 2, 6, 31, 0, 10, 0, 0, 280, 2, 43, 8, 46, 10,
0, 0, 1, 0, 11, 0, 27, 0, 6, 0, 0, 5, 8, 4, 0, 9, 23, 12, 0,
35, 0, 2, 0, 1, 0, 2, 35, 0, 0, 6, 1, 0, 0, 19, 0, 0, 14, 0,
2, 0, 0, 0, 1, 0, 3, 10, 6, 13, 5, 7, 11, 3, 3, 2, 0, 0, 19,
1, 0, 4, 0, 0, 6, 1, 0, 0, 0, 0, 1, 0, 5)),
row.names = c(NA, -100L), class = c("tbl_df", "tbl", "data.frame"))
library(pscl)
#> Classes and Methods for R developed in the
#> Political Science Computational Laboratory
#> Department of Political Science
#> Stanford University
#> Simon Jackman
#> hurdle and zeroinfl functions by Achim Zeileis
model <- hurdle(
y ~ x,
data = data,
dist = 'negbin'
)
newdata <- data.frame(x = 5)
(
response_pred <- predict(model, newdata = newdata, type = 'response')
)
#> 1
#> 2.912046
(
count_pred <- predict(model, newdata = newdata, type = 'count')
)
#> 1
#> 0.002132521
coef(model)
#> count_(Intercept) count_x zero_(Intercept) zero_x
#> -6.4765439 0.0652187 -0.9765759 0.3145331
(
count_pred_from_coef <- exp(0.0652187 * 5 - 6.4765439)
)
#> [1] 0.002132521
(
predprob_zero <- predict(model, newdata = newdata, type = 'prob')[1]
)
#> [1] 0.3552388
(
zero_prob_from_coef <- 1 - (exp(0.3145331 * 5 - 0.9765759) / (1 + exp(0.3145331 * 5 - 0.9765759)))
)
#> [1] 0.3552388
## still way smaller than the response. Is this formula wrong?
count_pred * predprob_zero
#> 1
#> 0.0007575543
Created on 2021-04-06 by the reprex package (v1.0.0)