For each value of x (educ in this case) I want to plot the distribution of y (income) and add the regression line of y ~ x.
# Example data in dput() form: a tibble with 80 rows and six columns.
#   income, educ : integer vectors (the y and x variables of the question)
#   race         : factor with levels "b", "h", "w"
#   race2        : integer codes 1/2/3 that mirror the factor codes of race
#   z1, z2       : 0/1 integers; in this data z1 is 1 exactly on the rows
#                  where race is "b" and z2 is 1 exactly where race is "h"
#                  (presumably dummy variables for race -- confirm with author)
df <- structure(list(
income = c(16L, 18L, 26L, 16L, 34L, 22L, 42L,
42L, 16L, 20L, 66L, 26L, 20L, 30L, 20L, 30L, 32L, 16L, 20L, 58L,
30L, 26L, 20L, 40L, 32L, 22L, 20L, 56L, 32L, 30L, 30L, 48L, 40L,
84L, 50L, 38L, 30L, 76L, 48L, 36L, 40L, 44L, 30L, 60L, 24L, 88L,
46L, 50L, 50L, 22L, 26L, 46L, 22L, 24L, 64L, 62L, 24L, 50L, 32L,
34L, 52L, 24L, 22L, 20L, 30L, 24L, 120L, 22L, 82L, 18L, 26L,
104L, 28L, 32L, 38L, 44L, 22L, 18L, 24L, 56L),
educ = c(10L, 7L, 9L, 11L, 14L, 12L, 16L, 16L, 9L, 10L, 16L, 12L, 10L, 15L,
10L, 19L, 16L, 11L, 10L, 16L, 12L, 10L, 8L, 12L, 10L, 11L, 10L,
14L, 12L, 11L, 14L, 14L, 7L, 18L, 10L, 12L, 12L, 16L, 16L, 11L,
11L, 12L, 10L, 15L, 9L, 17L, 16L, 16L, 14L, 11L, 12L, 16L, 9L,
9L, 14L, 16L, 10L, 13L, 10L, 16L, 18L, 12L, 14L, 13L, 14L, 13L,
18L, 10L, 16L, 12L, 12L, 14L, 12L, 12L, 14L, 12L, 12L, 10L, 12L,
20L),
race = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L), .Label = c("b", "h", "w"), class = "factor"),
race2 = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L), z1 = c(1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L
),
z2 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L)), row.names = c(NA, -80L), class = c("tbl_df",
"tbl", "data.frame"))
So far, I have used the ggridges
package to plot the distribution of y at each value of x. However,
doing so forces me to swap the roles of the variables (x becomes y and vice versa).
To 'revert' this, I flipped the coordinates, and as a result I get this:
# Ridgeline density of income within each educ level. The raw observations
# are overlaid as shape-1 points with zero jitter height. income is mapped
# to x and educ to y, then coord_flip() swaps the axes for display.
ggplot(data = df, mapping = aes(x = income, y = educ, group = educ)) +
  geom_density_ridges(
    alpha = 0.3,
    jittered_points = TRUE,
    point_shape = 1,
    point_size = 1.5,
    position = position_points_jitter(height = 0)
  ) +
  coord_flip()
The problem is that, if I add a regression line to the plot, I get a separate regression line for each value of
educ (as I had to group by it in order to apply geom_density_ridges()).
Furthermore, the regression line is actually x ~ y instead of y ~ x.
To try to solve this, I computed the line in x ~ y form that is equivalent to the y ~ x regression line, so that it looks exactly the same as if I had applied geom_smooth()
with educ as x and income as y.
# Fit the regression of interest, income ~ educ. Passing `data = df` keeps
# the formula free of `df$` so the coefficients are named after the columns
# and the model can be reused with predict(newdata = ...).
fit <- lm(income ~ educ, data = df)

# The ridge plot below maps income to x and educ to y (before coord_flip()),
# so the fitted line has to be written as educ = f(income):
#   income = a + b * educ   <=>   educ = -a / b + (1 / b) * income
slope <- 1 / coef(fit)[[2]]
intercept <- -coef(fit)[[1]] / coef(fit)[[2]]

# NOTE(review): limits = c(8, 20) leaves out the two rows with educ == 7;
# ggplot2 continuous scales censor out-of-range values by default, so those
# observations are presumably dropped from the ridges -- confirm this is
# intended.
ggplot(df, aes(x = income, y = educ, group = educ)) +
  geom_density_ridges(
    jittered_points = TRUE,
    position = position_points_jitter(height = 0),
    point_size = 1.5,
    point_shape = 1,
    alpha = 0.3
  ) +
  stat_function(fun = function(x) intercept + slope * x, color = "red") +
  scale_y_continuous(breaks = seq(0, 20, 5), limits = c(8, 20)) +
  coord_flip()
This is the same as what I would have gotten if I had used:
# Reference plot: scatter of income against educ with a single linear-model
# fit line and no confidence band.
ggplot(data = df, mapping = aes(educ, income)) +
  geom_point() +
  geom_smooth(se = FALSE, method = "lm")
I was wondering if there is a better way to do this. Specifically, is there a way to plot the distribution of y for each value of x using ggplot2
but without using ggridges,
so that I don't need to reverse the coordinates?