Add vline to geom_density and shade confidence interval of mean R

Question

After reading through different posts, I found out how to add a vline of mean to density plots as shown here. Using the data provided in the above link:

1) How can one add 95% confidence intervals around the mean using geom_ribbon? CIs can be computed as

#computation of the standard error of the mean
sem<-sd(x)/sqrt(length(x))
#95% confidence intervals of the mean
c(mean(x)-2*sem,mean(x)+2*sem)

2) How can one limit the vline to the region under the curve? You will see in the picture below that vline plots outside the curve.

Sample data very close to my real problem can be found at https://www.dropbox.com/s/bvvfdpgekbjyjh0/test.csv?dl=0

UPDATE

Using real data in the link above, I have tried the following using @beetroot's answer.

# Find the mean of each group
dat=me
library(dplyr)
library(plyr)
cdat <- ddply(data,.(direction,cond), summarise, rating.mean=mean(rating,na.rm=T))# summarize by season and variable
cdat

#ggplot
p=ggplot(data,aes(x = rating)) + 
  geom_density(aes(colour = cond),size=1.3,adjust=4)+
  facet_grid(.~direction, scales="free")+
  xlab(NULL) + ylab("Density")
p=p+coord_cartesian(xlim = c(0, 130))+scale_color_manual(name="",values=c("blue","#00BA38","#F8766D"))+
  scale_fill_manual(values=c("blue", "#00BA38", "#F8766D"))+
  theme(legend.title = element_text(colour="black", size=15, face="plain"))+
  theme(legend.text = element_text(colour="black", size = 15, face = "plain"))+
  theme(title = red.bold.italic.text, axis.title = red.bold.italic.text)+
  theme(strip.text.x = element_text(size=20, color="black",face="plain"))+ # facet labels
  ggtitle("SAMPLE A") +theme(plot.title = element_text(size = 20, face = "bold"))+
    theme(axis.text = blue.bold.italic.16.text)+ theme(legend.position = "none")+
  geom_vline(data=cdat, aes(xintercept=rating.mean, color=cond),linetype="dotted",size=1)
p

## implementing @beetroot's code to restrict lines under the curve and shade CIs around the mean
# I will use ddply for mean and CIs
cdat <- ddply(data,.(direction,cond), summarise, rating.mean=mean(rating,na.rm=T),
              sem = sd(rating,na.rm=T)/sqrt(length(rating)),
              ci.low = mean(rating,na.rm=T) - 2*sem,
              ci.upp = mean(rating,na.rm=T) + 2*sem)# summarize by direction and variable


#In order to limit the lines to the outline of the curves you first need to find out which y values
#of the curves correspond to the means, e.g. by accessing the density values with ggplot_build and 
#using approx:

   cdat.dens <- ggplot_build(ggplot(data, aes(x=rating, colour=cond)) +
                              facet_grid(.~direction, scales="free")+
                              geom_density(aes(colour = cond),size=1.3,adjust=4))$data[[1]] %>%
  mutate(cond = ifelse(group==1, "A",
                       ifelse(group==2, "B","C"))) %>%
  left_join(cdat) %>%
  select(y, x, cond, rating.mean, sem, ci.low, ci.upp) %>%
  group_by(cond) %>%
  mutate(dens.mean = approx(x, y, xout = rating.mean)[[2]],
         dens.cilow = approx(x, y, xout = ci.low)[[2]],
         dens.ciupp = approx(x, y, xout = ci.upp)[[2]]) %>%
  select(-y, -x) %>%
  slice(1)

 cdat.dens

#---
 #You can then combine everything with various geom_segments:

   ggplot(data, aes(x=rating, colour=cond)) +
   geom_density(data = data, aes(x = rating, colour = cond),size=1.3,adjust=4) +facet_grid(.~direction, scales="free")+
   geom_segment(data = cdat.dens, aes(x = rating.mean, xend = rating.mean, y = 0, yend = dens.mean, colour = cond),
                linetype = "dashed", size = 1) +
   geom_segment(data = cdat.dens, aes(x = ci.low, xend = ci.low, y = 0, yend = dens.cilow, colour = cond),
                linetype = "dotted", size = 1) +
   geom_segment(data = cdat.dens, aes(x = ci.upp, xend = ci.upp, y = 0, yend = dens.ciupp, colour = cond),
                linetype = "dotted", size = 1)

Gives this:

You will notice the mean and CIs are not aligned as in the original plot. What am I not doing right @beetroot?

To limit to the curve, you'll have to precalculate the densities. — Axeman, Feb 01 '17 at 08:35
Also, see http://stackoverflow.com/questions/12429333/how-to-shade-a-region-under-a-curve-using-ggplot2 — Axeman, Feb 01 '17 at 08:36
I just realised I didn't follow up on your questions.. did you eventually figure it out? — erc, Aug 29 '18 at 09:56
@beetroot the accepted solution below worked really well for my data. Thank you for following up on this. — code123, Aug 29 '18 at 19:10

score 6 · Accepted Answer · edited May 23 '17 at 12:24

Using the data from the link, you can calculate the mean, se and ci like so (I suggest using dplyr, the successor of plyr):

set.seed(1234)
dat <- data.frame(cond = factor(rep(c("A","B"), each=200)), 
                  rating = c(rnorm(200),rnorm(200, mean=.8)))

library(ggplot2)
library(dplyr)
cdat <- dat %>%
  group_by(cond) %>%
  summarise(rating.mean = mean(rating),
            sem = sd(rating)/sqrt(length(rating)),
            ci.low = mean(rating) - 2*sem,
            ci.upp = mean(rating) + 2*sem)

In order to limit the lines to the outline of the curves you first need to find out which y values of the curves correspond to the means, e.g. by accessing the density values with ggplot_build and using approx:

cdat.dens <- ggplot_build(ggplot(dat, aes(x=rating, colour=cond)) + geom_density())$data[[1]] %>%
  mutate(cond = ifelse(group == 1, "A", "B")) %>%
  left_join(cdat) %>%
  select(y, x, cond, rating.mean, sem, ci.low, ci.upp) %>%
  group_by(cond) %>%
  mutate(dens.mean = approx(x, y, xout = rating.mean)[[2]],
         dens.cilow = approx(x, y, xout = ci.low)[[2]],
         dens.ciupp = approx(x, y, xout = ci.upp)[[2]]) %>%
  select(-y, -x) %>%
  slice(1)

> cdat.dens
Source: local data frame [2 x 8]
Groups: cond [2]

   cond rating.mean        sem     ci.low     ci.upp dens.mean dens.cilow dens.ciupp
  <chr>       <dbl>      <dbl>      <dbl>      <dbl>     <dbl>      <dbl>      <dbl>
1     A -0.05775928 0.07217200 -0.2021033 0.08658471 0.3865929   0.403623  0.3643583
2     B  0.87324927 0.07120697  0.7308353 1.01566320 0.3979347   0.381683  0.4096153

You can then combine everything with various geom_segments:

ggplot() +
  geom_density(data = dat, aes(x = rating, colour = cond)) +
  geom_segment(data = cdat.dens, aes(x = rating.mean, xend = rating.mean, y = 0, yend = dens.mean, colour = cond),
             linetype = "dashed", size = 1) +
  geom_segment(data = cdat.dens, aes(x = ci.low, xend = ci.low, y = 0, yend = dens.cilow, colour = cond),
             linetype = "dotted", size = 1) +
  geom_segment(data = cdat.dens, aes(x = ci.upp, xend = ci.upp, y = 0, yend = dens.ciupp, colour = cond),
               linetype = "dotted", size = 1)

As Axeman pointed out you can create a polygon based on the ribbon area as explained in this answer.

So for your data you can subset and add the additional rows like so:

ribbon <- ggplot_build(ggplot(dat, aes(x=rating, colour=cond)) + geom_density())$data[[1]] %>%
  mutate(cond = ifelse(group == 1, "A", "B")) %>%
  left_join(cdat.dens) %>%
  group_by(cond) %>%
  filter(x >= ci.low & x <= ci.upp) %>%
  select(cond, x, y)

ribbon <- rbind(data.frame(cond = c("A", "B"), x = c(-0.2021033, 0.7308353), y = c(0, 0)), 
                as.data.frame(ribbon), 
                data.frame(cond = c("A", "B"), x = c(0.08658471, 1.01566320), y = c(0, 0)))

And add geom_polygon to the plot:

ggplot() +
  geom_polygon(data = ribbon, aes(x = x, y = y, fill = cond), alpha = .5) +
  geom_density(data = dat, aes(x = rating, colour = cond)) +
  geom_segment(data = cdat.dens, aes(x = rating.mean, xend = rating.mean, y = 0, yend = dens.mean, colour = cond),
             linetype = "dashed", size = 1) +
  geom_segment(data = cdat.dens, aes(x = ci.low, xend = ci.low, y = 0, yend = dens.cilow, colour = cond),
             linetype = "dotted", size = 1) +
  geom_segment(data = cdat.dens, aes(x = ci.upp, xend = ci.upp, y = 0, yend = dens.ciupp, colour = cond),
               linetype = "dotted", size = 1)

Here's the adapted code for your real data. It's just a bit tricky to incorporate two groups instead of one:

cdat <- dat %>%
  group_by(direction, cond) %>%
  summarise(rating.mean = mean(rating, na.rm = TRUE),
            sem = sd(rating, na.rm = TRUE)/sqrt(length(rating)),
            ci.low = mean(rating, na.rm = TRUE) - 2*sem,
            ci.upp = mean(rating, na.rm = TRUE) + 2*sem)

cdat.dens <- ggplot_build(ggplot(dat, aes(x=rating, colour=interaction(direction, cond))) + geom_density())$data[[1]] %>%
  mutate(cond = ifelse((group == 1 | group == 2 | group == 3 | group == 4), "A",
                        ifelse((group == 5 | group == 6 | group == 7 | group == 8), "B", "C")),
         direction = ifelse((group == 1 | group == 5 | group == 9), "EAST",
                            ifelse((group == 2 | group == 6 | group == 10), "NORTH",
                                   ifelse((group == 3 | group == 7 | group == 11), "SOUTH", "WEST")))) %>%
  left_join(cdat) %>%
  select(y, x, cond, direction, rating.mean, sem, ci.low, ci.upp) %>%
  group_by(cond, direction) %>%
  mutate(dens.mean = approx(x, y, xout = rating.mean)[[2]],
         dens.cilow = approx(x, y, xout = ci.low)[[2]],
         dens.ciupp = approx(x, y, xout = ci.upp)[[2]]) %>%
  select(-y, -x) %>%
  slice(1)

ggplot() +
  geom_density(data = dat, aes(x = rating, colour = cond)) +
  geom_segment(data = cdat.dens, aes(x = rating.mean, xend = rating.mean, y = 0, yend = dens.mean, colour = cond),
               linetype = "dashed", size = 1) +
  geom_segment(data = cdat.dens, aes(x = ci.low, xend = ci.low, y = 0, yend = dens.cilow, colour = cond),
               linetype = "dotted", size = 1) +
  geom_segment(data = cdat.dens, aes(x = ci.upp, xend = ci.upp, y = 0, yend = dens.ciupp, colour = cond),
               linetype = "dotted", size = 1) +
  facet_wrap(~direction)

wow! this looks amazing. I will try it on my data and let you know. Thanks again. — code123, Feb 01 '17 at 15:52
Please can you use your code on my data which I have now provided above? My actual data has `cond` `A`,`B`,`C` and an additional variable `direction` which is `east`,`west`,`south`,`north`. The ultimate aim is to display the densities using `facet_grid(.~direction, scales="free")` which I can definitely do. I have even more groups to anylyse but your answer to the newly provided data `test.csv` in the link above should get me started. Thanks. — code123, Feb 01 '17 at 20:38
Please include in your question a `dput()` of your data and, show what you have done and explain where you got stuck applying the code from my answer. — erc, Feb 02 '17 at 07:40
I just added a modified code showing what I have done so far with example plot compared to the initial plot. The data is too big to produce a `dput` but can be found on the libk above. Thanks for throwing more light on the analysis. — code123, Feb 03 '17 at 21:00
When I run `cdat <- dat %>% group_by(direction, cond) %>% summarise(rating.mean = mean(rating, na.rm = TRUE), sem = sd(rating, na.rm = TRUE)/sqrt(length(rating)), ci.low = mean(rating, na.rm = TRUE) - 2*sem, ci.upp = mean(rating, na.rm = TRUE) + 2*sem)` I get just `1 Obs of 4 variables`. Then when I run `cdat.dens` I get `Error: No common variables. Please specify by param.` Does summarize ignore `direction or cond`? — code123, Feb 04 '17 at 05:05

krassowski · Answer 2 · 2021-06-20T20:05:05.407

If you want to draw the mean line without building the plot object and without manipulating the data prior to plotting you can use stat_summary():

(
    ggplot(data = dat, aes(x = rating, colour = cond))
    + geom_density()
    + stat_summary(
        aes(y = rating, x = 0),
        geom = 'rect',
        fun.data = density_mean_line(dat$rating),
        key_glyph = "vline",
        size = 1
    )
)

giving:

where:

density_mean_line = function(values) {
    values_range = range(values, na.rm=TRUE)
    function(x) {
        density_data = StatDensity$compute_group(
            data.frame(x=x),
            scales=list(
                x=scale_x_continuous(limits = values_range)
            )
        )
        mean_x = mean(x)
        data.frame(
            xmin=mean_x,
            xmax=mean_x,
            ymin=0,
            ymax=approx(density_data$x, density_data$density, xout=mean_x)$y
        )
    }
}

And dat is defined as in erc's answer:

set.seed(1234)
dat <- data.frame(cond = factor(rep(c("A","B"), each=200)), 
                  rating = c(rnorm(200),rnorm(200, mean=.8)))

This technique can also be used to generate solid area (of the same colour as the density outline):

(
    ggplot(data = dat, aes(x = rating, colour = cond, group = cond))
    + stat_summary(
        aes(y = rating, x = 0, fill = cond),
        geom = 'rect',
        fun.data = density_ci(dat$rating),
        size=1
    )
    + stat_summary(
        aes(y = rating, x = 0),
        geom = 'rect',
        fun.data = density_mean_line(dat$rating),
        key_glyph = "vline",
        size = 0.5,
        color='grey20'
    )
    + geom_density()
)

where:

density_ci = function(values, resolution=100) {
    values_range = range(values, na.rm=TRUE)
    function(x) {
        density_data = StatDensity$compute_group(
            data.frame(x=x),
            scales=list(
                x=scale_x_continuous(limits = values_range)
            )
        )
        mean_x = mean(x)
        sem = sd(x) / sqrt(length(x))
        ci_lower = mean_x - 1.96 * sem
        ci_upper = mean_x + 1.96 * sem

        x_values = seq(ci_lower, ci_upper, length.out=resolution)

        data.frame(
            xmin=x_values,
            xmax=x_values,
            ymin=rep(0, resolution),
            ymax=approx(density_data$x, density_data$density, xout=x_values)$y
        )
    }
}

Add vline to geom_density and shade confidence interval of mean R

2 Answers2

Linked