0

I am trying to add group-wise lines on a plot, where the values are calculated from another dataframe.

My data looks like this:

demo_df <- structure(list(Sample_Run = structure(c(3L, 4L, 2L, 1L, 5L, 3L,
1L, 4L, 5L, 2L, 3L, 2L, 1L, 5L, 4L), .Label = c("Sample1", "Sample2",
"Sample3", "Sample4", "Sample5", "Sample6", "Sample7", "Sample8",
"Sample9", "Sample10", "Sample11", "Sample12", "Sample13", "Sample14",
"Sample15", "Sample16", "Sample17", "Sample18", "Sample19", "Sample20"
), class = "factor"), MUT_ID = structure(c(1L, 1L, 1L, 1L, 1L,
3L, 3L, 3L, 3L, 3L, 2L, 2L, 2L, 2L, 2L), .Label = c("AKT1 c.49G>A",
"APC c.4348C>T", "APC c.4666_4667insA", "ATM c.1058_1059delGT",
"BRAF c.1799T>A", "CTNNB1 c.121A>G", "EGFR c.2236_2250del15",
"EGFR c.2310_2311insGGT", "EGFR c.2369C>T", "EGFR c.2573T>G",
"ERBB2 c.2324_2325ins12", "FGFR3 c.746C>G", "FLT3 c.2503G>T",
"GNA11 c.626A>T", "GNAQ c.626A>C", "GNAS c.601C>T", "JAK2 c.1849G>T",
"KIT c.2447A>T", "KRAS c.35G>A", "MPL c.1544G>T", "NPM1 c.863_864insTCTG",
"NRAS c.182A>G", "PDGFRA c.1694_1695insA", "PDGFRA c.2525A>T",
"PIK3CA c.1633G>A", "PIK3CA c.3140A>G", "PIK3CA c.3204_3205insA",
"PTEN c.741_742insA", "PTEN c.800delA", "RET c.2753T>C", "SMAD4 c.1394_1395insT",
"TP53 c.524G>A", "TP53 c.723delC", "TP53 c.743G>A", "TP53 c.818G>A"
), class = "factor"), FREQ = c(0.091, 0.077, 0.09, 0.096, 0.114,
0.081, 0.071, 0.076, 0.084, 0.083, 0.08, 0.082, 0.087, 0.085,
0.094)), .Names = c("Sample_Run", "MUT_ID", "FREQ"), row.names = c(1L,
4L, 5L, 7L, 8L, 46L, 47L, 48L, 50L, 51L, 91L, 93L, 94L, 96L,
97L), class = "data.frame")

demo_sd <- aggregate(demo_df[["FREQ"]], by=list(as.factor(demo_df[["MUT_ID"]])), FUN=sd)
names(demo_sd) <- c("MUT_ID", "sd")

demo_mean <- aggregate(demo_df[["FREQ"]], by=list(as.factor(demo_df[["MUT_ID"]])), FUN=mean)
names(demo_mean) <- c("MUT_ID", "mean")

demo_mean_sd <- merge(x = demo_sd, y = demo_mean)
demo_mean_sd[["sd_interval_upper"]] <- demo_mean_sd[["mean"]] + (2 * demo_mean_sd[["sd"]])
demo_mean_sd[["sd_interval_lower"]] <- demo_mean_sd[["mean"]] - (2 * demo_mean_sd[["sd"]])

output:

> demo_df
   Sample_Run              MUT_ID  FREQ
1     Sample3        AKT1 c.49G>A 0.091
4     Sample4        AKT1 c.49G>A 0.077
5     Sample2        AKT1 c.49G>A 0.090
7     Sample1        AKT1 c.49G>A 0.096
8     Sample5        AKT1 c.49G>A 0.114
46    Sample3 APC c.4666_4667insA 0.081
47    Sample1 APC c.4666_4667insA 0.071
48    Sample4 APC c.4666_4667insA 0.076
50    Sample5 APC c.4666_4667insA 0.084
51    Sample2 APC c.4666_4667insA 0.083
91    Sample3       APC c.4348C>T 0.080
93    Sample2       APC c.4348C>T 0.082
94    Sample1       APC c.4348C>T 0.087
96    Sample5       APC c.4348C>T 0.085
97    Sample4       APC c.4348C>T 0.094

> demo_mean_sd
               MUT_ID          sd   mean sd_interval_upper sd_interval_lower
1        AKT1 c.49G>A 0.013390295 0.0936        0.12038059        0.06681941
2       APC c.4348C>T 0.005412947 0.0856        0.09642589        0.07477411
3 APC c.4666_4667insA 0.005431390 0.0790        0.08986278        0.06813722

I can make the base plot like this:

library("ggplot2")
ggplot(data =  demo_df, 
       aes(y = FREQ, x = Sample_Run, color = MUT_ID, group = MUT_ID) ) + 
    geom_point() +
    geom_line(alpha = 0.3) +
    facet_grid(MUT_ID~.) +
    scale_y_continuous(limits = c(0, NA))

Looks like this:

example_plot

But I need to add lines for the standard deviation, and mean, from the demo_mean_sd dataframe. It should look something like this:

example_plot2

However, I keep hitting problems trying to get the lines to draw on the plot, due to the lack of a common x axis. For example, I tried something like this:

ggplot(data =  demo_df, 
       aes(y = FREQ, x = Sample_Run, color = MUT_ID, group = MUT_ID) ) + 
    geom_point() +
    geom_line(alpha = 0.3) +
    facet_grid(MUT_ID~.) +
    scale_y_continuous(limits = c(0, NA)) +
    geom_ribbon(data = demo_mean_sd, aes(ymin = sd_interval_lower, ymax = sd_interval_upper))

error:

Error in eval(expr, envir, enclos) : object 'Sample_Run' not found

I have not been able to figure out how to do this with other plot types either due to the faceting.

user5359531
  • 3,217
  • 6
  • 30
  • 55
  • possibly related? https://stackoverflow.com/questions/12033319/plot-mean-and-sd-of-dataset-per-x-value-using-ggplot2 – Matias Andina Apr 19 '18 at 22:32
  • For `geom_ribbon` you are trying to add the ribbon to the plots without specifying the x-axis (Sample_Run). You are only specifying the y-values. – steveb Apr 19 '18 at 22:43
  • I'd suggest that the x-axis should be `MUT_ID`, use `geom_jitter` to show the samples as points and `stat_summary` to add mean + sd, don't use a ribbon. And don't create separate data frames; ggplot2 works best when all data is in one long format frame. – neilfws Apr 19 '18 at 22:50

2 Answers2

2

One approach is to merge your data frames before creating the plot. You can merge on MUT_ID using dplyr with the following:

demo_df_merged <-
  demo_df %>%
  left_join(., demo_mean_sd, by = "MUT_ID")

Once merged, you can plot the upper and lower bounds, as well as the mean, normally using geom_line

ggplot(data =  demo_df_merged, 
       aes(x = Sample_Run, color = MUT_ID, group = MUT_ID) ) + 
  geom_point(aes(y = FREQ)) +
  geom_line(alpha = 0.3, aes(y = FREQ)) +
  geom_line(aes(y = mean)) +
  geom_line(size = 2, aes(y = sd_interval_upper)) +
  geom_line(size = 2, aes(y = sd_interval_lower)) +
  facet_grid(MUT_ID~.) +
  scale_y_continuous(limits = c(0, NA))

Example plot with lines

steveb
  • 5,382
  • 2
  • 27
  • 36
abanker
  • 74
  • 3
2

First, ggplot2 works best when all data is in one long-format dataframe, so I would avoid creating more data frames in the first instance.

Second, I would visualise this data differently. Ribbons suggest some sort of progression along the x-axis, usually time, which is not the case here. If you're interested in means of replicates for a mutation, I'd suggest that the x-axis should be MUT_ID, the replicates should be points, then use stat_summary to add the mean with error bars.

Something like this:

library(tidyverse)
# mean_sdl gives you the mean + 2 SD
demo_df %>% 
  ggplot(aes(MUT_ID, FREQ)) + 
    geom_jitter(aes(color = Sample_Run), 
                width = 0.2, 
                size = 2) + 
    stat_summary(fun.data = mean_sdl, 
                 geom = "errorbar", 
                 width = 0.2) + 
    stat_summary(fun.y = mean, 
                 geom = "point", 
                 fill = "red", 
                 size = 4, 
                 shape = 23) + 
    theme_bw()

enter image description here

neilfws
  • 32,751
  • 5
  • 50
  • 63