1

I have a data frame:

# Example data in long format (this looks like dput() output): 20 rows with
# the columns Sample, Genotype, Tissue, variable and value. Genotype, Tissue
# and variable are factors. Every row carries Tissue level 2 ("iLN"), and
# only the first two levels of `variable` occur in these rows; the remaining
# levels are unused here.
df <- structure(list(Sample = c("1: FL_643", "2: FL_645", "3: FL_647","4: FL_656", "5: FL_658", "6: cKO_644", "7: cKO_646", "8: cKO_654","9: cKO_655", "10: cKO_657", "1: FL_643", "2: FL_645", "3: FL_647", "4: FL_656", "5: FL_658", "6: cKO_644", "7: cKO_646", "8: cKO_654", "9: cKO_655", "10: cKO_657"), Genotype = structure(c(1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L), .Label = c("miR-15/16 FL", "miR-15/16 cKO"), class = "factor"), 
Tissue = structure(c(2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("Thymus", 
"iLN", "Spleen", "Skin", "Colon"), class = "factor"), variable = structure(c(1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 
2L, 2L, 2L, 2L), .Label = c("Cells/SC/Live/CD8—,, CD4+,Freq. of Parent", 
"Cells/SC/Live/CD8—,, CD4+/Foxp3+,Freq. of Parent", "Cells/SC/Live/CD8—,, CD4+/Foxp3+,Median,<BV421-A>,CD127", 
"Cells/SC/Live/CD8—,, CD4+/Foxp3+/CD25+,Freq. of Parent", 
"Cells/SC/Live/CD8—,, CD4+/Foxp3+/CD25-,Freq. of Parent", 
"Cells/SC/Live/CD8—,, CD4+/Foxp3-,Freq. of Parent", "Cells/SC/Live/CD8—,, CD4+/Foxp3-,Median,<BV421-A>,CD127", 
"Cells/SC/Live/CD8—,, CD4+/Foxp3-/CD62L—,, CD44—,Freq. of Parent", 
"Cells/SC/Live/CD8—,, CD4+/Foxp3-/CD62L—,, CD44+,Freq. of Parent", 
"Cells/SC/Live/CD8—,, CD4+/Foxp3-/CD62L+,, CD44—,Freq. of Parent", 
"Cells/SC/Live/CD8—,, CD4+/Foxp3-/CD62L+,, CD44+,Freq. of Parent", 
"Cells/SC/Live/CD8—,, CD4+/Foxp3-/CD44+,Freq. of Parent", 
"Cells/SC/Live/CD8+,, CD4—,Freq. of Parent", "Cells/SC/Live/CD8+,, CD4—,Median,<BV421-A>,CD127", 
"Cells/SC/Live/CD8+,, CD4—/CD62L—,, CD44—,Freq. of Parent", 
"Cells/SC/Live/CD8+,, CD4—/CD62L—,, CD44+,Freq. of Parent", 
"Cells/SC/Live/CD8+,, CD4—/CD62L+,, CD44—,Freq. of Parent", 
"Cells/SC/Live/CD8+,, CD4—/CD62L+,, CD44+,Freq. of Parent", 
"Cells/SC/Live/CD8+,, CD4—/CD62L+,, CD44+,Freq. of Parent_1", 
"Cells/SC/Live,Count", "Cells/SC/Live/CD8—,, CD4+,Count", 
"Cells/SC/Live/CD8—,, CD4+/Foxp3+,Count", "Cells/SC/Live/CD8—,, CD4+/Foxp3+/CD25+,Count", 
"Cells/SC/Live/CD8—,, CD4+/Foxp3+/CD25-,Count", "Cells/SC/Live/CD8—,, CD4+/Foxp3-,Count", 
"Cells/SC/Live/CD8—,, CD4+/Foxp3-/CD62L—,, CD44—,Count", 
"Cells/SC/Live/CD8—,, CD4+/Foxp3-/CD62L—,, CD44+,Count", 
"Cells/SC/Live/CD8—,, CD4+/Foxp3-/CD62L+,, CD44—,Count", 
"Cells/SC/Live/CD8—,, CD4+/Foxp3-/CD62L+,, CD44+,Count", 
"Cells/SC/Live/CD8+,, CD4—,Count", "Cells/SC/Live/CD8+,, CD4—/CD62L—,, CD44—,Count", 
"Cells/SC/Live/CD8+,, CD4—/CD62L—,, CD44+,Count", "Cells/SC/Live/CD8+,, CD4—/CD62L+,, CD44—,Count", 
"Cells/SC/Live/CD8+,, CD4—/CD62L+,, CD44+,Count"), class = "factor"), 
value = c(41.2, 35.5, 39.5, 33.2, 39.1, 35.5, 35.7, 33.9, 
39.7, 42.4, 10.9, 12.1, 10.9, 12.5, 12.3, 12.8, 14.1, 15.8, 
14.6, 12.5)), .Names = c("Sample", "Genotype", "Tissue", "variable", "value"), row.names = c(NA, -20L), class = "data.frame")

I am using the following function to plot various combinations of the data:

library(ggplot2)
library(ggpubr)
# Draw dodged box plots (with the individual points overlaid) of `value` for
# each `variable`, coloured and shaped by `Genotype`, and annotate every
# variable with a significance label plus a bracket underneath it.
#
# The data come from a data frame named `temp` that must be visible from the
# calling environment (it is not an argument). It needs the columns Genotype,
# Tissue, variable and value. Only two colours/shapes are defined, so the
# selected rows should contain at most two genotypes.
#
# Arguments:
#   Tissue          Not used by the body; kept so existing calls keep working.
#                   The x-axis title is taken from the Tissue column instead.
#   row_add         Row indices of `temp` to plot (default: every row).
#   y.lab           Y-axis title; wrapped at 40 characters.
#   font_choice     Font family for the plot text and significance labels.
#   font_size       Base text size.
#   stat_test       Passed to stat_compare_means(method = ).
#   p_display       Passed to stat_compare_means(label = ).
#   legend_position Passed to theme(legend.position = ).
#
# Returns the ggplot object.
plot_it <- function(Tissue,
                    row_add = seq_len(nrow(temp)),
                    y.lab = "Did you forget to add a label?",
                    font_choice = "Helvetica",
                    font_size = 12,
                    stat_test = "t.test",
                    p_display = "p.signif",
                    legend_position = c("right")) {

  # Subset a local copy of `temp`; the object outside the function is untouched
  rownames(temp) <- NULL
  df <- droplevels(temp[row_add, ])
  rownames(df) <- NULL
  if (nrow(df) == 0L) {
    stop("`row_add` selects no rows of `temp`.", call. = FALSE)
  }

  # Colour and shape per genotype, named in order of first appearance
  genotypes <- as.character(unique(df$Genotype))
  color.groups <- c("black", "red")
  names(color.groups) <- genotypes
  shape.groups <- c(16, 1)
  names(shape.groups) <- genotypes

  # Reference y-values (per-variable maximum) for the p-value labels and
  # bracket positions. dplyr is called by namespace so the function does not
  # depend on the package being attached.
  dmax <- dplyr::summarise(
    dplyr::group_by(df, variable),
    value = max(value, na.rm = TRUE),
    Genotype = NA
  )

  # For tweaking position of brackets
  e <- max(dmax$value) * 0.1
  r <- 0.6
  w <- 0.19
  bcol <- "black"

  # Axis titles: wrap the y label, and collapse the Tissue column to its
  # distinct values so a single string (not one per row) becomes the x title
  y.lab <- stringr::str_wrap(y.lab, width = 40)
  x.lab <- paste(unique(as.character(df$Tissue)), collapse = ", ")
  y.max <- 1.2 * max(df$value, na.rm = TRUE)

  ggplot(df, aes(x = variable, y = value, color = Genotype, shape = Genotype)) +
    # geom_violin(position = position_dodge(width = 0.75)) +
    # outlier.shape = NA hides the box plot's own outlier markers; every
    # observation is already drawn by geom_point() below
    geom_boxplot(position = position_dodge(width = 0.75), outlier.shape = NA) +
    geom_point(position = position_dodge(width = 0.75), size = 2) +
    ylim(0, y.max) + ylab(y.lab) + xlab(x.lab) +
    scale_color_manual(values = color.groups) +
    scale_shape_manual(values = shape.groups) +
    scale_x_discrete(labels = function(x) stringr::str_wrap(x, width = 20)) +
    theme_bw() +
    theme(panel.border = element_blank(),
          panel.grid.major = element_blank(),
          panel.grid.minor = element_blank(),
          axis.line = element_line(colour = "black"),
          aspect.ratio = 1,
          text = element_text(family = font_choice, size = font_size),
          legend.position = legend_position) +
    stat_compare_means(show.legend = FALSE, label = p_display,
                       method = stat_test,
                       label.y = e + dmax$value, family = font_choice) +
    # Bracket: a short horizontal segment centred on each variable
    geom_segment(data = dmax,
                 aes(x = as.numeric(variable) - w,
                     xend = as.numeric(variable) + w,
                     y = value + r * e, yend = value + r * e),
                 size = 0.3, color = bcol, inherit.aes = FALSE)
}

I call the function like this to make the plot:

# Plot rows 1-30 and 141-150 of `temp`
plot_it(Tissue = "Thymus", row_add = c(1:30, 141:150))

This generates this plot:

enter image description here

I would like to have the function create a facet wrap that would effectively convert the y axis into two segments so that the scales of the two segments allow for better data visualization. As it is, when I make a plot with variables that have highly disparate values, the scale is inappropriate for all of them.

If this is not possible, then is there an easy way to manually introduce a facet wrap on a per plot basis to break up the y axis?

John Gagnon
  • 825
  • 1
  • 8
  • 20

1 Answer

2

You could use clustering to keep groups with similar means together. However, the graphs may be misleading because of the differing scales.

In the example below, I've used fake data, as the sample data has only two groups.

library(tidyverse)

# Simulated data: five groups (A-E), each with two sub-groups (a, b) of ten
# normally distributed observations (sd 5) around a group-specific mean
set.seed(2)
dat <- data.frame(
  group = rep(LETTERS[1:5], each = 20),
  sub = rep(rep(letters[1:2], each = 10), times = 5),
  value = rnorm(n = 100, mean = rep(c(20, 17, 27, 56, 80), each = 20), sd = 5)
)

# Attach each group's mean to its rows, then assign every row to one of two
# facet groups by running k-means (two centres) on those means
dat <- dat %>%
  group_by(group) %>%
  mutate(mean = mean(value)) %>%
  ungroup() %>%
  mutate(facet_group = kmeans(mean, centers = 2)$cluster)

# One panel per facet group, each with its own free scales; the strip labels
# are blanked out
ggplot(dat, aes(x = group, y = value, colour = sub)) +
  geom_boxplot() +
  facet_wrap(~ facet_group, scales = "free", ncol = 2) +
  expand_limits(y = 0) +
  theme_classic() +
  theme(strip.background = element_blank(), strip.text = element_blank())

enter image description here

Or a more complex layout option:

library(gridExtra)

# Build one box plot per facet group and collect the plots in a list
plots <- lapply(unique(dat$facet_group), function(fg) {
  ggplot(dat[dat$facet_group == fg, ],
         aes(x = group, y = value, colour = sub)) +
    geom_boxplot() +
    expand_limits(y = 0) +
    labs(y = "", x = "") +
    theme_bw() +
    theme(strip.background = element_blank(), strip.text = element_blank())
})

# Extract the legend grob from a ggplot.
#
# a.gplot: a ggplot object that draws a legend.
#
# Returns the first non-empty grob whose name starts with "guide-box".
# Stops with an explicit error when the plot has no such grob, instead of
# failing on an empty subscript.
g_legend <- function(a.gplot) {
  tmp <- ggplot_gtable(ggplot_build(a.gplot))
  # vapply() keeps the result a character vector whatever the grob list holds
  grob_names <- vapply(
    tmp$grobs,
    function(x) as.character(x$name)[1L],
    character(1)
  )
  # Skip empty placeholder grobs so only a real legend is picked up
  is_placeholder <- vapply(tmp$grobs, inherits, logical(1), what = "zeroGrob")
  leg <- which(startsWith(grob_names, "guide-box") & !is_placeholder)
  if (length(leg) == 0L) {
    stop("No legend found in `a.gplot`.", call. = FALSE)
  }
  # Take the first match; indexing [[ ]] with several positions would recurse
  tmp$grobs[[leg[1L]]]
}

# Take the shared legend from the first plot
leg <- g_legend(plots[[1]])

# Lay out the plots and the legend: the plots side by side (3:2 widths) with
# their own colour legends switched off, and the shared legend in a narrow
# column on the right. "none" is used instead of FALSE to drop the legend,
# since FALSE is deprecated in current ggplot2.
grid.arrange(
  arrangeGrob(
    grobs = lapply(plots, function(p) p + guides(colour = "none")),
    widths = c(3, 2)
  ),
  leg,
  widths = c(10, 1), left = "Value", bottom = "Group"
)

enter image description here

eipi10
  • 91,525
  • 24
  • 209
  • 285
  • Thanks, yeah that's definitely a solid option. However, for the particular types of graphs I would like to make, I would prefer to have a single x axis and segmented y to account for large differences in scale of points though. – John Gagnon Sep 29 '17 at 00:17
  • 2
    There's no natural way to do it in ggplot2 and (IMHO) it's an even worse approach than the one I used above. However, if you insist...[here is a broken axis option](https://stackoverflow.com/a/33944553/496488) I came up with a while back. – eipi10 Sep 29 '17 at 00:50
  • Out of curiosity, why do you say that it is a worse approach? With this particular plot, each of those subsets represents a fraction of 100%. I would use stacked bar graphs, but they make it harder to visualize the differences between the two genotypes. Interleaving them as box plots makes it easier to see which populations are different, but separating them into different facets with different x axes makes it less clear that the 4 populations in my example are intrinsically linked. – John Gagnon Sep 29 '17 at 01:50
  • I don't really like either approach because the point of the graph is to map magnitudes to lengths so that you can visually compare various groups (in this case different tissues) in a consistent way. A broken axis or two separate graphs with different y-ranges results in a given length representing different magnitudes for different tissues, subverting the goal of the visualization. – eipi10 Sep 29 '17 at 03:35
  • Have you considered a log scale? I'm actually not wild about that either in the case of a box plot, but it's another option to throw into the mix: `ggplot(dat, aes(group, value, colour=sub)) + geom_boxplot() + theme_bw() + scale_y_log10(limits=c(1,100), breaks=c(1:10,seq(10,100,10))) + annotation_logticks(sides="l")`. – eipi10 Sep 29 '17 at 03:39
  • Thanks for the suggestions. I thought about log scale but since what I'm plotting are percents, I think log scale looks a little funny. – John Gagnon Sep 29 '17 at 22:21