I have a large number of measurements (N ≈ 500,000) from two laboratory tests that I want to plot using `geom_boxplot`. These measurements were created using three different analyzers.
However, many of these measurements are extreme outliers. They lead to extremely wide scales, which in turn makes the boxes too small to read or compare.
I created a dummy data set by sampling only 250 of these measurements to showcase the issue. The data are as follows:
# Reproducible example data: a data.frame with 250 rows and four columns,
# written out as a structure() literal so it can be pasted into a session.
#   result      - numeric measurement values
#   testName    - factor with levels "Test A", "Test B"
#   accountName - factor with levels "Lab 1" ... "Lab 6"
#   moduleCode  - factor with levels "X", "Y", "Z"
dummy <- structure(list(result = c(3.93, 2.708, 2.08, 1.8422, 0.897, 1.68,
3.56, 2.8954, 0.972, 2.99, 0.567, 2.01, 2.5629, 0.7958, 4.81,
3.539, 2.24, 4.79, 2.07, 5.56, 3.4761, 1.74, 1.5691, 0.964, 1.8171,
0.005, 0.663, 2.16, 2.37, 0.0164, 1.25, 3.086, 0.769, 1.9573,
1.05, 2.17, 1.6331, 8.2358, 0.58, 1.43, 0.9328, 1.94, 2.59, 5.06,
0.0574, 1.61, 1.01, 8.21, 2.77, 0.9938, 20.38, 3.71, 1.6731,
0.0259, 0.9701, 0.0114, 0.499, 38.8, 4.8689, 3.02, 1.176, 2.86,
1.96, 5.03, 0.7564, 0.903, 2.0017, 0.4928, 1.3993, 4.02, 1.97,
10.6109, 1.18, 1.68, 230, 1.9764, 3.81, 3.3518, 0.985, 3.4, 16.1,
1.5889, 3.13, 2.0168, 1.82, 4.75, 2.61, 2.0133, 1.1971, 0.4736,
74.1, 0.737, 5.21, 1.6495, 1.4, 3.7408, 0.68, 2.26, 2.5, 2.16,
0.459, 0.0281, 5.34, 1.3, 4.11, 1.9344, 3.9611, 2.79, 1.72, 8.9041,
1.47, 2.61, 3.02, 1.91, 3.49, 1.0161, 1.9, 1.63, 1.31, 1.81,
2.556, 0.972, 4.9, 8.313, 1.55, 0.875, 1.4379, 3.68, 0.716, 2.76,
2.1897, 0.3121, 1.4376, 2.56, 0.89, 3.0298, 0.6003, 1.2542, 1.61,
0.491, 3.08, 1.45, 1.94, 2.1503, 2.6605, 23.5, 1.54, 3.54, 4.22,
2.31, 1.082, 1.45, 1.77, 0.423, 11.9, 2.77, 4.8894, 0.8142, 0.158,
9.2012, 1.96, 0.467, 0.4081, 1.06, 2, 3.05, 2.81, 0.2151, 2.21,
0.95, 2.3647, 0.357, 1.7284, 1.31, 0.9586, 1.4548, 4.51, 0.022,
2.2629, 39.9014, 1.3403, 6.64, 4.62, 1.27, 1.18, 1.1, 0.565,
0.939, 1.9141, 3.855, 0.455, 3.14, 1.09, 1.0475, 0.971, 2.33,
1.16, 4.6919, 100, 0.2412, 2.53, 1.84, 1.04, 2.35, 4.89, 1.6384,
3.33, 1.82, 0.8973, 0.6061, 0.98, 6.18, 0.4258, 2.5555, 1.67,
5.37, 2.29, 2.93, 5.0596, 2.2328, 2.84, 7.73, 1.8, 2.3978, 3.02,
2.71, 0.618, 0.0035, 3.97, 0.9827, 2.3385, 5.07, 0.306, 3.13,
2.62, 2.81, 4.4749, 1.0362, 1.4896, 2.3907, 1.45, 2.2823, 2.3726,
2.1746, 4.08, 2.98, 2.57, 0.947, 2.16, 1.46), testName = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 2L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 2L, 1L, 1L, 1L,
1L, 2L, 2L, 1L, 2L, 1L, 2L, 2L, 1L, 1L, 1L, 2L, 1L, 2L, 2L, 2L,
1L, 1L, 1L, 1L, 1L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 2L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 2L, 1L,
1L, 2L, 1L, 2L, 1L, 1L, 1L, 2L, 1L, 1L, 2L, 1L, 2L, 1L, 1L, 1L,
1L, 1L, 2L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 2L, 1L, 2L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 1L,
2L, 2L, 1L, 2L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 2L, 2L, 1L, 2L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 2L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 2L, 1L, 1L,
1L, 1L, 2L, 1L, 1L, 2L, 2L, 2L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 2L,
2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 1L,
1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("Test A", "Test B"
), class = "factor"), accountName = structure(c(3L, 2L, 3L, 2L,
2L, 5L, 6L, 2L, 6L, 3L, 2L, 3L, 1L, 1L, 5L, 2L, 3L, 5L, 3L, 2L,
2L, 1L, 1L, 2L, 1L, 1L, 6L, 1L, 3L, 4L, 2L, 2L, 1L, 2L, 5L, 6L,
2L, 4L, 3L, 4L, 4L, 3L, 3L, 3L, 4L, 5L, 4L, 6L, 3L, 2L, 5L, 3L,
2L, 5L, 1L, 4L, 5L, 1L, 2L, 3L, 1L, 5L, 3L, 3L, 2L, 3L, 2L, 2L,
2L, 3L, 1L, 1L, 1L, 5L, 6L, 4L, 3L, 1L, 3L, 5L, 3L, 1L, 3L, 2L,
5L, 3L, 2L, 2L, 2L, 4L, 5L, 1L, 3L, 1L, 3L, 4L, 2L, 5L, 6L, 6L,
1L, 3L, 6L, 2L, 6L, 2L, 2L, 1L, 3L, 4L, 2L, 3L, 5L, 2L, 1L, 4L,
6L, 2L, 1L, 3L, 1L, 3L, 3L, 4L, 3L, 2L, 4L, 5L, 4L, 1L, 1L, 2L,
1L, 1L, 6L, 1L, 2L, 2L, 2L, 3L, 3L, 3L, 5L, 4L, 2L, 4L, 3L, 5L,
3L, 1L, 2L, 6L, 3L, 3L, 5L, 3L, 1L, 4L, 3L, 4L, 6L, 3L, 1L, 3L,
3L, 6L, 1L, 4L, 3L, 3L, 1L, 4L, 2L, 1L, 1L, 1L, 3L, 6L, 2L, 1L,
2L, 3L, 1L, 3L, 1L, 6L, 3L, 1L, 2L, 2L, 3L, 3L, 2L, 1L, 2L, 6L,
3L, 1L, 4L, 1L, 4L, 2L, 1L, 6L, 3L, 2L, 3L, 3L, 4L, 4L, 6L, 3L,
4L, 1L, 5L, 1L, 3L, 3L, 2L, 2L, 3L, 2L, 1L, 1L, 5L, 3L, 6L, 1L,
1L, 2L, 1L, 3L, 6L, 5L, 3L, 3L, 4L, 2L, 4L, 2L, 6L, 1L, 4L, 1L,
3L, 1L, 3L, 2L, 2L, 1L), .Label = c("Lab 1", "Lab 2", "Lab 3",
"Lab 4", "Lab 5", "Lab 6"), class = "factor"), moduleCode = structure(c(2L,
1L, 3L, 1L, 3L, 2L, 2L, 1L, 3L, 3L, 3L, 3L, 1L, 1L, 3L, 1L, 3L,
2L, 3L, 3L, 1L, 3L, 1L, 3L, 1L, 3L, 3L, 3L, 3L, 1L, 3L, 1L, 3L,
1L, 3L, 3L, 1L, 1L, 3L, 3L, 1L, 3L, 3L, 3L, 1L, 3L, 3L, 2L, 3L,
1L, 2L, 3L, 1L, 3L, 1L, 1L, 3L, 3L, 1L, 3L, 1L, 3L, 3L, 3L, 1L,
3L, 1L, 1L, 1L, 3L, 3L, 1L, 3L, 2L, 3L, 1L, 3L, 1L, 3L, 3L, 3L,
1L, 3L, 1L, 3L, 3L, 3L, 1L, 1L, 1L, 3L, 1L, 3L, 1L, 3L, 1L, 1L,
3L, 2L, 2L, 3L, 3L, 2L, 3L, 3L, 1L, 1L, 3L, 3L, 1L, 3L, 3L, 2L,
3L, 3L, 1L, 3L, 3L, 3L, 3L, 1L, 3L, 3L, 1L, 3L, 3L, 1L, 2L, 1L,
3L, 1L, 1L, 1L, 3L, 2L, 1L, 1L, 1L, 3L, 3L, 3L, 3L, 2L, 1L, 1L,
3L, 2L, 2L, 3L, 3L, 1L, 2L, 3L, 3L, 3L, 3L, 1L, 1L, 3L, 1L, 2L,
3L, 1L, 3L, 3L, 2L, 3L, 1L, 3L, 3L, 1L, 1L, 1L, 3L, 1L, 1L, 3L,
2L, 1L, 1L, 1L, 3L, 3L, 3L, 3L, 3L, 3L, 1L, 1L, 1L, 3L, 3L, 3L,
1L, 3L, 2L, 3L, 1L, 1L, 1L, 3L, 3L, 3L, 2L, 3L, 1L, 3L, 2L, 1L,
1L, 3L, 3L, 1L, 1L, 2L, 3L, 3L, 3L, 1L, 1L, 2L, 3L, 3L, 1L, 2L,
3L, 3L, 1L, 3L, 1L, 1L, 3L, 3L, 2L, 3L, 3L, 1L, 1L, 1L, 1L, 3L,
1L, 1L, 1L, 3L, 3L, 3L, 3L, 3L, 3L), .Label = c("X", "Y", "Z"
), class = "factor")), .Names = c("result", "testName", "accountName",
"moduleCode"), class = "data.frame", row.names = c(NA, -250L))
Using this code
library(ggplot2)
# Dodged box plots of `result` for each `accountName`, filled by `moduleCode`,
# with one facet row per `testName` and a free y scale in every facet.
boxplot <- ggplot(
  data = dummy,
  mapping = aes(x = accountName, y = result, fill = moduleCode)
) +
  geom_boxplot(position = position_dodge(width = 0.85), outlier.alpha = 0.2) +
  facet_wrap(~testName, scales = "free_y", nrow = 2) +
  labs(fill = "Module") +
  theme_bw() +
  # All overrides of theme_bw() gathered in one call; each element is set once.
  theme(
    axis.ticks.x = element_line(color = "black"),
    axis.ticks.y = element_line(color = "black"),
    axis.text.x = element_text(
      size = 10, angle = 55, hjust = 1, vjust = 0.975, color = "black"
    ),
    axis.text.y = element_text(color = "black"),
    panel.background = element_blank(),
    axis.title.x = element_blank(),
    axis.title.y = element_blank(),
    axis.line = element_line(color = "black", size = 0.5, linetype = "solid"),
    panel.grid.major.y = element_line(color = "#bdbdbd", linetype = "dotted"),
    panel.grid.minor.y = element_line(color = "#bdbdbd", linetype = "dotted"),
    plot.margin = unit(c(0.5, 0.5, 0.5, 0.7), "cm")
  )
print(boxplot)
I created this plot
NOW, the question is: How can I scale the y-axis of the facets independently, and manually - best case, so that it works generically, since I wanted to write a script that can handle comparison of two random tests?
In general, I want the range of the scales to be much smaller than the complete data's range, i.e., I want to focus on the range of the lower and upper extremes of the boxplot whiskers only for each of the facets independently. Those can be obtained using
# Elements 1 and 5 of `stats` are the lower and upper whisker extremes.
boxplot.stats(dummy[["result"]])[["stats"]][c(1L, 5L)]
I'm not looking for the `scales='free_y'` argument. Also, `scale_y_continuous` and `coord_cartesian` won't do the job because, from what I understand, they set the scales globally for all facets, and the individual tests shown in the facets can vary greatly. I tried adding empty layers using `geom_blank` with dummy data that is limited in range to the lower and upper whisker extremes. The problem is that, depending on the tests, there may always be extreme outliers in the real data that still make the scales very wide, so the range of the dummy data plugged into `geom_blank` is effectively ignored.
I also tried extracting and manipulating the scales after creating the graph object, using the function `ggplot_build`:
# Build the plot, then overwrite the stored y range of panel 1 ("Test A") and
# panel 2 ("Test B") with the whisker extremes of the matching test's results.
graph_object <- ggplot_build(boxplot)
graph_object$layout$panel_scales$y[[1]]$range$range <-
  boxplot.stats(dummy$result[dummy$testName == "Test A"])$stats[c(1, 5)]
graph_object$layout$panel_scales$y[[2]]$range$range <-
  boxplot.stats(dummy$result[dummy$testName == "Test B"])$stats[c(1, 5)]
I then tried plotting the manipulated object using `print(graph_object)` and `graph_object$plot`. However, the scales are still the 'original', wider ones that include the extreme values. Am I missing anything?
Sorry for the very long post, I might have gotten lost somewhere on the way trying to solve this and overlooked the solution. Therefore, I would be really happy to see some ideas or even a proper solution to this.
Thanks a lot!