3

How can you view the id of the outliers in a boxplot?

structure(list(pot = c(1L, 2L, 3L, 4L, 21L, 22L, 23L, 24L, 5L, 
6L, 7L, 8L, 25L, 26L, 27L, 28L, 9L, 10L, 11L, 12L, 29L, 30L, 
31L, 32L, 13L, 14L, 15L, 16L, 33L, 34L, 35L, 36L, 17L, 18L, 19L, 
20L, 37L, 38L, 39L, 40L, 41L, 42L, 43L, 44L, 61L, 62L, 63L, 64L, 
45L, 46L, 47L, 48L, 65L, 66L, 67L, 68L, 49L, 50L, 51L, 52L, 69L, 
70L, 71L, 72L, 53L, 54L, 55L, 56L, 73L, 74L, 75L, 76L, 57L, 58L, 
59L, 60L, 77L, 78L, 79L, 80L, 81L, 82L, 83L, 84L, 101L, 102L, 
103L, 104L, 85L, 86L, 87L, 88L, 105L, 106L, 107L, 108L, 89L, 
90L, 91L, 92L, 109L, 110L, 111L, 112L, 93L, 94L, 95L, 96L, 113L, 
114L, 115L, 116L, 97L, 98L, 99L, 100L, 117L, 118L, 119L, 120L, 
121L, 122L, 123L, 124L, 141L, 142L, 143L, 144L, 125L, 126L, 127L, 
128L, 145L, 146L, 147L, 148L, 129L, 130L, 131L, 132L, 149L, 150L, 
151L, 152L, 133L, 134L, 135L, 136L, 153L, 154L, 155L, 156L, 137L, 
138L, 139L, 140L, 157L, 158L, 159L, 160L), rep = c(1L, 2L, 3L, 
4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L, 
4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L, 
4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L, 
4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L, 
4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L, 
4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L, 
4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L, 
4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L, 
4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L, 
4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L), cultivar = structure(c(1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 
4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 
4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 2L, 
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 
2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L), .Label = c("Dinninup", 
"Riverina", "Seaton Park", "Yarloop"), class = "factor"), Waterlogging = structure(c(2L, 
2L, 2L, 2L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 2L, 
2L, 2L, 2L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 2L, 
2L, 2L, 2L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 2L, 
2L, 2L, 2L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 2L, 
2L, 2L, 2L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 2L, 
2L, 2L, 2L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 2L, 
2L, 2L, 2L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 2L, 
2L, 2L, 2L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 2L, 
2L, 2L, 2L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 2L, 
2L, 2L, 2L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L), .Label = c("Non-waterlogged", 
"Waterlogged"), class = "factor"), P = c(12.1, 12.1, 12.1, 12.1, 
12.1, 12.1, 12.1, 12.1, 15.17, 15.17, 15.17, 15.17, 15.17, 15.17, 
15.17, 15.17, 18.24, 18.24, 18.24, 18.24, 18.24, 18.24, 18.24, 
18.24, 24.39, 24.39, 24.39, 24.39, 24.39, 24.39, 24.39, 24.39, 
48.35, 48.35, 48.35, 48.35, 48.35, 48.35, 48.35, 48.35, 12.1, 
12.1, 12.1, 12.1, 12.1, 12.1, 12.1, 12.1, 15.17, 15.17, 15.17, 
15.17, 15.17, 15.17, 15.17, 15.17, 18.24, 18.24, 18.24, 18.24, 
18.24, 18.24, 18.24, 18.24, 24.39, 24.39, 24.39, 24.39, 24.39, 
24.39, 24.39, 24.39, 48.35, 48.35, 48.35, 48.35, 48.35, 48.35, 
48.35, 48.35, 12.1, 12.1, 12.1, 12.1, 12.1, 12.1, 12.1, 12.1, 
15.17, 15.17, 15.17, 15.17, 15.17, 15.17, 15.17, 15.17, 18.24, 
18.24, 18.24, 18.24, 18.24, 18.24, 18.24, 18.24, 24.39, 24.39, 
24.39, 24.39, 24.39, 24.39, 24.39, 24.39, 48.35, 48.35, 48.35, 
48.35, 48.35, 48.35, 48.35, 48.35, 12.1, 12.1, 12.1, 12.1, 12.1, 
12.1, 12.1, 12.1, 15.17, 15.17, 15.17, 15.17, 15.17, 15.17, 15.17, 
15.17, 18.24, 18.24, 18.24, 18.24, 18.24, 18.24, 18.24, 18.24, 
24.39, 24.39, 24.39, 24.39, 24.39, 24.39, 24.39, 24.39, 48.35, 
48.35, 48.35, 48.35, 48.35, 48.35, 48.35, 48.35), total = c(3.66, 
2.02, 1.59, 1.67, 2.12, 2.46, 1.79, 2.09, 2.03, 2.13, 1.83, 2.34, 
2.66, 2.2, 1.79, 1.97, 2.17, 2.44, 1.49, 2.19, 2.92, 2.43, 1.58, 
2.07, 2.48, 2.49, 1.69, 2.1, 2.38, 2.52, 2.41, 2.46, 2.22, 2.07, 
1.97, 2.3, 2.48, 3.16, 1.76, 2.38, 2.81, 2.64, 2.59, 3.28, 3.18, 
2.57, 2.9, 3, 2.38, 2.72, 2.58, 2.73, 3.06, 3.01, 3.01, 2.77, 
2.95, 2.36, 2.91, 2.38, 3.33, 3.19, 3.17, 3.16, 3.16, 3.2, 2.58, 
3.71, 3.11, 2.7, 2.92, 1.93, 2.95, 2.57, 2.68, 2.48, 3.34, 2.75, 
2.52, 1.88, 1.19, 0.57, 0.64, 0.66, 1.13, 1.28, 0.85, 0.96, 1.34, 
2.14, 0.63, 1.27, 1.13, 0.64, 1.21, 1.95, 1.11, 0.91, 0.75, 0.63, 
1.06, 1.07, 1.05, 0.8, 1.41, 1.13, 0.75, 0.89, 1.98, 1.27, 1.01, 
1, 1.16, 0.64, 0.64, 1.02, 1.03, 1.13, 0.79, 0.6, 3.88, 2.79, 
2.73, 2.77, 3.54, 2.05, 1.51, 1.88, 3.86, 3.13, 1.97, 3.46, 3.98, 
3.6, 2.12, 2.86, 2.95, 1.65, 1.94, 2.53, 2.21, 1.94, 2.05, 2.22, 
3, 3.28, 1.55, 3.85, 2.4, 2.1, 1.98, 1.81, 2.48, 1.66, 2.06, 
1.23, 3.75, 1.99, 1.67, 1.93)), class = "data.frame", row.names = c(NA, 
-160L))
# `x` must hold the data frame built by the structure(...) call above.
boxplot(total ~ cultivar * as.factor(P), data = x)

boxplot with outliers

This is what I am after....

desired result

I have tried the following example, but it does not work:

# Asker's attempt, reported as not producing the desired labelled outliers.
boxplot(total~cultivar*as.factor(P),data=x,id=list(n=Inf))

Identifying the outliers on the plot will make it easier to remove them from the analysis. For some reason it's not as straightforward as I thought. The post is asking me to add more details, but I think there is sufficient detail already.

r2evans
  • 141,215
  • 6
  • 77
  • 149
Eliott Reed
  • 351
  • 1
  • 4
  • 12

2 Answers2

4

You can use the car package:

library(car)

# car 3.x groups the point-labelling options in the `id` list, replacing the
# earlier `id.method` / `id.n` arguments; method = "y" labels the outlying
# points automatically.
Boxplot(total ~ cultivar * as.factor(P), data = x, id = list(method = "y"))

Update:

Is it possible to flip the coordinates in car::Boxplot?

For the sake of the challenge, I tried some hacky methods. In the end, I was able to rotate the plot, but it's not as conventional as ggplot2::coord_flip. Here, I am just rotating the whole plot, so the labels keep their previous alignment. We could go further, remove the labels and rewrite them, but that would defeat the whole purpose of this solution, which is simplicity.

library(car)
library(gridGraphics)

# Draw the labelled boxplot with base graphics. car 3.x groups the
# point-labelling options in the `id` list, replacing the earlier
# `id.method` / `id.n` arguments.
p <- Boxplot(
  total ~ cultivar * as.factor(P),
  data = x,
  id = list(method = "y")
)

# Re-draw the current base graphics plot with grid (grid.echo) and capture
# the result as a single grob (grid.grab).
grab_grob <- function() {
  grid.echo()
  grid.grab()
}

g <- grab_grob()
grid.newpage()
# Draw the captured plot inside a viewport rotated by 90 degrees and half
# as wide as the page; axis labels rotate together with the plot.
pushViewport(viewport(width = 0.5, angle = 90))
grid.draw(g)

Community
  • 1
  • 1
M--
  • 25,431
  • 8
  • 61
  • 93
  • 1
    I love it when I have 30+ lines of code in an answer and somebody comes along with a one-liner. – r2evans Jan 18 '20 at 02:25
  • 2
    @r2evans sorry, didn't mean to downplay your efforts ;) – M-- Jan 18 '20 at 02:27
  • How do you flip this graph so you can easily read the labels like the answer below? – Eliott Reed Jan 18 '20 at 08:22
  • I think you can add `horizontal = TRUE`, since I think it uses the base R `boxplot` internally (which takes that argument). – r2evans Jan 18 '20 at 15:35
  • Unfortunately, horizontal = TRUE does not work with "Boxplot" from the car package. – Eliott Reed Jan 19 '20 at 05:05
  • @EliottReed I am afraid it's not possible for `car::Boxplot` to be rotated. If you load `gridGraphics` and save the boxplot into a variable, e.g. `p`, and then run `str(p)`, it will return the outliers plus some warnings that say the plot cannot be clipped for rotations, etc. This answer explains a bit: https://stackoverflow.com/a/3793658/6461462 It is something that the package maintainers should add to the functions: https://github.com/cran/car. The issue of readability can be taken care of by defining a better size or splitting the plot into chunks (some ideas out of thin air). Cheers. – M-- Jan 19 '20 at 05:37
3

Unfortunately, though boxplot does return a list structure that provides the values of the outliers (e.g., boxplot(..., plot=FALSE)$out), this doesn't help here since there are equal values in other groups that are not outliers there. (In fact, I find using $out always a bit risky unless it is just one group.)

But you can use $stats to get the whisker parameters and find everything yourself. Unfortunately, this is not a one-liner.

First, though, since I don't know what you mean by "id", I'll add something to the data:

# Give every row a running identifier (1..nrow) to use as the outlier label.
x[["id"]] <- seq_len(nrow(x))

base R

# Draw the boxplot and keep its summary: bp$stats rows 1 and 5 hold the lower
# and upper whisker ends of each group, bp$names the group names.
bp <- boxplot(total ~ cultivar * as.factor(P), data = x)

# One row per group; data.frame() auto-names the two whisker columns X1
# (lower) and X2 (upper).
lims <- data.frame(nm = bp$names, t(bp$stats[c(1, 5), ]))

# Attach each observation's group limits via the "<cultivar>.<P>" key.
tmpx <- transform(x, nm = paste(cultivar, as.factor(P), sep = "."))
tmpx <- merge(tmpx, lims, by = "nm", all.x = TRUE)

# Keep only the points beyond the whiskers; which() drops NA comparisons.
beyond <- which(tmpx$total < tmpx$X1 | tmpx$total > tmpx$X2)
tmpx <- tmpx[beyond, ]

# Horizontal position of a box is its index in bp$names; label with x$id.
tmpx$xval <- match(tmpx$nm, bp$names)
text(total ~ xval, id, data = tmpx, adj = c(-0.5, 0.5))

base R boxplot, with outlier ids

Overlaying text over boxplots might be a problem for you; you can play with various shifting and/or flipping the coordinates to control this. Clipping (not shown here, but when a text label disappears out of the plot region) can also be a problem, so you might need to manually control the limits of the plot region.

dplyr

In case you like the tidyverse-way of looking at data-munging, here's an alternative that produces the same plot.

library(dplyr)

# Draw the boxplot and keep its summary for the whisker limits.
bp <- boxplot(total ~ cultivar * as.factor(P), data = x)

# Lower (X1) and upper (X2) whisker end for every group name.
whisker_lims <- data.frame(
  nm = bp$names,
  t(bp$stats[c(1, 5), ]),
  stringsAsFactors = FALSE
)

# Join the limits onto the data, keep the points beyond the whiskers and
# write their ids next to them on the existing plot.
x %>%
  mutate(nm = paste(cultivar, as.factor(P), sep = ".")) %>%
  left_join(whisker_lims, by = "nm") %>%
  filter(total < X1 | total > X2) %>%
  mutate(xval = match(nm, bp$names)) %>%
  text(data = ., total ~ xval, as.character(id), adj = c(-0.5, 0.5))

(Same plot.)

dplyr and ggplot2

library(dplyr)
library(ggplot2)

# geom_boxplot() takes its hinges from quantile() (type 7), whereas base
# boxplot() uses fivenum(); with small groups the two can disagree about
# which points are outliers. Flag outliers per group with the same
# 1.5 * IQR rule that ggplot2 applies, so that every label belongs to a
# point geom_boxplot() actually draws as an outlier.
x %>%
  group_by(cultivar, P) %>%
  mutate(
    outlier = total < quantile(total, 0.25) - 1.5 * IQR(total) |
      total > quantile(total, 0.75) + 1.5 * IQR(total)
  ) %>%
  ungroup() %>%
  ggplot(aes(interaction(cultivar, P), total)) +
  geom_boxplot() +
  # The formula passed as `data` subsets the plot data to the flagged rows.
  geom_text(aes(label = id), hjust = -0.5, data = ~ filter(., outlier)) +
  coord_flip()

ggplot2 boxplot, with outlier ids

I chose to flip the coordinates so that the labels would be all included and shown, but it's not required for the method. One trick I used is that the data= argument to the ggplot2 functions can take an expression (I think of it as a tilde-function), which allows subsetting of the main dataset in-place. Here I use dplyr::filter, but in this case it is just as easy to use subset (base R) in case you are not otherwise using dplyr.

r2evans
  • 141,215
  • 6
  • 77
  • 149