How to show the id of outliers on a boxplot

Question

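How can you view the id of the outliers in a boxplot? My data frame, called `x` in the code below, is given here as `dput()` output (assign it with `x <- structure(...)` to reproduce):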

structure(list(pot = c(1L, 2L, 3L, 4L, 21L, 22L, 23L, 24L, 5L, 
6L, 7L, 8L, 25L, 26L, 27L, 28L, 9L, 10L, 11L, 12L, 29L, 30L, 
31L, 32L, 13L, 14L, 15L, 16L, 33L, 34L, 35L, 36L, 17L, 18L, 19L, 
20L, 37L, 38L, 39L, 40L, 41L, 42L, 43L, 44L, 61L, 62L, 63L, 64L, 
45L, 46L, 47L, 48L, 65L, 66L, 67L, 68L, 49L, 50L, 51L, 52L, 69L, 
70L, 71L, 72L, 53L, 54L, 55L, 56L, 73L, 74L, 75L, 76L, 57L, 58L, 
59L, 60L, 77L, 78L, 79L, 80L, 81L, 82L, 83L, 84L, 101L, 102L, 
103L, 104L, 85L, 86L, 87L, 88L, 105L, 106L, 107L, 108L, 89L, 
90L, 91L, 92L, 109L, 110L, 111L, 112L, 93L, 94L, 95L, 96L, 113L, 
114L, 115L, 116L, 97L, 98L, 99L, 100L, 117L, 118L, 119L, 120L, 
121L, 122L, 123L, 124L, 141L, 142L, 143L, 144L, 125L, 126L, 127L, 
128L, 145L, 146L, 147L, 148L, 129L, 130L, 131L, 132L, 149L, 150L, 
151L, 152L, 133L, 134L, 135L, 136L, 153L, 154L, 155L, 156L, 137L, 
138L, 139L, 140L, 157L, 158L, 159L, 160L), rep = c(1L, 2L, 3L, 
4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L, 
4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L, 
4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L, 
4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L, 
4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L, 
4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L, 
4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L, 
4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L, 
4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L, 
4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L), cultivar = structure(c(1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 
4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 
4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 2L, 
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 
2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L), .Label = c("Dinninup", 
"Riverina", "Seaton Park", "Yarloop"), class = "factor"), Waterlogging = structure(c(2L, 
2L, 2L, 2L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 2L, 
2L, 2L, 2L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 2L, 
2L, 2L, 2L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 2L, 
2L, 2L, 2L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 2L, 
2L, 2L, 2L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 2L, 
2L, 2L, 2L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 2L, 
2L, 2L, 2L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 2L, 
2L, 2L, 2L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 2L, 
2L, 2L, 2L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 2L, 
2L, 2L, 2L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L), .Label = c("Non-waterlogged", 
"Waterlogged"), class = "factor"), P = c(12.1, 12.1, 12.1, 12.1, 
12.1, 12.1, 12.1, 12.1, 15.17, 15.17, 15.17, 15.17, 15.17, 15.17, 
15.17, 15.17, 18.24, 18.24, 18.24, 18.24, 18.24, 18.24, 18.24, 
18.24, 24.39, 24.39, 24.39, 24.39, 24.39, 24.39, 24.39, 24.39, 
48.35, 48.35, 48.35, 48.35, 48.35, 48.35, 48.35, 48.35, 12.1, 
12.1, 12.1, 12.1, 12.1, 12.1, 12.1, 12.1, 15.17, 15.17, 15.17, 
15.17, 15.17, 15.17, 15.17, 15.17, 18.24, 18.24, 18.24, 18.24, 
18.24, 18.24, 18.24, 18.24, 24.39, 24.39, 24.39, 24.39, 24.39, 
24.39, 24.39, 24.39, 48.35, 48.35, 48.35, 48.35, 48.35, 48.35, 
48.35, 48.35, 12.1, 12.1, 12.1, 12.1, 12.1, 12.1, 12.1, 12.1, 
15.17, 15.17, 15.17, 15.17, 15.17, 15.17, 15.17, 15.17, 18.24, 
18.24, 18.24, 18.24, 18.24, 18.24, 18.24, 18.24, 24.39, 24.39, 
24.39, 24.39, 24.39, 24.39, 24.39, 24.39, 48.35, 48.35, 48.35, 
48.35, 48.35, 48.35, 48.35, 48.35, 12.1, 12.1, 12.1, 12.1, 12.1, 
12.1, 12.1, 12.1, 15.17, 15.17, 15.17, 15.17, 15.17, 15.17, 15.17, 
15.17, 18.24, 18.24, 18.24, 18.24, 18.24, 18.24, 18.24, 18.24, 
24.39, 24.39, 24.39, 24.39, 24.39, 24.39, 24.39, 24.39, 48.35, 
48.35, 48.35, 48.35, 48.35, 48.35, 48.35, 48.35), total = c(3.66, 
2.02, 1.59, 1.67, 2.12, 2.46, 1.79, 2.09, 2.03, 2.13, 1.83, 2.34, 
2.66, 2.2, 1.79, 1.97, 2.17, 2.44, 1.49, 2.19, 2.92, 2.43, 1.58, 
2.07, 2.48, 2.49, 1.69, 2.1, 2.38, 2.52, 2.41, 2.46, 2.22, 2.07, 
1.97, 2.3, 2.48, 3.16, 1.76, 2.38, 2.81, 2.64, 2.59, 3.28, 3.18, 
2.57, 2.9, 3, 2.38, 2.72, 2.58, 2.73, 3.06, 3.01, 3.01, 2.77, 
2.95, 2.36, 2.91, 2.38, 3.33, 3.19, 3.17, 3.16, 3.16, 3.2, 2.58, 
3.71, 3.11, 2.7, 2.92, 1.93, 2.95, 2.57, 2.68, 2.48, 3.34, 2.75, 
2.52, 1.88, 1.19, 0.57, 0.64, 0.66, 1.13, 1.28, 0.85, 0.96, 1.34, 
2.14, 0.63, 1.27, 1.13, 0.64, 1.21, 1.95, 1.11, 0.91, 0.75, 0.63, 
1.06, 1.07, 1.05, 0.8, 1.41, 1.13, 0.75, 0.89, 1.98, 1.27, 1.01, 
1, 1.16, 0.64, 0.64, 1.02, 1.03, 1.13, 0.79, 0.6, 3.88, 2.79, 
2.73, 2.77, 3.54, 2.05, 1.51, 1.88, 3.86, 3.13, 1.97, 3.46, 3.98, 
3.6, 2.12, 2.86, 2.95, 1.65, 1.94, 2.53, 2.21, 1.94, 2.05, 2.22, 
3, 3.28, 1.55, 3.85, 2.4, 2.1, 1.98, 1.81, 2.48, 1.66, 2.06, 
1.23, 3.75, 1.99, 1.67, 1.93)), class = "data.frame", row.names = c(NA, 
-160L))

# One box of `total` per cultivar x P combination found in `x`.
boxplot(total ~ cultivar * as.factor(P), data = x)

This is what I am after....

I have tried the following example, but it does not work:

# NOTE(review): base boxplot() does not appear to define an `id` argument, which
# would explain why no outlier labels are drawn here -- confirm against ?boxplot.
boxplot(total~cultivar*as.factor(P),data=x,id=list(n=Inf))

Identifying the outliers on the plot will make it easier to remove them from the analysis. For some reason it's not as straightforward as I thought. The post is asking me to add more details, but I think there is sufficient information.

The second plot is just showing how the outliers are labelled. The first plot is the one I'm working with: boxplot(total~cultivar*as.factor(P),data=x) — Eliott Reed, Jan 18 '20 at 01:43
So what I'm trying to figure out is how to label the outliers like in the second plot — Eliott Reed, Jan 18 '20 at 01:44
@G5W first plot is what they get from their code, second one is the desired output. — M--, Jan 18 '20 at 02:28
Besides the answers below, further reading: https://www.r-statistics.com/2011/01/how-to-label-all-the-outliers-in-a-boxplot/ — M--, Jan 18 '20 at 02:29
If I run OP's code on OP's data, I do not get OP's first plot. — G5W, Jan 18 '20 at 02:32
@G5W same for me, though I dismissed that as a sampling issue. \*shrug\* — r2evans, Jan 18 '20 at 02:46

score 4 · Accepted Answer · edited Jun 20 '20 at 09:12

4

You can use the car package:

library(car)

# Draw the grouped boxplot and label the outliers in each box.
# `id = list(n = Inf)` requests a label for every outlier; the `id` list is the
# current car::Boxplot() interface and replaces the older `id.method = "y"`
# argument (NOTE(review): confirm against ?Boxplot for the installed car version).
Boxplot(total ~ cultivar * as.factor(P), data = x, id = list(n = Inf))

Update:

Is it possible to flip the coordinates in car::Boxplot?

For the sake of the challenge, I tried some hacky methods. After all, I was able to rotate the plot, but it's not as conventional as it is for ggplot2::coord_flip. Here, I am just rotating the plot. So, the labels are still in their previous alignment. We can go further, remove the labels and rewrite them, but that would defeat the whole purpose of this solution which is simplicity.

library(car)
library(gridGraphics)

# Draw the labelled boxplot on the current device. `id = list(n = Inf)` is the
# current car::Boxplot() way to request a label for every outlier and replaces
# the older `id.method = "y"` argument (NOTE(review): confirm against ?Boxplot
# for the installed car version). The value returned by Boxplot() is kept in `p`.
p <- Boxplot(total ~ cultivar * as.factor(P), data = x, id = list(n = Inf))

# Calls grid.echo() and then hands back whatever grid.grab() returns.
# Takes no arguments; it acts on the plot that is currently drawn.
# NOTE(review): presumably grid.echo() re-draws the base plot with grid so that
# grid.grab() can capture it as a single object -- confirm in the gridGraphics docs.
grab_grob <- function() {
  grid.echo()
  captured <- grid.grab()
  captured
}

# Capture the plot that is currently on the device.
g <- grab_grob()

# Start a fresh page, enter a viewport turned by 90 degrees at half the page
# width, and draw the captured plot inside it.
grid.newpage()
pushViewport(
  viewport(width = 0.5, angle = 90)
)
grid.draw(g)

edited Jun 20 '20 at 09:12

Community

1
1

answered Jan 18 '20 at 02:25

M--

25,431
8
61
93

1

I love it when I have 30+ lines of code in an answer and somebody comes along with a one-liner. – r2evans Jan 18 '20 at 02:25
2

@r2evans sorry, didn't mean to downplay your efforts ;) – M-- Jan 18 '20 at 02:27
How do you flip this graph so you can easily read the labels like the answer below? – Eliott Reed Jan 18 '20 at 08:22
I think you can add `horizontal = TRUE`, since I think it uses the base R `boxplot` internally (which takes that argument). – r2evans Jan 18 '20 at 15:35
Unfortunately, horizontal = TRUE does not work with "Boxplot" from the car package. – Eliott Reed Jan 19 '20 at 05:05
@EliottReed I am afraid it's not possible for `car::Boxplot` to be rotated. If you load `gridGraphics` and save the boxplot into a variable, e.g. `p`, and then run `str(p)`, it will return the outliers plus some warnings saying that the plot cannot be clipped for rotations, etc. This answer explains a bit: https://stackoverflow.com/a/3793658/6461462 It is something that the package maintainers should add to the functions: https://github.com/cran/car. The issue of readability can be taken care of by defining a better size or splitting the plot into chunks (some ideas out of thin air). Cheers. – M-- Jan 19 '20 at 05:37

r2evans · Answer 2 · 2020-02-03T07:11:43.800

Unfortunately, though boxplot does return a list structure that provides the values of the outliers (e.g., boxplot(..., plot=FALSE)$out), this doesn't help here since there are equal values in other groups that are not outliers there. (In fact, I find using $out always a bit risky unless it is just one group.)

But you can use $stats to get the whisker parameters and find everything yourself. Unfortunately, this is not a one-liner.

First, though, since I don't know what you mean by "id", I'll add something to the data:

# Give every row a running number (1..nrow) to use as its label.
x[["id"]] <- seq_len(nrow(x))

`base` R

# Draw the boxplots and keep the returned statistics.
bp <- boxplot(total ~ cultivar * as.factor(P), data = x)

# One row per box: the box name plus rows 1 and 5 of bp$stats (the whisker
# ends), which data.frame() names X1 and X2.
lims <- data.frame(nm = bp$names, t(bp$stats[c(1, 5), ]))

# Build the same "cultivar.P" key for every observation and attach the limits
# of the box it belongs to.
tmpx <- merge(
  transform(x, nm = paste(cultivar, as.factor(P), sep = ".")),
  lims,
  by = "nm",
  all.x = TRUE
)

# Keep only the observations lying outside their own box's whiskers.
tmpx <- tmpx[which(tmpx$total < tmpx$X1 | tmpx$total > tmpx$X2), ]

# The x position of a box is its index in bp$names.
tmpx$xval <- match(tmpx$nm, bp$names)

# Write each outlier's id next to its point.
with(tmpx, text(xval, total, labels = id, adj = c(-0.5, 0.5)))

Overlaying text over boxplots might be a problem for you; you can play with various shifting and/or flipping the coordinates to control this. Clipping (not shown here, but when a text label disappears out of the plot region) can also be a problem, so you might need to manually control the limits of the plot region.

`dplyr`

In case you like the tidyverse-way of looking at data-munging, here's an alternative that produces the same plot.

library(dplyr)
# Draw the boxplots and keep the returned statistics for the whisker limits.
bp <- boxplot(total ~ cultivar * as.factor(P), data = x)
x %>%
  # Same "cultivar.P" key that boxplot() uses for its box names.
  mutate(nm = paste(cultivar, as.factor(P), sep = ".")) %>%
  # Attach rows 1 and 5 of bp$stats (columns X1 and X2) to every observation.
  left_join(
    data.frame(nm = bp$names, t(bp$stats[c(1, 5), ]), stringsAsFactors = FALSE),
    by = "nm"
  ) %>%
  # Keep the observations that are not inside [X1, X2].
  filter(!(total >= X1 & total <= X2)) %>%
  # The x position of a box is its index in bp$names.
  mutate(xval = match(nm, bp$names)) %>%
  # Write each remaining id next to its point on the existing plot.
  with(text(xval, total, labels = as.character(id), adj = c(-0.5, 0.5)))

(Same plot.)

`dplyr` and `ggplot2`

library(dplyr)
library(ggplot2)
# Flag outliers with the same rule geom_boxplot() uses to draw them: a point is
# an outlier when it lies more than 1.5 * IQR beyond the first or third quartile
# of its own group. Base boxplot() derives its whiskers from fivenum() hinges,
# which can differ from these quartiles, so taking the limits from boxplot()
# could label points that ggplot2 does not draw as outliers (or miss ones it does).
x %>%
  group_by(cultivar, P) %>%
  mutate(
    outlier = total < quantile(total, 0.25) - 1.5 * IQR(total) |
      total > quantile(total, 0.75) + 1.5 * IQR(total)
  ) %>%
  ungroup() %>%
  ggplot(aes(interaction(cultivar, P), total)) +
  geom_boxplot() +
  # Only the flagged rows get a text label; the formula subsets the plot data.
  geom_text(aes(label = id), hjust = -0.5, data = ~ filter(., outlier)) +
  coord_flip()

I chose to flip the coordinates so that the labels would be all included and shown, but it's not required for the method. One trick I used is that the data= argument to the ggplot2 layer functions can take a one-sided formula (I think of it as a tilde-function), which is applied to the plot's main dataset and allows subsetting it in-place. Here I use dplyr::filter, but in this case it is just as easy to use subset (base R) in case you are not otherwise using dplyr.

How to show the id of outliers on a boxplot

2 Answers

Update:

`base` R

`dplyr`

`dplyr` and `ggplot2`

Linked

How to show the id of outliers on a boxplot

2 Answers

Update:

base R

dplyr

dplyr and ggplot2

Linked

`base` R

`dplyr`

`dplyr` and `ggplot2`