check out my most sophisticated functions. It has three methods (z mad, iqr), and different processing of outliers (remove, or replace). Plots are available and hacking (trying different methods or thresholds) are possible
see the example:
set.seed(1234)
x = rnorm(10)
ez.outlier(iris,'Sepal.Length',fill='null',hack=T,cutoff=c(1,2,3),plot=T)

#' univariate outlier cleanup
#' @description univariate outlier cleanup
#' @param x a data frame or a vector
#' @param col colwise processing
#' \cr col name
#' \cr if x is not a data frame, col is ignored
#' \cr could be multiple cols
#' @param method z score, mad, or IQR (John Tukey)
#' @param cutoff abs() > cutoff will be treated as outliers. Default/auto values (i.e. if NA):
#' \cr z 95% of values fall within 1.96, qnorm(0.025,lower.tail=F), or 3
#' \cr mad 2.5, which is the standard recommendation, or 5.2
#' \cr iqr 1.5
#' \cr if multiple values specified, use the first one (an exception is hack=T, during which method and cutoff same length or scalar)
#' @param hack call mapply to try all method and cutoff (same length or scalar, ie, different methods with
#' corresponding cutoff, or same method with different cutoff).
#' @param plot boxplot and hist before and after outlier processing.
#' @param fillout how to process outlier, fill with na, mean, median (columnwise for data frame), or
#' null --> remove outlier (only for vector or df with single col specified)
#' @return returns a new data frame or vector. If hack=T, returns nothings
#' @note univariate outlier approach
#' The Z-score method relies on the mean and standard deviation of a group of data to measure central
#' tendency and dispersion. This is troublesome, because the mean and standard deviation are highly
#' affected by outliers – they are not robust. In fact, the skewing that outliers bring is one of the
#' biggest reasons for finding and removing outliers from a dataset!
#' Another drawback of the Z-score method is that it behaves strangely in small datasets – in fact,
#' the Z-score method will never detect an outlier if the dataset has fewer than 12 items in it.
#' \cr
#' \cr
#' Median absolute deviation, modified z-score. The median and MAD are robust measures of central tendency and dispersion, respectively.
#' \cr
#' \cr
#' Interquartile range method is that, like the modified Z-score method, it uses a robust measure of dispersion.
#' \cr
#' @examples
#' set.seed(1234)
#' x = rnorm(10)
#' iris %>% ez.outlier('Sepal.Length',fill='null',hack=T,plot=T)
#' @export
ez.outlier = function(x, col=NULL, method=c('z','mad','iqr'), cutoff=NA, fillout=c('na','null','mean','median'), hack=FALSE, plot=FALSE, na.rm=TRUE, print2scr=TRUE) {
# https://datascienceplus.com/rscript/outlier.R
# https://cran.r-project.org/web/packages/outliers/index.html
# https://rpubs.com/hauselin/outliersDetect
if (hack==T){
# here for programming reason, for mapply,
# cutoff could not be NULL, use NA, because length(NULL)=0, but length(NA)=1
mapply(ez.outlier,method=method,cutoff=cutoff,MoreArgs=list(x=x,col=col,hack=F,plot=plot,fillout=fillout,na.rm=na.rm,print2scr=print2scr),SIMPLIFY=F,USE.NAMES=F)
cat('Hack done! No actual data returned.\n')
return(invisible(NULL))
}
method = match.arg(method); fillout =fillout[1]; cutoff=cutoff[1]
if (!is.data.frame(x)) {
# todropna is a workaround for data frame with single col passed in
x.bak.plot = x; x.replace.na = x; oldNAs = sum(is.na(x.replace.na))
if (fillout=='na' | fillout=='todropna') {
replacement = NA
} else if (fillout=='mean') {
replacement = mean(x, na.rm=na.rm)
} else if (fillout=='median') {
replacement = median(x, na.rm=na.rm)
} else if (fillout=='null') {
replacement = NULL
}
if (method=='z'){
if(is.na(cutoff)) cutoff = qnorm(0.025,lower.tail=F)
absz = abs((x - mean(x, na.rm=na.rm))/sd(x, na.rm=na.rm))
if (!is.null(replacement)) {
x[absz > cutoff] <- replacement
} else {
# if nothing above cutoff, x is untouched
if (length(which(absz > cutoff)) > 0) {
x = x[-which(absz > cutoff)]
}
}
x.replace.na[absz > cutoff] <- NA
} else if (method=='mad'){
if(is.na(cutoff)) cutoff = 2.5
absmad <- abs((x - median(x, na.rm=na.rm))/mad(x, na.rm=na.rm))
if (!is.null(replacement)) {
x[absmad > cutoff] <- replacement
} else {
if (length(which(absmad > cutoff)) > 0) {
x = x[-which(absmad > cutoff)]
}
}
x.replace.na[absmad > cutoff] <- NA
} else if (method=='iqr'){
# https://stackoverflow.com/a/4788102/2292993
if(is.na(cutoff)) cutoff = 1.5
q1 <- quantile(x, 0.25, na.rm=na.rm)
q3 <- quantile(x, 0.75, na.rm=na.rm)
# alternatively iqr = q3-q1
iqr = IQR(x, na.rm = na.rm)
lower_bound = q1 - (iqr * cutoff)
upper_bound = q3 + (iqr * cutoff)
if (!is.null(replacement)) {
x[(x > upper_bound) | (x < lower_bound)] <- replacement
} else {
if (length(which((x > upper_bound) | (x < lower_bound))) > 0) {
x = x[-which((x > upper_bound) | (x < lower_bound))]
}
}
x.replace.na[(x.replace.na > upper_bound) | (x.replace.na < lower_bound)] <- NA
}
newNAs = sum(is.na(x.replace.na)) - oldNAs
if (print2scr) {
if (!is.null(col)) {
cat(sprintf('%-15s %5s(%.2f): %3d outliers found and %s.\n', toString(col), toupper(method), cutoff, newNAs, ifelse((is.null(replacement)|fillout=='todropna'),'REMOVED','REPLACED')))
} else {
cat(sprintf('%5s(%.2f): %3d outliers found and %s.\n', toupper(method), cutoff, newNAs, ifelse((is.null(replacement)|fillout=='todropna'),'REMOVED','REPLACED')))
}
}
if (plot){
# mar controls margin size for individual plot it goes c(bottom, left, top, right)
# oma is margin for the whole?
opar = par(mfrow=c(2, 2), oma=c(0,0,1.5,0), mar = c(2,2,1.5,0.5))
on.exit(par(opar))
boxplot(x.bak.plot, main=sprintf("With outliers (n=%d)",length(x.bak.plot)))
hist(x.bak.plot, main=sprintf("With outliers (n=%d)",length(x.bak.plot)), xlab=NULL, ylab=NULL)
boxplot(x, main=sprintf("With outliers (n=%d)",length(x.bak.plot)-newNAs))
hist(x, main=sprintf("With outliers (n=%d)",length(x.bak.plot)-newNAs), xlab=NULL, ylab=NULL)
title(sprintf("%s Outlier Check: %s(%.2f)",toString(col), toupper(method), cutoff), outer=TRUE)
}
} else if (is.data.frame(x)) {
if (length(col)>1 & fillout=='null') {
cat('I do not know how to remove univariate outliers in multiple cols. fillout: null --> na ...\n')
fillout='na'
} else if (fillout=='null') {
fillout='todropna'
}
# trick to pass actual col name
x[col] = lapply(1:length(col), function(j) {ez.outlier(x=x[col][[j]],col=col[j],method=method,cutoff=cutoff,plot=plot,hack=hack,fillout=fillout,na.rm=na.rm,print2scr=print2scr)})
if (fillout=='todropna') x=x[complete.cases(x[,col,drop=FALSE]),,drop=FALSE]
} # end if
return(invisible(x))
}