outliers.R

Plain text source: outliers.R


# -*- Mode:R; Coding:utf-8; fill-column:160 -*-

################################################################################################################################################################
# @file      outliers.R
# @author    Mitch Richling <https://www.mitchr.me>
# @Copyright Copyright 2015 by Mitch Richling.  All rights reserved.
# @brief     Outliers detection.@EOL
# @Keywords  outlier detection r base 
#
# Topics:
#     1) Three different outlier detection rules are demonstrated:
#        * The 3-sigma rule
#        * The Hampel identifier
#        * The Box Plot Rule
#     2) Two different ways to generate random heavy tail data
#        * The Weibull distribution with shape parameter less than 1
#        * A series of normal variates with increasing standard deviations and decreasing sample counts
#     3) A hand crafted data set for a nice graph with labeled cutoff lines
#     4) Drawing points as a series (like a broadly offset strip chart), and highlighting outliers graphically
#

################################################################################################################################################################
## Three kinds of data (set dataType to one of the following):
##   * 'hand'      -- Hand crafted for a nice plot
##   * 'normzoom'  -- Heavy tails constructed from a sequence of normals
##   * 'weibull'   -- Weibull has a heavy tail, and it is always positive

dataType <- 'hand'

if(dataType=='normzoom')
  daData <- do.call(c, lapply(1:5, function (x) rnorm(2^(7-x), sd=2*x)))
if(dataType=='hand')
  daData <- c(rep(c(1,-1), 10), c(8,11,15))
if(dataType=='weibull')
  daData <- rweibull(200, .8)

################################################################################################################################################################
## Old 3 sigma, or "normal", Rule
## Good data: [mean–c*sd, mean+c*sd]

cParmSig  <- 3                                                # This is where the "3" comes from in "3 Sigma"
daMean    <- mean(daData)                                     # The "center" of our "non-outlier" interval
daSD      <- sd(daData)                                       # The "radius" of our "non-outlier" interval
cutOffSig <- cParmSig*c(-1, 1)*daSD+daMean                    # Lower and upper limits of our "non-outlier" interval

################################################################################################################################################################
## Hampel identifier
## Good data: [median–c*mad, median+c*mad]

cParmHem  <- 3                                                # This is the most common value used today
daMAD     <- mad(daData)                                      # The "center" of our "non-outlier" interval
daMedian  <- median(daData)                                   # The "radius" of our "non-outlier" interval
cutOffHem <- cParmHem*c(-1, 1)*daMAD+daMedian                 # Lower and upper limits of our "non-outlier" interval

################################################################################################################################################################
## boxplot rule
## Good data: [Q1–c*IQD,Q3+c*IQD],
##   IQD=Q3–Q1, Q1 is the first quartile, and Q3 the third

cParmBox  <- 1.5                                              # This is the most common value used today
daQUAR    <- quantile(daData, c(.25, .75))                    # The first and third quartiles (Q1 & Q3)
cutOffBox <- daQUAR+cParmBox*c(-1,1)*(daQUAR[2]-daQUAR[1])    # Lower and upper limits of our "non-outlier" interval

################################################################################################################################################################
## Plot it all

plot(1:length(daData), daData,                                                 # Plot the data with artificial x-data
     ylim=c(-1,1)*max(abs(c(daData,                                            # Set the y-axis to be symmetric about 0 and
                            cutOffSig, cutOffHem, cutOffBox))),                  # big enough for all data and intervals
     pch='.')                                                                  # Plot tiny dots for data points!

abline(h=cutOffSig, col='red')                                                 # Draw the interval limit lines
outIdx <- daData<cutOffSig[1] | daData>cutOffSig[2]                            # Find outliers
points((1:length(daData))[outIdx], daData[outIdx], col='red', cex=2, pch=3)    # Draw red, pch=3 points on outliers
if(dataType=='hand') text(12, cutOffSig, "3 Sigma", pos=c(3,1), col='red')     # For hand crafted data, label interval limit lines

abline(h=cutOffHem, col='blue')                                                # Draw the interval limit lines
outIdx <- daData<cutOffHem[1] | daData>cutOffHem[2]                            # Find outliers
points((1:length(daData))[outIdx], daData[outIdx], col='blue', cex=2, pch=4)   # Draw blue, pch=4 points on outliers
if(dataType=='hand') text(12, cutOffHem, "Hampel", pos=c(3,1), col='blue')     # For hand crafted data, label interval limit lines

abline(h=cutOffBox, col='green3')                                              # Draw the interval limit lines
outIdx <- daData<cutOffBox[1] | daData>cutOffBox[2]                            # Find outliers
points((1:length(daData))[outIdx], daData[outIdx], col='green3', cex=2, pch=5) # Draw green3, pch=5 points on outliers
if(dataType=='hand') text(12, cutOffBox, "BoxPlot", pos=c(3,1), col='green3')  # For hand crafted data, label interval limit lines
plot of chunk auto-report
if(dataType!='hand')                                                           # For non-hand crafted data, draw a legend
  legend("bottomleft", legend=c("3 Sigma", "Hampel", "BoxPlot"),
         text.col=c('red', 'blue', 'green3'), col=c('red', 'blue', 'green3'),
         pch=c(3,4,5))

The R session information (including the OS info, R version and all packages used):

    options(width=80)
    sessionInfo()
R version 3.3.0 (2016-05-03)
Platform: x86_64-pc-linux-gnu (64-bit)
Running under: Debian GNU/Linux 8 (jessie)

locale:
 [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
 [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
 [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
 [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
 [9] LC_ADDRESS=C               LC_TELEPHONE=C            
[11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       

attached base packages:
[1] graphics  grDevices datasets  utils     grid      stats     base     

other attached packages:
[1] RColorBrewer_1.1-2 reshape2_1.4.1     ggplot2_2.1.0      dplyr_0.4.3       
[5] data.table_1.9.6   gridExtra_2.2.1    knitr_1.13         lattice_0.20-33   

loaded via a namespace (and not attached):
 [1] Rcpp_0.12.5      assertthat_0.1   plyr_1.8.3       chron_2.3-47    
 [5] R6_2.1.2         gtable_0.2.0     DBI_0.4-1        formatR_1.4     
 [9] magrittr_1.5     evaluate_0.9     scales_0.4.0     highr_0.6       
[13] stringi_1.0-1    tools_3.3.0      stringr_1.0.0    munsell_0.4.3   
[17] parallel_3.3.0   colorspace_1.2-6 methods_3.3.0   
    Sys.time()
[1] "2016-07-09 20:07:45 CDT"