polynomialRegression.R

Plain text source: polynomialRegression.R


# -*- Mode:R; Coding:utf-8; fill-column:160 -*-

################################################################################################################################################################
# @file      polynomialRegression.R
# @author    Mitch Richling <https://www.mitchr.me>
# @Copyright Copyright 2015 by Mitch Richling.  All rights reserved.
# @brief     Polynomial regression and graphics.@EOL
# @Keywords  polynomial regression multiple base r cran package ggplot2
#
# This example highlights some of the particulars of doing "simple" polynomial regression R when compared to simple linear regression:
#  1) Automating multiple fits
#  2) Specifying the model formula for the fit,
#  3) Selecting the model degree using ANOVA, and graphically with simultaneous model plots
#  4) Visually exploring confidence and prediction intervals vs model degree with ggplot2.
#

################################################################################################################################################################
# Our data.  

# TRUE => fixed data, FALSE => randomly generated data.
if(TRUE) {
  daDat   <- data.frame(x=c(  0.0000000,  0.4444444,  0.8888889, 1.3333333, 1.7777778,  2.2222222,   2.6666667,  3.1111111,  3.5555556, 4.0000000),
                        f=c( -6.0000000, -2.2085048, -0.2606310, 0.3703704, 0.2112483, -0.2112483,  -0.3703704,  0.2606310,  2.2085048, 6.0000000),
                        e=c( -0.5747532, -0.1930761,  1.7809493, 1.2608157, 0.8505483,  1.23321618, -1.5896486, -0.0399569, -1.7746238, 0.0406907),
                        y=c( -6.5747532, -2.4015809,  1.5203184, 1.6311861, 1.0617966,  1.0219679,  -1.9600190,  0.2206741,  0.4338810, 6.0406907))
} else {
  daDat   <- data.frame(x=seq(0, 4, length.out=20))  # x-data
  daDat$f <- with(daDat, x^3-6*x^2+11*x-6)           # Function
  daDat$e <- rnorm(daDat$x)                          # Error
  daDat$y <- daDat$f + daDat$e                       # y-data
}

head(daDat, 20)
           x          f          e          y
1  0.0000000 -6.0000000 -0.5747532 -6.5747532
2  0.4444444 -2.2085048 -0.1930761 -2.4015809
3  0.8888889 -0.2606310  1.7809493  1.5203184
4  1.3333333  0.3703704  1.2608157  1.6311861
5  1.7777778  0.2112483  0.8505483  1.0617966
6  2.2222222 -0.2112483  1.2332162  1.0219679
7  2.6666667 -0.3703704 -1.5896486 -1.9600190
8  3.1111111  0.2606310 -0.0399569  0.2206741
9  3.5555556  2.2085048 -1.7746238  0.4338810
10 4.0000000  6.0000000  0.0406907  6.0406907
################################################################################################################################################################
# We fit four models using progressively higher degree polynomials (degree 1 up to maxFdeg)

maxFdeg <- 4                                            ######## TRY THIS: Value of 6 vs. 4
if(maxFdeg == 4) {
  ## For illustrative purposes we demonstrate hand coded formulas for the maxFdeg==4 case
  daFits <- list(lm(y ~ x,                    data=daDat),  # The degree 1 case is simple linear regression
                 lm(y ~ x + I(x^2),           data=daDat),  # Note the use of the "I" function here
                 lm(y ~ x + I(x^2) + I(x^3),  data=daDat),
                 lm(y ~ poly(x, 4, raw=TRUE), data=daDat))  # The formula gets a bit long by deg>4, so we use the "poly" function
} else {
  ## This is how one generic fitting formulas can be constructed
  daFits <- lapply(1:maxFdeg, function (i) lm(y ~ poly(x, i, raw=TRUE), data=daDat))
}

daFits
[[1]]

Call:
lm(formula = y ~ x, data = daDat)

Coefficients:
(Intercept)            x  
     -3.066        1.583  


[[2]]

Call:
lm(formula = y ~ x + I(x^2), data = daDat)

Coefficients:
(Intercept)            x       I(x^2)  
    -3.8040       2.8277      -0.3112  


[[3]]

Call:
lm(formula = y ~ x + I(x^2) + I(x^3), data = daDat)

Coefficients:
(Intercept)            x       I(x^2)       I(x^3)  
     -6.973       15.873       -8.907        1.433  


[[4]]

Call:
lm(formula = y ~ poly(x, 4, raw = TRUE), data = daDat)

Coefficients:
            (Intercept)  poly(x, 4, raw = TRUE)1  poly(x, 4, raw = TRUE)2  poly(x, 4, raw = TRUE)3  poly(x, 4, raw = TRUE)4  
                -6.7178                  13.4772                  -5.8821                   0.2198                   0.1516  
################################################################################################################################################################
# Compute model values at 100 points between the min and max x values

newx  <- data.frame(x=seq(min(daDat$x), max(daDat$x), length.out=100))
newy  <- NULL
for(daFitDeg in 1:maxFdeg)
  newy  <- rbind(newy, data.frame(x=newx,
                                  degree=rep(daFitDeg, length(newx)),
                                  y=predict(daFits[[daFitDeg]], newdata=newx)))
newy$degree <- factor(newy$degree)

ggplot() +
  geom_line(data=newy, aes(x=x, y=y, col=degree)) +
  geom_line(data=daDat, aes(x=x, y=y), lwd=2)
plot of chunk auto-report
################################################################################################################################################################
# Use ANOVA to determine which fit seems best

do.call(anova, daFits)
Analysis of Variance Table

Model 1: y ~ x
Model 2: y ~ x + I(x^2)
Model 3: y ~ x + I(x^2) + I(x^3)
Model 4: y ~ poly(x, 4, raw = TRUE)
  Res.Df    RSS Df Sum of Sq       F    Pr(>F)    
1      8 55.780                                   
2      7 53.784  1     1.995  2.2925 0.1904210    
3      6  4.928  1    48.856 56.1313 0.0006695 ***
4      5  4.352  1     0.576  0.6622 0.4527810    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
################################################################################################################################################################
# Compute prediction intervals and confidence intervals.  Draw the result.

newx  <- data.frame(x=seq(from=min(daDat$x),
                          to=max(daDat$x)+diff(range(daDat$x))/4, # go out only 1/4 as far as we did in linearRegression.R
                          length.out=100))
newy  <- NULL
for(daFitDeg in 1:maxFdeg) {
  tmpp <- predict(daFits[[daFitDeg]], newdata=newx, interval="prediction")
  tmpc <- predict(daFits[[daFitDeg]], newdata=newx, interval="confidence")
  newy  <- rbind(newy, data.frame(x=newx, degree=rep(daFitDeg, length(newx)),
                                  fit=tmpp[,'fit'], pLow=tmpp[,'lwr'], pUp=tmpp[,'upr'], cLow=tmpc[,'lwr'], cUp=tmpc[,'upr']))
}
newy$degree <- factor(paste('degree', newy$degree))

ggplot(newy, aes(x=x, y=fit, group=degree)) +
  facet_wrap(~degree, ncol=2) +
  geom_ribbon(aes(ymin=pLow, ymax=pUp), alpha=.5, fill='pink', col='red') +
  geom_ribbon(aes(ymin=cLow, ymax=cUp), alpha=.5, fill='red', col='pink') +
  geom_line()
plot of chunk auto-report

The R session information (including the OS info, R version and all packages used):

    options(width=80)
    sessionInfo()
R version 3.3.0 (2016-05-03)
Platform: x86_64-pc-linux-gnu (64-bit)
Running under: Debian GNU/Linux 8 (jessie)

locale:
 [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
 [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
 [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
 [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
 [9] LC_ADDRESS=C               LC_TELEPHONE=C            
[11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       

attached base packages:
[1] graphics  grDevices datasets  utils     grid      stats     base     

other attached packages:
[1] RColorBrewer_1.1-2 reshape2_1.4.1     ggplot2_2.1.0      dplyr_0.4.3       
[5] data.table_1.9.6   gridExtra_2.2.1    knitr_1.13         lattice_0.20-33   

loaded via a namespace (and not attached):
 [1] Rcpp_0.12.5      magrittr_1.5     munsell_0.4.3    colorspace_1.2-6
 [5] R6_2.1.2         stringr_1.0.0    highr_0.6        plyr_1.8.3      
 [9] tools_3.3.0      parallel_3.3.0   gtable_0.2.0     DBI_0.4-1       
[13] assertthat_0.1   digest_0.6.9     formatR_1.4      evaluate_0.9    
[17] labeling_0.3     stringi_1.0-1    methods_3.3.0    scales_0.4.0    
[21] chron_2.3-47    
    Sys.time()
[1] "2016-07-09 20:06:51 CDT"