#####################################################################################
##
##  Intro STATS : ORDINARY LEAST SQUARES LINEAR REGRESSION AND RESIDUAL PLOTS
##  Henry Glick (JDRS) (Last updated Sept. 8, 2017)
##  http://www.reuningscherer.net/stat10x/r/
##
#####################################################################################

## This script provides one example of fitting ordinary least squares      ##
## linear regression models and plotting the results using the mammal      ##
## body-brain weights data                                                 ##

#####################################################################################
#####################################################################################


## Fit an ordinary least squares linear regression model to log-transformed ##
## mammal body-brain data (Page 131)                                        ##

#Get the mammal data
brain <- read.csv("http://reuningscherer.net/stat10x/data/Body Brain Mammals.csv")

#Create two new log-transformed variables
brain$logBody <- log(brain$Body.Weight..kg.) #log() defaults to natural log
brain$logBrain <- log(brain$Brain.Weight..kg.)

#Fit a least squares linear regression model predicting log(Brain Weight) using log(Body Weight) and plot data and model. (Page 131)
m1 <- lm(brain$logBrain ~ brain$logBody) #Fit an OLS linear model

#Plot the data and the model
oldPar <- par() #Store graphical parameters for a 'reset' later
par(mar=c(5.1, 4.1, 5, 2.1)) #Increase the margin space to allow for a subtitle
plot(brain$logBody, brain$logBrain, xlab="log(Body Weight)", ylab="log(Brain Weight)", col="red", pch=16, cex=1.2) #Plot the data
title("Scatterplot of log(Body Weight) vs. log(Brain Weight)", line = 3) #Add the main title separately so we can control the location usine the 'line' argument
mtext(paste("logBrain = ", round(m1$coefficients[1],3), " + ", round(m1$coefficients[2],3), " logBody", sep=""), line=1.5) #Add a subtitle composed of the actual model formula, maing use of the coefficient values stored in the model object.
text(-4.5,7, "S", pos=4); text(-1.75,7, round(summary(m1)$sigma,4), pos=4) #Add the residual standard error, which Minitab calls "S"
text(-4.5,6.5, "R-Sq", pos=4); text(-1.75, 6.5, paste(round(summary(m1)$r.squared,3)*100,"%", sep=""), pos=4) #Add the coefficient of determination (R2)
text(-4.5,6, "R-Sq(adj)", pos=4); text(-1.75, 6, paste(round(summary(m1)$adj.r.squared,3)*100,"%", sep=""), pos=4) #Add the adjusted coefficient of determination
rect(-4.5,5.5,-0.1,7.4) #Add a decorative box for the text
abline(m1, col="blue") #Plot a line representing the mean model (m1)
summary(m1) #Take a look at a summary of the OLS model
par(oldPar) #Reset the graphical parameters to what they were original


#####################################################################################
#####################################################################################


## Look at residual plots to evaluate some of the basic ordinary least squares ##
## regression assumptions. (Page 143, 145)

myResiduals <- rstandard(m1) #Get standardized residuals from linear model

#Create a normal probability plot
  #Method 1 of 2
qqnorm(myResiduals, main = "Normal Probability Plot", pch=16, col="red", cex=1.2) #This is a normal probability plot with the standardized residuals from your OLS model on the y axis and standardized units of the theoretical (expected) residual values on the x axis. You may find this confusing. While the units of the axes are the same, you may prefer the 'sample quantiles' on the x axis (though the interpretation won't change). You may also prefer the percentile units, as shown in Minitab (see Method 2).
qqline(myResiduals, col="blue")#Add a line representing normality

#Create a normal probability plot a la Minitab with custom function
  #Method 2 of 2
#You don't need to know how to do this, but in case you wanted a plot with the same structure as Minitab, you can use the following function. Yet another methods is the probplot() function within the "e1071" package (e.g., probplot(myResiduals, qdist=qnorm, ylab="Probability in %").
#First we create a function to make execution easier
minitabQQ <- function(modelResiduals){ #In use, the function will take a single argument: your OLS standardized residuals
  x <- sort(modelResiduals) #Sort the residuals low to high to use as the x variable
  y <- qnorm(ppoints(length(x))) #Make a vector of probabilities of the same length (n) as your residuals, based on a standard normal distribution. Then apply this to the quantile function that answers the question, what is the z-score of the pth quantile (percentile from ppoints) of the normal distribution? The qnorm function expects a mean and sd to be specified. When they are not specified the default is mean=0 and sd=1, which is the case for our standardized residuals. These values will be the y ordinate values on the plot, since these are the z-scores that correspond with our standardized residual values.
  plot(x,y, type="n", axes=F, ylab="Percent", xlab="Residual", main="Normal Probability Plot") #Create a plot but suppress the content
  axis(1);box() #Add x axis and a bounding box
  probs <- c(0.01, 0.05, seq(0.1, 0.9, by = 0.1), 0.95, 0.99) #Create a controlled vector of probabilities to use in labeling. Include small and large values to enable better visualization.
  qprobs <- qnorm(probs) #Create z-scores based on standard normal distribution with mean 0 and SD of 1. Answers the question, what is the z-score of the pth quantile of the normal distribution? These are the locations at which we will place labels
  axis(2, at = qprobs, labels = 100 * probs) #plot the y axis labels, scaled to 0-100
  points(x,y, pch=16, col="red", cex=1.2) #Add the actual residuals to plot
  #Manually compute linear model equation
  xl <- quantile(x, c(0.25, 0.75)) #Create two percentile locations based on x data
  yl <- qnorm(c(0.25, 0.75)) #Create two percentile locations based on standard normal distribution
  slope <- diff(yl)/diff(xl) #Change in y over change in x = slope
  intercept <- yl[1] - slope * xl[1] #Compute intercept for the model
  abline(intercept, slope, col = "blue") #Plot line using intercept and slope
}

minitabQQ(myResiduals) #Here is where we actually use the function. This plot mimics what is in Minitab, but has the axes flipped with respect to R's qqnorm() function.

#------------------

#Create a 4-panel plot illustrating log(Brain) residuals (page 145)

oldPar <- par() #Store graphical parameters for a 'reset' later
par(oma=c(0,0,2,0), mfrow=c(2,2)) #Change the outer plot margins to make room for a primary title and change the plotting window to allow four plots (2x2)
minitabQQ(myResiduals) #Plot the normal probability plot
plot(m1$fitted.values, myResiduals, pch=16, col="red", cex=1.2, xlab= "Fitted Value", ylab="Residual", main="Versus Fits"); abline(h=0) #Plot Fitted vs Residual values and add a horizontal line at the mean
hist(myResiduals, breaks=10, col="gray", main="Histogram", xlab="Residual") #Plot historgram of residual values
plot(seq(1,length(myResiduals)), myResiduals, pch=16, col="red", cex=1.2, xlab="Observation Order", ylab="Residual", main="Versus Order") #Plot residuals by observational order
lines(seq(1,length(myResiduals)), myResiduals, col="blue"); abline(h=0) #Add a horizontal mean line at 0 and a line connecting each point
title("Residual Plots for log(Brain)", line = 0, outer = TRUE) #Add a primary plot title
par(oldPar) #Reset the graphical parameters to what they were original


# END SCRIPT #