##################################################################################### ## ## Intro STATS : ORDINARY LEAST SQUARES LINEAR REGRESSION AND RESIDUAL PLOTS ## Henry Glick (JDRS) (Last updated Sept. 8, 2017) ## http://www.reuningscherer.net/stat10x/r/ ## ##################################################################################### ## This script provides one example of fitting ordinary least squares ## ## linear regression models and plotting the results using the mammal ## ## body-brain weights data ## ##################################################################################### ##################################################################################### ## Fit an ordinary least squares linear regression model to log-transformed ## ## mammal body-brain data (Page 131) ## #Get the mammal data brain <- read.csv("http://reuningscherer.net/stat10x/data/Body Brain Mammals.csv") #Create two new log-transformed variables brain$logBody <- log(brain$Body.Weight..kg.) #log() defaults to natural log brain$logBrain <- log(brain$Brain.Weight..kg.) #Fit a least squares linear regression model predicting log(Brain Weight) using log(Body Weight) and plot data and model. (Page 131) m1 <- lm(brain$logBrain ~ brain$logBody) #Fit an OLS linear model #Plot the data and the model oldPar <- par() #Store graphical parameters for a 'reset' later par(mar=c(5.1, 4.1, 5, 2.1)) #Increase the margin space to allow for a subtitle plot(brain$logBody, brain$logBrain, xlab="log(Body Weight)", ylab="log(Brain Weight)", col="red", pch=16, cex=1.2) #Plot the data title("Scatterplot of log(Body Weight) vs. log(Brain Weight)", line = 3) #Add the main title separately so we can control the location usine the 'line' argument mtext(paste("logBrain = ", round(m1$coefficients[1],3), " + ", round(m1$coefficients[2],3), " logBody", sep=""), line=1.5) #Add a subtitle composed of the actual model formula, maing use of the coefficient values stored in the model object. text(-4.5,7, "S", pos=4); text(-1.75,7, round(summary(m1)$sigma,4), pos=4) #Add the residual standard error, which Minitab calls "S" text(-4.5,6.5, "R-Sq", pos=4); text(-1.75, 6.5, paste(round(summary(m1)$r.squared,3)*100,"%", sep=""), pos=4) #Add the coefficient of determination (R2) text(-4.5,6, "R-Sq(adj)", pos=4); text(-1.75, 6, paste(round(summary(m1)$adj.r.squared,3)*100,"%", sep=""), pos=4) #Add the adjusted coefficient of determination rect(-4.5,5.5,-0.1,7.4) #Add a decorative box for the text abline(m1, col="blue") #Plot a line representing the mean model (m1) summary(m1) #Take a look at a summary of the OLS model par(oldPar) #Reset the graphical parameters to what they were original ##################################################################################### ##################################################################################### ## Look at residual plots to evaluate some of the basic ordinary least squares ## ## regression assumptions. (Page 143, 145) myResiduals <- rstandard(m1) #Get standardized residuals from linear model #Create a normal probability plot #Method 1 of 2 qqnorm(myResiduals, main = "Normal Probability Plot", pch=16, col="red", cex=1.2) #This is a normal probability plot with the standardized residuals from your OLS model on the y axis and standardized units of the theoretical (expected) residual values on the x axis. You may find this confusing. While the units of the axes are the same, you may prefer the 'sample quantiles' on the x axis (though the interpretation won't change). You may also prefer the percentile units, as shown in Minitab (see Method 2). qqline(myResiduals, col="blue")#Add a line representing normality #Create a normal probability plot a la Minitab with custom function #Method 2 of 2 #You don't need to know how to do this, but in case you wanted a plot with the same structure as Minitab, you can use the following function. Yet another methods is the probplot() function within the "e1071" package (e.g., probplot(myResiduals, qdist=qnorm, ylab="Probability in %"). #First we create a function to make execution easier minitabQQ <- function(modelResiduals){ #In use, the function will take a single argument: your OLS standardized residuals x <- sort(modelResiduals) #Sort the residuals low to high to use as the x variable y <- qnorm(ppoints(length(x))) #Make a vector of probabilities of the same length (n) as your residuals, based on a standard normal distribution. Then apply this to the quantile function that answers the question, what is the z-score of the pth quantile (percentile from ppoints) of the normal distribution? The qnorm function expects a mean and sd to be specified. When they are not specified the default is mean=0 and sd=1, which is the case for our standardized residuals. These values will be the y ordinate values on the plot, since these are the z-scores that correspond with our standardized residual values. plot(x,y, type="n", axes=F, ylab="Percent", xlab="Residual", main="Normal Probability Plot") #Create a plot but suppress the content axis(1);box() #Add x axis and a bounding box probs <- c(0.01, 0.05, seq(0.1, 0.9, by = 0.1), 0.95, 0.99) #Create a controlled vector of probabilities to use in labeling. Include small and large values to enable better visualization. qprobs <- qnorm(probs) #Create z-scores based on standard normal distribution with mean 0 and SD of 1. Answers the question, what is the z-score of the pth quantile of the normal distribution? These are the locations at which we will place labels axis(2, at = qprobs, labels = 100 * probs) #plot the y axis labels, scaled to 0-100 points(x,y, pch=16, col="red", cex=1.2) #Add the actual residuals to plot #Manually compute linear model equation xl <- quantile(x, c(0.25, 0.75)) #Create two percentile locations based on x data yl <- qnorm(c(0.25, 0.75)) #Create two percentile locations based on standard normal distribution slope <- diff(yl)/diff(xl) #Change in y over change in x = slope intercept <- yl[1] - slope * xl[1] #Compute intercept for the model abline(intercept, slope, col = "blue") #Plot line using intercept and slope } minitabQQ(myResiduals) #Here is where we actually use the function. This plot mimics what is in Minitab, but has the axes flipped with respect to R's qqnorm() function. #------------------ #Create a 4-panel plot illustrating log(Brain) residuals (page 145) oldPar <- par() #Store graphical parameters for a 'reset' later par(oma=c(0,0,2,0), mfrow=c(2,2)) #Change the outer plot margins to make room for a primary title and change the plotting window to allow four plots (2x2) minitabQQ(myResiduals) #Plot the normal probability plot plot(m1$fitted.values, myResiduals, pch=16, col="red", cex=1.2, xlab= "Fitted Value", ylab="Residual", main="Versus Fits"); abline(h=0) #Plot Fitted vs Residual values and add a horizontal line at the mean hist(myResiduals, breaks=10, col="gray", main="Histogram", xlab="Residual") #Plot historgram of residual values plot(seq(1,length(myResiduals)), myResiduals, pch=16, col="red", cex=1.2, xlab="Observation Order", ylab="Residual", main="Versus Order") #Plot residuals by observational order lines(seq(1,length(myResiduals)), myResiduals, col="blue"); abline(h=0) #Add a horizontal mean line at 0 and a line connecting each point title("Residual Plots for log(Brain)", line = 0, outer = TRUE) #Add a primary plot title par(oldPar) #Reset the graphical parameters to what they were original # END SCRIPT #