##################################################################################### ## ## Intro STATS : ORDINARY LEAST SQUARES LINEAR REGRESSION AND RESIDUAL PLOTS ## Henry Glick (JDRS) (Last updated Sept. 8, 2017) ## http://www.reuningscherer.net/stat10x/r/ ## ##################################################################################### ## This script provides one example of fitting ordinary least squares ## ## linear regression models and plotting the results using the World Bank ## ## 2013 data. ## ##################################################################################### ##################################################################################### ## Fit a least squares linear regression model predicting Fertility ## ## from GNI and plot data and model (Page 146) ## #Get World Bank Data WB <- read.csv("http://reuningscherer.net/stat10x/data/WorldBank2013.csv") #Fit a least squares linear regression model predicting Fertility using GNI. (Page 146) m1 <- lm(WB$Fertility ~ WB$GNI) #Fit a OLS linear model #Plot the data and the model oldPar <- par() #Store graphical parameters for a 'reset' later par(mar=c(5.1, 4.1, 5, 2.1)) #Increase the margin space to allow for a subtitle plot(WB$GNI, WB$Fertility, xlab="GNI", ylab="Fertility", col="red", pch=16, cex=1.2, ylim=c(0, 8)) #Plot the data title("Scatterplot of GNI vs. Fertility", line = 3) #Add the main title separately so we can control the location usine the 'line' argument mtext(paste("Fertility = ", round(m1$coefficients[1],3), " + ", round(m1$coefficients[2],5), " GNI", sep=""), line=1.5) #Add a subtitle composed of the actual model formula, maing use of the coefficient values stored in the model object. text(6e+04,6, "S", pos=4); text(8e+04,6, round(summary(m1)$sigma,4), pos=4) #Add the residual standard error, which Minitab calls "S" text(6e+04,5.5, "R-Sq", pos=4); text(8e+04, 5.5, paste(round(summary(m1)$r.squared,3)*100,"%", sep=""), pos=4) #Add the coefficient of determination (R2) text(6e+04,5, "R-Sq(adj)", pos=4); text(8e+04, 5, paste(round(summary(m1)$adj.r.squared,3)*100,"%", sep=""), pos=4) #Add the adjusted coefficient of determination rect(6e+04,4.5,9.5e+04, 6.5) #Add a decorative box for the text abline(m1, col="blue") #Plot a line representing the mean model (m1) summary(m1) #Take a look at a summary of the OLS model par(oldPar) #Reset the graphical parameters to what they were original ##################################################################################### ##################################################################################### ## Look at residual plots to evaluate some of the basic ordinary least squares ## ## regression assumptions. (Page 143, 145) myResiduals <- rstandard(m1) #Get standardized residuals from linear model #Create a normal probability plot a la Minitab #First create a function to use below minitabQQ <- function(modelResiduals){ #In use, the function will take a single argument: your OLS standardized residuals x <- sort(modelResiduals) #Sort the residuals low to high to use as the x variable y <- qnorm(ppoints(length(x))) #Make a vector of probabilities of the same length (n) as your residuals, based on a standard normal distribution. Then apply this to the quantile function that answers the question, what is the z-score of the pth quantile (percentile from ppoints) of the normal distribution? The qnorm function expects a mean and sd to be specified. When they are not specified the default is mean=0 and sd=1, which is the case for our standardized residuals. These values will be the y ordinate values on the plot, since these are the z-scores that correspond with our standardized residual values. plot(x,y, type="n", axes=F, ylab="Percent", xlab="Residual", main="Normal Probability Plot") #Create a plot but suppress the content axis(1);box() #Add x axis and a bounding box probs <- c(0.01, 0.05, seq(0.1, 0.9, by = 0.1), 0.95, 0.99) #Create a controlled vector of probabilities to use in labeling. Include small and large values to enable better visualization. qprobs <- qnorm(probs) #Create z-scores based on standard normal distribution with mean 0 and SD of 1. Answers the question, what is the z-score of the pth quantile of the normal distribution? These are the locations at which we will place labels axis(2, at = qprobs, labels = 100 * probs) #plot the y axis labels, scaled to 0-100 points(x,y, pch=16, col="red", cex=1.2) #Add the actual residuals to plot #Manually compute linear model equation xl <- quantile(x, c(0.25, 0.75)) #Create two percentile locations based on x data yl <- qnorm(c(0.25, 0.75)) #Create two percentile locations based on standard normal distribution slope <- diff(yl)/diff(xl) #Change in y over change in x = slope intercept <- yl[1] - slope * xl[1] #Compute intercept for the model abline(intercept, slope, col = "blue") #Plot line using intercept and slope } minitabQQ(myResiduals) #Here is where we actually use the function. This plot mimics what is in Minitab, but has the axes flipped with respect to R's qqnorm() function. #------------------ #Create a 4-panel plot illustrating residuals (like Page 145, but for WB data; Page 147) oldPar <- par() #Store graphical parameters for a 'reset' later par(oma=c(0,0,2,0), mfrow=c(2,2)) #Change the outer plot margins to make room for a primary title and change the plotting window to allow four plots (2x2) minitabQQ(myResiduals) #Plot the normal probability plot plot(m1$fitted.values, myResiduals, pch=16, col="red", cex=1.2, xlab= "Fitted Value", ylab="Residual", main="Versus Fits"); abline(h=0) #Plot Fitted vs Residual values and add a horizontal line at the mean hist(myResiduals, breaks=10, col="gray", main="Histogram", xlab="Residual") #Plot historgram of residual values plot(seq(1,length(myResiduals)), myResiduals, pch=16, col="red", cex=1.2, xlab="Observation Order", ylab="Residual", main="Versus Order") #Plot residuals by observational order lines(seq(1,length(myResiduals)), myResiduals, col="blue"); abline(h=0) #Add a horizontal mean line at 0 and a line connecting each point title("Residual Plots for Fertility", line = 0, outer = TRUE) #Add a primary plot title par(oldPar) #Reset the graphical parameters to what they were original ##################################################################################### ##################################################################################### ## Repeat above using log-transformed GNI per Capita as a predictor (Page 148) #Create a new log-transformed variable WB$logGNI <- log(WB$GNI) #log() defaults to natural log #Fit a least squares linear regression model predicting Fertility using log(GNI). (Page 146) m2 <- lm(WB$Fertility ~ WB$logGNI) #Fit a OLS linear model #Plot the data and the model (Page 148 top right). Note improvement in model fit as linearity has improved. oldPar <- par() #Store graphical parameters for a 'reset' later par(mar=c(5.1, 4.1, 5, 2.1)) #Increase the margin space to allow for a subtitle plot(WB$logGNI, WB$Fertility, xlab="log(GNI per Capita)", ylab="Fertility", col="red", pch=16, cex=1.2, ylim=c(0, 8)) #Plot the data title("Scatterplot of log(GNI per Capita) vs. Fertility", line = 3) #Add the main title separately so we can control the location usine the 'line' argument mtext(paste("Fertility = ", round(m2$coefficients[1],3), " + ", round(m2$coefficients[2],5), " GNI", sep=""), line=1.5) #Add a subtitle composed of the actual model formula, maing use of the coefficient values stored in the model object. text(9,7.5, "S", pos=4); text(10.5,7.5, round(summary(m2)$sigma,4), pos=4) #Add the residual standard error, which Minitab calls "S" text(9,7, "R-Sq", pos=4); text(10.5, 7, paste(round(summary(m2)$r.squared,3)*100,"%", sep=""), pos=4) #Add the coefficient of determination (R2) text(9,6.5, "R-Sq(adj)", pos=4); text(10.5,6.5, paste(round(summary(m2)$adj.r.squared,3)*100,"%", sep=""), pos=4) #Add the adjusted coefficient of determination rect(9,6,11.5, 8) #Add a decorative box for the text abline(m2, col="blue") #Plot a line representing the mean model (m2) summary(m2) #Take a look at a summary of the OLS model par(oldPar) #Reset the graphical parameters to what they were original ##################################################################################### ##################################################################################### ## Look at residual plots to evaluate some of the basic ordinary least squares ## ## regression assumptions. (Page 148, like 145) myResiduals <- rstudent(m2) #Get standardized residuals from linear model #Create a 4-panel plot illustrating Fertility residuals (page 148, like 145) oldPar <- par() #Store graphical parameters for a 'reset' later par(oma=c(0,0,2,0), mfrow=c(2,2)) #Change the outer plot margins to make room for a primary title and change the plotting window to allow four plots (2x2) #1 minitabQQ(myResiduals) #Plot the normal probability plot #2 plot(m2$fitted.values, myResiduals, pch=16, col="red", cex=1.2, xlab= "Fitted Value", ylab="Residual", main="Versus Fits"); abline(h=0) #Plot Fitted vs Residual values and add a horizontal line at the mean #3 hist(myResiduals, breaks=10, col="gray", main="Histogram", xlab="Residual") #Plot historgram of residual values #4 plot(m2$fitted.values, myResiduals, pch=16, col="red", cex=1.2, xlab= "Fitted Value", ylab="Residual", main="Versus Fits"); abline(h=0) #Plot Fitted vs Residual values and add a horizontal line at the mean title("Residual Plots for Fertility", line = 0, outer = TRUE) #Add a primary plot title par(oldPar) #Reset the graphical param summary(m2) #Take a look at a summary of the OLS model # END SCRIPT #