##################################################################################### ## ## Intro STATS : SCATTERPLOTS AND CORRELATION ## JDRS, HBG (Last updated Sept. 8, 2017) ## http://www.reuningscherer.net/stat10x/r/ ## ##################################################################################### ## Make a scatterplot and get Pearson's correlation coefficient ## #Get Alleghany Forest Tree Data Forest <- read.csv("http://reuningscherer.net/stat10x/data/Alleghany Forest.csv") #Make scatterplot of Diameter (x) vs Volume (y) (Page 119) plot(Forest$Diameter, Forest$Volume, xlab="Diameter", ylab="Volume", main="Alleghany Forest:\nBlack Cherry", pch=19, col='blue') #Comments on Code Above : # "xlabl"/"ylab" control axis labels; "main" controls primary title; # "pch" controls symbol type (see ?Points); "\n" = carriage return; "col" changes plot character color #Calculate correlation of Volume and Diamter (Page 111) #Method 1 of 2 pearsonCorrelation <- cor(Forest$Diameter, Forest$Volume, method = "pearson") #Comment on code above # Create correlation object. This function returns the correlation coefficient (r) but not a confidence interval or p-value cat("Pearson correlation of Diameter and Volume = ", round(pearsonCorrelation,3)) #Concatenate and print the result #Method 2 or 2 corTestObject <- cor.test(Forest$Diameter, Forest$Volume) #Comment on above # Create a correlation object. This function both computes the correlation and performs a statistical test of H0: the correlation = 0 vs H1: the correlation != 0. cat("Pearson correlation of Diameter and Volume = ", round(corTestObject$estimate,3), "\nP-Value = ", round(corTestObject$p.value, 3)) #Concatenate and print the results ##################################################################################### ##################################################################################### ## Make scatterplots of the World Bank 2013 data and reproduce scatterplots ## ## from Page 104, with various examples of using plotting arguments ## #Get World Bank Data WB <- read.csv("http://reuningscherer.net/stat10x/data/WorldBank2013.csv") #Make a scatterplot of Life Expectancy (x) vs Fertility (y) (Page 104) plot(WB$LifeExp, WB$Fertility, xlab="Life Expectancy", ylab="Fertility", main="World Bank Data 2013:\nFactors Associated with Fertility Rate ", pch=16, col="red", cex=1.2, xlim=c(40,85), ylim=c(0,7.2)) #Comment on above # "col" controls the point color. "xlim"/"ylim" controls the limits of the axes. Note that it requires a two-element vector created with the combine function c(). # If you don't specify the axis ranges the plot axes will be shortened slightly in comparison to Minitab. Generally you don't need to use these ranges. # I use them here to illustrate a common argument. "cex" is a multiplicative factor for scaling the point size. #Make a scatterplot of GNI per Capita (x) vs Fertility (y) (Page 104) plot(WB$GNI, WB$Fertility, xlab="GNI per Capita", ylab="Fertility", main="World Bank Data 2013:\nFactors Associated with Fertility Rate ", pch=16, col="red", cex=1.2, axes=F) #Create the plot but suppress the axes axis(2) #Add the y axis, which has units formatted the way we want my.axis <- as.integer(axTicks(1)) #Create a vector of x-axis labels as integers (instead of in scientific formatting) axis(1, at=axTicks(1), labels=my.axis) #Add the x axis back in with the correct labels box() #Add the bounding box to match default #Make a scatterplot of GNI per Capita (x) vs Fertility (y) with some more complicated graphical paramenters. (Page 104) oldPar <- par() #Store old graphical parameters for a 'reset' at the end par(bg = "papayawhip", fg="black") #Change the plot background color to match notes plot(WB$GNI, WB$Fertility, xlab="GNI per Capita", ylab="Fertility", main="World Bank Data 2013:\nFactors Associated with Fertility Rate ", pch=16, col="red", cex=1.2, axes=F, panel.first= rect(par("usr")[1],par("usr")[3],par("usr")[2],par("usr")[4],col = "white")) #Create the plot but suppress the axes. Extract from the graphical parameters associated with this particular plotting operation the coordinate extremes of the primary plotting area, and use them to create a rectangle that gets plotted first (panel.first), before the point data. axis(2) #Add back in the y axis, which already has units formatted the way we want myAxis <- as.integer(axTicks(1)) #Create a vector of x-axis labels as integers (instead of in scientific formatting) for easy of reading axis(1, at=axTicks(1), labels=my.axis) #Add the x axis back in with the correct labels, making use of the myAxis object par(oldPar) #Reset the graphical parameters to what they were before we changed things ##################################################################################### ##################################################################################### ## Explore the effects of data transformation using the mammal body-brain data ## #Get the mammal data brain <- read.csv("http://reuningscherer.net/stat10x/data/Body Brain Mammals.csv") #Make scatterplot of Body Weight (x) vs Brain Weight (y) (Page 115) #Method 1 of 2 plot(brain$Body.Weight..kg., brain$Brain.Weight..kg., main="Scatterplot of Brain Weight (g) vs. Body Weight (kg)", xlab="Body Weight (kg)", ylab="Brain Weight (g)", xlim=c(-100, 7000),col="red", pch=16, cex=1.2) text(brain$Body.Weight..kg., brain$Brain.Weight..kg., labels = ifelse(brain$Species=="Man" | brain$Species=="Asianelephant" | brain$Species=="Africanelephant", as.character(brain$Species), ""), pos = 1) #Add labels to plot based on a conditional statement. If the Species attribute is Man OR Asianelephant OR Africanelephant, then use the Species attribute to label the point, placing label below point. Otherwise, do nothing (i.e. don't label the point). #Make scatterplot of Body Weight (x) vs Brain Weight (y) (Page 115) #Method 2 of 2 plot(brain$Body.Weight..kg., brain$Brain.Weight..kg., main="Scatterplot of Brain Weight (g) vs. Body Weight (kg)", xlab="Body Weight (kg)", ylab="Brain Weight (g)", col="red", pch=16, cex=1.2) pointsToLabel <- subset(brain, brain$Species=="Man" | brain$Species=="Asianelephant" | brain$Species=="Africanelephant") #Create a subset of the data that contains only those record you want labeled myLabels <- c("African\nelephant", "Asian\nelephant", "Human") #Create a vector of labels points(pointsToLabel$Body.Weight..kg., pointsToLabel$Brain.Weight..kg., type="n") #Add the three new points to the plot invisibly text(pointsToLabel$Body.Weight..kg., pointsToLabel$Brain.Weight..kg., labels = myLabels, pos=1) #Manually add text to the three points, setting text below points. When text goes off the edge of the plot, you need to add additional arguments to plot it where you want it. #Calculate correlation of Body Weight and Brain Weight (Page 115) pearsonCorrelation <- cor(brain$Body.Weight..kg., brain$Brain.Weight..kg., method = "pearson") cat("Pearson correlation of Diameter and Volume = ", round(pearsonCorrelation,3)) text(4500,1500,paste("Pearson correlation of Diameter\nand Volume = ", round(pearsonCorrelation,3))) #Put some text on the plot #--------- #Repeate the above, but make data transformations first. (Page 117) #Create two new log-transformed variables brain$logBody <- log(brain$Body.Weight..kg.) #log() defaults to natural log brain$logBrain <- log(brain$Brain.Weight..kg.) #Make scatterplot of log(Body Weight) (x) vs log(Brain Weight) (y). (Page 117) plot(brain$logBody, brain$logBrain, main="Scatterplot of log(Body Weight) vs. log(Brain Weight)", xlab="log(Body Weight)", ylab="log(Brain Weight)", col="red", pch=16, cex=1.2) pearsonCorrelation <- cor(brain$logBody, brain$logBrain, method = "pearson") #Compute Pearson's correlation coefficient (r) cat("Pearson correlation of Diameter and Volume = ", round(pearsonCorrelation,3)) #Concatenate and print the results text(5,0,paste("Pearson correlation of Diameter\nand Volume = ", round(pearsonCorrelation,3))) #Put some text on the plot # END OF SCRIPT #