#####################################################################################
##
##  Intro STATS : SCATTERPLOTS AND CORRELATION
##  JDRS, HBG (Last updated Sept. 8, 2017)
##  http://www.reuningscherer.net/stat10x/r/
##
#####################################################################################

## Make a scatterplot and get Pearson's correlation coefficient ##

#Get Alleghany Forest Tree Data
Forest <- read.csv("http://reuningscherer.net/stat10x/data/Alleghany Forest.csv")

#Make scatterplot of Diameter (x) vs Volume (y) (Page 119)
plot(Forest$Diameter, Forest$Volume, xlab="Diameter", ylab="Volume", main="Alleghany Forest:\nBlack Cherry", pch=19, col='blue') 
#Comments on Code Above :
#  "xlabl"/"ylab" control axis labels; "main" controls primary title; 
#  "pch" controls symbol type (see ?Points); "\n" = carriage return; "col" changes plot character color


#Calculate correlation of Volume and Diamter (Page 111)
#Method 1 of 2
pearsonCorrelation <- cor(Forest$Diameter, Forest$Volume, method = "pearson") 
#Comment on code above
#   Create correlation object. This function returns the correlation coefficient (r) but not a confidence interval or p-value

cat("Pearson correlation of Diameter and Volume = ", round(pearsonCorrelation,3)) 
#Concatenate and print the result

#Method 2 or 2
corTestObject <- cor.test(Forest$Diameter, Forest$Volume) 
#Comment on above
#  Create a correlation object. This function both computes the correlation and performs a statistical test of H0: the correlation = 0 vs H1: the correlation != 0.

cat("Pearson correlation of Diameter and Volume = ", round(corTestObject$estimate,3), "\nP-Value = ", round(corTestObject$p.value, 3)) #Concatenate and print the results


#####################################################################################
#####################################################################################


## Make scatterplots of the World Bank 2013 data and reproduce scatterplots ##
## from Page 104, with various examples of using plotting arguments         ##

#Get World Bank Data
WB <- read.csv("http://reuningscherer.net/stat10x/data/WorldBank2013.csv")

#Make a scatterplot of Life Expectancy (x) vs Fertility (y) (Page 104)
plot(WB$LifeExp, WB$Fertility, xlab="Life Expectancy", ylab="Fertility", main="World Bank Data 2013:\nFactors Associated with Fertility Rate ", pch=16, col="red", cex=1.2, xlim=c(40,85), ylim=c(0,7.2)) 
#Comment on above
#  "col" controls the point color. "xlim"/"ylim" controls the limits of the axes. Note that it requires a two-element vector created with the combine function c(). 
#  If you don't specify the axis ranges the plot axes will be shortened slightly in comparison to Minitab. Generally you don't need to use these ranges. 
#  I use them here to illustrate a common argument.  "cex" is a multiplicative factor for scaling the point size. 

#Make a scatterplot of GNI per Capita (x) vs Fertility (y) (Page 104)
plot(WB$GNI, WB$Fertility, xlab="GNI per Capita", ylab="Fertility", main="World Bank Data 2013:\nFactors Associated with Fertility Rate ", pch=16, col="red", cex=1.2, axes=F) 
#Create the plot but suppress the axes 

axis(2) #Add the y axis, which has units formatted the way we want
my.axis <- as.integer(axTicks(1)) #Create a vector of x-axis labels as integers (instead of in scientific formatting)
axis(1, at=axTicks(1), labels=my.axis) #Add the x axis back in with the correct labels
box() #Add the bounding box to match default

#Make a scatterplot of GNI per Capita (x) vs Fertility (y) with some more complicated graphical paramenters. (Page 104)
oldPar <- par() #Store old graphical parameters for a 'reset' at the end
par(bg = "papayawhip", fg="black") #Change the plot background color to match notes
plot(WB$GNI, WB$Fertility, xlab="GNI per Capita", ylab="Fertility", main="World Bank Data 2013:\nFactors Associated with Fertility Rate ", pch=16, col="red", cex=1.2, axes=F, panel.first= rect(par("usr")[1],par("usr")[3],par("usr")[2],par("usr")[4],col = "white")) #Create the plot but suppress the axes. Extract from the graphical parameters associated with this particular plotting operation the coordinate extremes of the primary plotting area, and use them to create a rectangle that gets plotted first (panel.first), before the point data. 
axis(2) #Add back in the y axis, which already has units formatted the way we want
myAxis <- as.integer(axTicks(1)) #Create a vector of x-axis labels as integers (instead of in scientific formatting) for easy of reading
axis(1, at=axTicks(1), labels=my.axis) #Add the x axis back in with the correct labels, making use of the myAxis object
par(oldPar) #Reset the graphical parameters to what they were before we changed things


#####################################################################################
#####################################################################################

## Explore the effects of data transformation using the mammal body-brain data ##

#Get the mammal data
brain <- read.csv("http://reuningscherer.net/stat10x/data/Body Brain Mammals.csv")

#Make scatterplot of Body Weight (x) vs Brain Weight (y) (Page 115)
#Method 1 of 2
plot(brain$Body.Weight..kg., brain$Brain.Weight..kg., main="Scatterplot of Brain Weight (g) vs. Body Weight (kg)", xlab="Body Weight (kg)", ylab="Brain Weight (g)", xlim=c(-100, 7000),col="red", pch=16, cex=1.2)
text(brain$Body.Weight..kg., brain$Brain.Weight..kg., labels = ifelse(brain$Species=="Man" | brain$Species=="Asianelephant" | brain$Species=="Africanelephant", as.character(brain$Species), ""), pos = 1) 
#Add labels to plot based on a conditional statement. If the Species attribute is Man OR Asianelephant OR Africanelephant, then use the Species attribute to label the point, placing label below point. Otherwise, do nothing (i.e. don't label the point).

#Make scatterplot of Body Weight (x) vs Brain Weight (y) (Page 115)
#Method 2 of 2
plot(brain$Body.Weight..kg., brain$Brain.Weight..kg., main="Scatterplot of Brain Weight (g) vs. Body Weight (kg)", xlab="Body Weight (kg)", ylab="Brain Weight (g)", col="red", pch=16, cex=1.2)
pointsToLabel <- subset(brain, brain$Species=="Man" | brain$Species=="Asianelephant" | brain$Species=="Africanelephant") #Create a subset of the data that contains only those record you want labeled
myLabels <- c("African\nelephant", "Asian\nelephant", "Human") #Create a vector of labels
points(pointsToLabel$Body.Weight..kg., pointsToLabel$Brain.Weight..kg., type="n") #Add the three new points to the plot invisibly
text(pointsToLabel$Body.Weight..kg., pointsToLabel$Brain.Weight..kg., labels = myLabels, pos=1) #Manually add text to the three points, setting text below points. When text goes off the edge of the plot, you need to add additional arguments to plot it where you want it.

#Calculate correlation of Body Weight and Brain Weight (Page 115)
pearsonCorrelation <- cor(brain$Body.Weight..kg., brain$Brain.Weight..kg., method = "pearson")
cat("Pearson correlation of Diameter and Volume = ", round(pearsonCorrelation,3))
text(4500,1500,paste("Pearson correlation of Diameter\nand Volume = ", round(pearsonCorrelation,3))) #Put some text on the plot

#---------

#Repeate the above, but make data transformations first. (Page 117)

#Create two new log-transformed variables
brain$logBody <- log(brain$Body.Weight..kg.) #log() defaults to natural log
brain$logBrain <- log(brain$Brain.Weight..kg.)

#Make scatterplot of log(Body Weight) (x) vs log(Brain Weight) (y). (Page 117)
plot(brain$logBody, brain$logBrain, main="Scatterplot of log(Body Weight) vs. log(Brain Weight)", xlab="log(Body Weight)", ylab="log(Brain Weight)", col="red", pch=16, cex=1.2)
pearsonCorrelation <- cor(brain$logBody, brain$logBrain, method = "pearson") #Compute Pearson's correlation coefficient (r)
cat("Pearson correlation of Diameter and Volume = ", round(pearsonCorrelation,3)) #Concatenate and print the results
text(5,0,paste("Pearson correlation of Diameter\nand Volume = ", round(pearsonCorrelation,3))) #Put some text on the plot

# END OF SCRIPT #