#***************************************************************************
#***************************************************************************
##
## Intro STATS:
## SIMULATING CENTRAL LIMIT THEOREM----
## Henry Glick (JDRS) (Last updated Oct. 10, 2017)
## http://www.reuningscherer.net/stat10x/r/
##
#***************************************************************************
#***************************************************************************

## Simulation: Homework 4, problem 7----
## Instructions for simulating the Central Limit Theorem in practice

fakeData <- rexp(1000, rate = 1/1)  #Create exponentially distributed data with n=1000 and mean of 1.

#Create a normal probability plot a la Minitab with a custom function. See
#OLS_and_Residuals_Example_1.R on the course website for details on how this
#function works.
minitabQQ <- function(modelResiduals){
  x <- sort(modelResiduals)
  y <- qnorm(ppoints(length(x)))
  plot(x, y, type = "n", axes = F, ylab = "Percent", xlab = "Residual",
       main = "Normal Probability Plot"); axis(1); box()
  probs <- c(0.01, 0.05, seq(0.1, 0.9, by = 0.1), 0.95, 0.99)
  qprobs <- qnorm(probs)
  axis(2, at = qprobs, labels = 100 * probs)
  points(x, y, pch = 16, col = "red", cex = 0.75)
  xl <- quantile(x, c(0.25, 0.75)); yl <- qnorm(c(0.25, 0.75))
  slope <- diff(yl)/diff(xl); intercept <- yl[1] - slope * xl[1]
  abline(intercept, slope, col = "blue")
}

minitabQQ(fakeData)  #Look at the QQ plot

#Simulate the data for question 7b
myData <- data.frame(matrix(nrow = 1000, ncol = 0))   #Initialize an empty data frame
myMeans <- data.frame(matrix(nrow = 1000, ncol = 0))  #Initialize an empty data frame

#Make 20 columns of exponentially distributed random variables of size 1000
#with mean 1. Simultaneously, compute means of all subsets of columns
#(1; 1 and 2; 1, 2, and 3; etc.).
for (i in 1:20){
  fakeData <- rexp(1000, rate = 1/1)            #Create fake data
  myData <- cbind(myData, fakeData)             #Stitch it together
  myMeans <- cbind(myMeans, rowMeans(myData))   #Stitch together row-by-row means of fake data
}
baseNames <- paste("Sample_", 1:20, sep = "")  #Create names for random variables
meanNames <- paste("Mean_", 1:20, sep = "")    #Create names for means
colnames(myData) <- baseNames                  #Rename random variables
colnames(myMeans) <- meanNames                 #Rename means

oldpar <- par(no.readonly = TRUE)  #Store old (settable) graphical parameters for reset at end; no.readonly avoids warnings when restoring
par(mfrow = c(2, 2))               #Set up a 2x2 plotting window
#For each value in the vector, here representing the number of simulated
#datasets across which we are averaging...
for (i in c(1, 5, 10, 20)){
  minitabQQ(myMeans[, paste("Mean_", i, sep = "")])  #Make a QQ plot of the means
}
par(oldpar)  #Reset graphical parameters
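#Aside (not part of the original homework): the incremental loop above can be
#written more compactly. This is one sketch of an equivalent approach, assuming
#the same simulation setup (20 exponential samples of size 1000, mean 1); the
#names altData/altMeans are illustrative, not from the assignment. It builds
#the full data matrix first, then takes cumulative row means in one step.
altData <- matrix(rexp(1000 * 20, rate = 1/1), nrow = 1000, ncol = 20)        #All 20 samples at once, one per column
altMeans <- sapply(1:20, function(k) rowMeans(altData[, 1:k, drop = FALSE]))  #Column k = mean over the first k samples
#Each column of altMeans averages k exponential draws; by the CLT, columns with
#larger k should look closer to normal on a QQ plot, e.g.:
#minitabQQ(altMeans[, 20])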
#***************************************************************************
#***************************************************************************

## Visualizing sampling distributions----

#Here we perform a simulation experiment in which we repeatedly sample data
#from a distribution with known parameters. Then we compute confidence
#intervals for each sample mean and visualize those intervals on a plot.

#Load the Energy Crisis Data, 1974 dataset
EC1974 <- read.csv("http://reuningscherer.net/stat10x/data/EnvAttitudes.1974.csv")
v35 <- na.omit(EC1974$v35)  #5-point scale of agreement. Question: We should forget about reducing pollution until our energy problems are solved.

numDraws = 200  #How many samples should we get? Experiment by adjusting this value
mu = mean(v35)  #The mean of the distribution from which we are sampling
n = 50          #The sample size we will be drawing
SD = sd(v35)    #The standard deviation of the distribution from which we are sampling

#Simulate normally distributed data. Produce numDraws*n values from a normal
#distribution with mean mu and standard deviation SD, then divide them into as
#many columns as we want samples (numDraws). As written, we are creating 200
#samples of size 50 and storing one sample in each of 200 columns of a matrix.
draws = matrix(rnorm(numDraws * n, mu, SD), n)

#Now we construct 95% confidence intervals for each sample.
get.conf.int = function(x) t.test(x)$conf.int  #Define a function that extracts the confidence interval (conf.int) from a t.test() object.

#Apply our function to every sample (i.e., to each column). The result is a
#matrix with two rows: in each column, the first row contains the lower
#confidence interval boundary and the second row contains the upper boundary.
conf.int = apply(draws, 2, get.conf.int)

#Create a visual. Start with an empty plot to populate, using the range of our
#conf.int matrix and the number of samples to define the limits.
plot(range(conf.int), c(0, 1 + numDraws), type = "n",
     xlab = "Mean Response (Forget Pollution?)", ylab = "Sample Number")

#Draw each interval: x coordinates = the interval values, y coordinate = the
#vertical position given by which numbered draw the sample was. Intervals that
#miss the true mean are drawn in red.
for (i in 1:numDraws){
  if (max(conf.int[, i]) < mean(v35) | min(conf.int[, i]) > mean(v35)){
    lines(conf.int[, i], rep(i, 2), lwd = 0.5, col = "red")
  } else {
    lines(conf.int[, i], rep(i, 2), lwd = 0.5, col = "gray55")
  }
}
abline(v = mean(v35), lwd = 2, lty = 1, col = "green")  #Plot a vertical line at the mean of the sampling distribution

pct <- sum(conf.int[1, ] < mu & conf.int[2, ] > mu)/numDraws  #What proportion of sample confidence intervals cover mu?
usr <- par("usr")  #Extract plotting region coordinates to use in adding text below
text(usr[1] + ((max(conf.int) - min(conf.int))/20), usr[4] - (numDraws/20),
     paste(pct*100, "% of intervals\ncover mu (", round(mu, 3), ")", sep = ""),
     adj = c(0, 1), col = "red")  #Add interesting text to plot
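#Aside (not from the original script): since v35 is a discrete 5-point scale,
#one could also resample directly from the observed responses instead of
#drawing from a normal approximation. A minimal sketch, reusing numDraws, n,
#mu, and get.conf.int from above; drawsResample/ciResample/coverage are
#illustrative names, not from the original.
drawsResample <- replicate(numDraws, sample(v35, n, replace = TRUE))   #Each column is one resample of size n from v35
ciResample <- apply(drawsResample, 2, get.conf.int)                    #95% t-interval for each resample
coverage <- sum(ciResample[1, ] < mu & ciResample[2, ] > mu)/numDraws  #Proportion of intervals covering mu
coverage  #If the t-intervals are valid here, this should be roughly 0.95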