#***************************************************************************
#***************************************************************************
##
## Intro STATS:
## SIMULATING CENTRAL LIMIT THEOREM----
## Henry Glick (JDRS) (Last updated Oct. 10, 2017)
## http://www.reuningscherer.net/stat10x/r/
##
#***************************************************************************
#***************************************************************************

## Simulation: Homework 4, problem 7----
## Instructions for simulating the Central Limit Theorem in practice

fakeData <- rexp(1000, rate = 1/1)  #Create exponentially distributed data with n=1000 and mean of 1.

#Create a normal probability plot a la Minitab with a custom function. See
#OLS_and_Residuals_Example_1.R on the course website for details on how this
#function works.
minitabQQ <- function(modelResiduals){
  x <- sort(modelResiduals)
  y <- qnorm(ppoints(length(x)))
  plot(x, y, type = "n", axes = F, ylab = "Percent", xlab = "Residual",
       main = "Normal Probability Plot"); axis(1); box()
  probs <- c(0.01, 0.05, seq(0.1, 0.9, by = 0.1), 0.95, 0.99)
  qprobs <- qnorm(probs)
  axis(2, at = qprobs, labels = 100 * probs)
  points(x, y, pch = 16, col = "red", cex = 0.75)
  xl <- quantile(x, c(0.25, 0.75)); yl <- qnorm(c(0.25, 0.75))
  slope <- diff(yl)/diff(xl); intercept <- yl[1] - slope * xl[1]
  abline(intercept, slope, col = "blue")
}

minitabQQ(fakeData)  #Look at the QQ plot

#Simulate the data for question 7b
myData <- data.frame(matrix(nrow = 1000, ncol = 0))   #Initialize an empty data frame
myMeans <- data.frame(matrix(nrow = 1000, ncol = 0))  #Initialize an empty data frame

#Make 20 columns of exponentially distributed random variables of size 1000
#with mean 1. Simultaneously, compute means of all subsets of columns
#(1; 1 and 2; 1, 2, and 3; etc.).
for (i in 1:20){
  fakeData <- rexp(1000, rate = 1/1)            #Create fake data
  myData <- cbind(myData, fakeData)             #Stitch it together
  myMeans <- cbind(myMeans, rowMeans(myData))   #Stitch together row-by-row means of fake data
}
baseNames <- paste("Sample_", 1:20, sep = "")  #Create names for random variables
meanNames <- paste("Mean_", 1:20, sep = "")    #Create names for means
colnames(myData) <- baseNames                  #Rename random variables
colnames(myMeans) <- meanNames                 #Rename means

oldpar <- par(no.readonly = TRUE)  #Store old (settable) graphical parameters for reset at end; no.readonly avoids warnings when restoring
par(mfrow = c(2, 2))               #Set up a 2x2 plotting window
#For each value in the vector, here representing the number of simulated
#datasets across which we are averaging...
for (i in c(1, 5, 10, 20)){
  minitabQQ(myMeans[, paste("Mean_", i, sep = "")])  #Make a QQ plot of the means
}
par(oldpar)  #Reset graphical parameters
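#Aside (not part of the original homework): the incremental loop above can be
#written more compactly. This is one sketch of an equivalent approach, assuming
#the same simulation setup (20 exponential samples of size 1000, mean 1); the
#names altData/altMeans are illustrative, not from the assignment. It builds
#the full data matrix first, then takes cumulative row means in one step.
altData <- matrix(rexp(1000 * 20, rate = 1/1), nrow = 1000, ncol = 20)        #All 20 samples at once, one per column
altMeans <- sapply(1:20, function(k) rowMeans(altData[, 1:k, drop = FALSE]))  #Column k = mean over the first k samples
#Each column of altMeans averages k exponential draws; by the CLT, columns with
#larger k should look closer to normal on a QQ plot, e.g.:
#minitabQQ(altMeans[, 20])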
#***************************************************************************
#***************************************************************************

## Visualizing sampling distributions----

#Here we perform a simulation experiment in which we repeatedly sample data
#from a distribution with known parameters. Then we compute confidence
#intervals for each sample mean and visualize those intervals on a plot.

#Load the Energy Crisis Data, 1974 dataset
EC1974 <- read.csv("http://reuningscherer.net/stat10x/data/EnvAttitudes.1974.csv")
v35 <- na.omit(EC1974$v35)  #5-point scale of agreement. Question: We should forget about reducing pollution until our energy problems are solved.

numDraws = 200  #How many samples should we get? Experiment by adjusting this value
mu = mean(v35)  #The mean of the distribution from which we are sampling
n = 50          #The sample size we will be drawing
SD = sd(v35)    #The standard deviation of the distribution from which we are sampling

#Simulate normally distributed data. Produce numDraws*n values from a normal
#distribution with mean mu and standard deviation SD, then divide them into as
#many columns as we want samples (numDraws). As written, we are creating 200
#samples of size 50 and storing one sample in each of 200 columns of a matrix.
draws = matrix(rnorm(numDraws * n, mu, SD), n)

#Now we construct 95% confidence intervals for each sample.
get.conf.int = function(x) t.test(x)$conf.int  #Define a function that extracts the confidence interval (conf.int) from a t.test() object.

#Apply our function to every sample (i.e., to each column). The result is a
#matrix with two rows: in each column, the first row contains the lower
#confidence interval boundary and the second row contains the upper boundary.
conf.int = apply(draws, 2, get.conf.int)

#Create a visual. Start with an empty plot to populate, using the range of our
#conf.int matrix and the number of samples to define the limits.
plot(range(conf.int), c(0, 1 + numDraws), type = "n",
     xlab = "Mean Response (Forget Pollution?)", ylab = "Sample Number")

#Draw each interval: x coordinates = the interval values, y coordinate = the
#vertical position given by which numbered draw the sample was. Intervals that
#miss the true mean are drawn in red.
for (i in 1:numDraws){
  if (max(conf.int[, i]) < mean(v35) | min(conf.int[, i]) > mean(v35)){
    lines(conf.int[, i], rep(i, 2), lwd = 0.5, col = "red")
  } else {
    lines(conf.int[, i], rep(i, 2), lwd = 0.5, col = "gray55")
  }
}
abline(v = mean(v35), lwd = 2, lty = 1, col = "green")  #Plot a vertical line at the mean of the sampling distribution

pct <- sum(conf.int[1, ] < mu & conf.int[2, ] > mu)/numDraws  #What proportion of sample confidence intervals cover mu?
usr <- par("usr")  #Extract plotting region coordinates to use in adding text below
text(usr[1] + ((max(conf.int) - min(conf.int))/20), usr[4] - (numDraws/20),
     paste(pct*100, "% of intervals\ncover mu (", round(mu, 3), ")", sep = ""),
     adj = c(0, 1), col = "red")  #Add interesting text to plot
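#Aside (not from the original script): since v35 is a discrete 5-point scale,
#one could also resample directly from the observed responses instead of
#drawing from a normal approximation. A minimal sketch, reusing numDraws, n,
#mu, and get.conf.int from above; drawsResample/ciResample/coverage are
#illustrative names, not from the original.
drawsResample <- replicate(numDraws, sample(v35, n, replace = TRUE))   #Each column is one resample of size n from v35
ciResample <- apply(drawsResample, 2, get.conf.int)                    #95% t-interval for each resample
coverage <- sum(ciResample[1, ] < mu & ciResample[2, ] > mu)/numDraws  #Proportion of intervals covering mu
coverage  #If the t-intervals are valid here, this should be roughly 0.95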