#*************************************************************************** #*************************************************************************** ## ## Intro STATS: ## CONFIDENCE INTERVALS AND HYPOTHESIS TESTS---- ## Henry Glick (JDRS) (Last updated Oct. 12, 2017) ## http://www.reuningscherer.net/stat10x/r/ ## #*************************************************************************** #*************************************************************************** ## Read Me!---- #This script is long and has many examples. To make navigation easy, it is in your best interest to use the RStudio "Document Outline" pane (a hyperlinked table of contents) to quickly navigate to relevant portions of the script. Each heading followed by "----" defines a new section. Subsections are separated with "*****". In most cases the computations are demonstrated using both built-in functions ("automated" approaches) and manual computation. Page reference numbers apply to lecture notes as of Autumn 2017. ## Load data---- #Load various datasets used below #Load the Energy Crisis Data, 1974 dataset EC1974 <- read.csv("http://reuningscherer.net/stat10x/data/EnvAttitudes.1974.csv") v35 <- na.omit(EC1974$v35) #Obtain environmental attitudes data from EC1974. Be sure to clean the data of missing values since the NAs cause problems #Southern Focus Survey, 1997 SFocus <- read.csv("http://reuningscherer.net/stat10x/data/SouthernFocus1997RED.csv") #Treated Tick data waterTicks <- c(1,1,1, rep(0,89)) #Ticks treated with water fungusTicks <- c(rep(0,36), rep(1,40)) #Ticks treated with fungus #RoundUp roundup <- read.csv("http://reuningscherer.net/stat10x/data/RoundUp.csv") #World Bank Year 2000 WB <- read.csv("http://reuningscherer.net/stat10x/data/WB.2000.csv") WB <- WB[which(WB$gnipc>=500),] #As noted in the video "Two Sample Tests: Pooled Tests", this dataset is subset to include only those countries with aa GNI per capita of at least $500 illitfem <- na.omit(WB$illitfem) #Be sure to clean the data of missing values since the NAs cause problems illitmal <- na.omit(WB$illitmal) #Be sure to clean the data of missing values since the NAs cause problems #***************************************************************** #***************************************************************** ## One-sample confidence intervals (p. 395)---- #.....Automated: Two-tail t-distribution (p. 391-395)---- #Automatically compute a two-tailed one-sample t-test to obtain a confidence interval for the sample mean. t.test(x=v35, alternative="two.sided", conf.level = 0.95) #(p. 394) This function can be used in several ways. Here we are using it solely to obtain a confidence interval for the sample mean by taking the mean and adding or subtracting the critical t-value*(sd/sqrt(n)) to obtain a confidence interval for the mean. By default this function also performs a hypothesis test (covered below). When we use this function solely for a one-sample confidence interval, the function implicitly tests the sample mean against the value of 0 to evaluate a difference. Therefore, the t-statistic that is returned is not particularly interesting since it is the t-value associated with the sample mean should the true distribution have had a mean of 0 (which is a silly thing to think). Just ignore this information. #.....Manual: Two-tail t-distribution (p. 391-395)---- #Manually compute a two-tailed one-sample t-test to obtain a confidence interval for the sample mean. n <- length(v35); names(n) <- "n" #Obtain and name n. names() sets an attribute so the statistics are labeled when they are joined together at the end. s <- sd(v35); names(s) <- "stdev" #Obtain and name standard deviation myMean <- mean(v35); names(myMean) <- "mean" #Obtain and name mean criticalT <- qt(1-0.05/2, df=n-1); names(criticalT) <- "t-value" #Obtain and name a two-tailed critical t-value associated with a 95% confidence interval se <- s/sqrt(n); names(se) <- "se" #Obtain and name standard error of the mean error <- criticalT*se #Compute marginOfError lowerBound <- mean(v35)-error #Compute lower CI boundary upperBound <- mean(v35)+error #Compute upper CI boundary CI95 <- paste("(",round(lowerBound,4),", ",round(upperBound,4),")", sep=""); names(CI95) <- "CI_95%" #Concatenate and name the complete confidence interval data.frame("v35", n, s, se, myMean, criticalT, CI95) #(p. 394) Take a look at the data. #*************************************************************************** ##.....Automated: Two-tail z-distribution (p. 382)---- #This presumes that the TRUE VARIANCE IS KNOWN for the the population of interest #R does not have a z-test function built-in, so we create one to match the t.test() function. Just run the function and then use it below. Note that this function can handle different confidence levels and both two-tailed and one-tailed tests (covered below)... z.test <- function(dataVector, sigma, confidence, alternative){ #The function takes four arguments: your data (as a vector of values), the true standard deviation, your chosen level of confidence, and the type of hypothesis test ("less", "greater", or "two.sided"). names(sigma) <- "sigma" #Create a name for your sigma argument to enhance readability of output n <- length(dataVector) #Get n se <- sigma/sqrt(length(dataVector)); names(se) <- "se" #Compute and name standard error myMean <- mean(dataVector) #Get the sample mean #Set up conditional reporting based the tail choice of the user... if(alternative == "less"){ criticalZ <- qnorm((1-confidence), lower.tail = T); names(criticalZ) <- "z-value" #Compute critical z-value used to evaluate statistical significance. One-tailed, testing for greater than or equal to. CI95 <- paste("(-Inf, ", round(myMean+criticalZ*se,4),")", sep=""); names(CI95) <- "CI_95%" #Compute and name a confidence interval per page 382 } else if (alternative == "greater"){ criticalZ <- qnorm((1-confidence), lower.tail = F); names(criticalZ) <- "z-value" #Compute critical z-value used to evaluate statistical significance. One-tailed, testing for less than or equal to. CI95 <- paste("(", round(myMean-criticalZ*se,4),", Inf)", sep=""); names(CI95) <- "CI_95%" #Compute and name a confidence interval per page 382 } else if (alternative=="two.sided"){ criticalZ <- qnorm((1-confidence)/2, lower.tail = F); names(criticalZ) <- "z-value" #Compute critical z-value used to evaluate statistical significance. Two-tailed, testing for any kind of difference. CI95 <- paste("(",round(myMean-criticalZ*se,4),", ", round(myMean+criticalZ*se,4),")", sep=""); names(CI95) <- "CI_95%" #Compute and name a confidence interval per page 382 } return(data.frame("v35", n, sigma, se, myMean, criticalZ, CI95)) #Return from the function a printout of the useful information } z.test(dataVector=v35, sigma=sd(v35), confidence= 0.95, alternative = "two.sided") #We can use function to compute a two-tailed one-sample z-test, treating our sample standard deviation as the true standard deviation. See function for details on arguments. z.test(dataVector=v35[1:50], sigma=sd(v35[1:50]), confidence= 0.95, "two.sided") #Notice how reducing the size of the sample increases the width of the confidence interval since with fewer records we can be less confident. As sample size increases, the results of z.test() will approach those of t.test(). #*************************************************************************** #*************************************************************************** #.....Automated: Binomial z-distribution (p. 398-402)---- #Replicate the treated tick data described in the notes waterTicks <- c(1,1,1, rep(0,89)) #Ticks treated with water fungusTicks <- c(rep(0,36), rep(1,40)) #Ticks treated with fungus #One-sample proportion test for binomials to create a confidence interval for binomial proportion. Replicating binomial example of treated ticks (page 399, video: Confidence Intervals for Binomials) #Tests of binomial proportions can be computed using the prop.test() function with x = number of success for each trial and n = number of trials. prop.test(x=sum(fungusTicks), n=length(fungusTicks), conf.level = 0.95, correct = TRUE) #Continuity correction is TRUE (engaged) by default. #.....Manual: Binomial z-distribution (ex. 1, 398-402)----- n <- length(fungusTicks) #Number of records pHat <- sum(fungusTicks)/n #Compute estimated proportion criticalZ <- qnorm((1-0.95)/2, lower.tail = F) #Compute critical z-value to use in margin of error computation using **95%** confidence error <- criticalZ*sqrt((pHat*(1-pHat))/n) #Compute error CI95 <- paste("(",round(pHat-error,4),", ", round(pHat+error,4),")", sep=""); names(CI95) <- "CI_95%" cat("Two-tailed CI for pHat = ", CI95) #.....Manual: Binomial z-distribution (ex. 2, p. 400)----- #Replicating binomial example of American beliefs (page 400, video: Confidence Intervals for Binomials) creationists <- c(rep(1,380), rep(0,1000-380)) n <- length(creationists) #Number of records pHat <- sum(creationists)/n #Compute estimated proportion criticalZ <- qnorm((1-0.99)/2, lower.tail = F) #Compute critical z-value to use in error computation using **99%** confidence error <- criticalZ*sqrt((pHat*(1-pHat))/n) #Compute error CI95 <- paste("(",round(pHat-error,4),", ", round(pHat+error,4),")", sep=""); names(CI95) <- "CI_95%" cat("Two-tailed CI for pHat = ", CI95) #*************************************************************************** #*************************************************************************** # One-sample tests: One mean (p. 418)---- #Both two-tailed and one-tailed examples are shown using the EC1974 data and gas price data #One-sample hypothesis tests for one mean. Examples using environmental attitudes data. #.....Automated: Two-tail (p. 418)---- t.test(v35, mu=3, alternative="two.sided", conf.level = 0.95) #Compute a two-tailed one-sample t-test to evaluate whether the sample mean is significantly DIFFERENT (in no particular direction) from some stipulated mean value ("mu"). Here I have just made up a value for mu as an example. Notice that the confidence interval [2.74, 2.92] does not include mu. The sample mean is therefore significantly different from mu and the probability of obtaining the sample mean given that it's true distribution had mean mu, is 0.0002. We DO NOT know the nature (direction) of the difference between the mean of v35 and mu. #.....Automated: One-tail (p. 418)---- t.test(v35,mu=3, alternative="less", conf.level = 0.95) #Compute a one-tailed one-sample t-test to evaluate whether the sample mean is significantly LESS THAN some stipulated mean (mu). Notice that the confidence interval [-infinity, 2.91] does not include mu. The sample mean is therefore significantly LESS THAN mu and the probability of obtaining the sample mean given that it's true distribution had mean mu, is 0.0001. t.test(v35, mu=2.5, alternative="greater", conf.level = 0.95) #Compute a one-tailed one-sample t-test to evaluate whether the sample mean is significantly GREATER THAN some stipulated mean (mu). Notice that the confidence interval [2.76, +infinity] does not include mu. The sample mean is therefore significantly GREATER THAN mu and the probability of obtaining the sample mean given that it's true distribution had mean mu, is essentially 0. #.....Manual: Two-tail (p. 393)---- #Two-tailed manual example using gas price data (page 393, video Hypothesis Tests: Two-Sample Tests). Here our sample of data are computed differences. Thus, we are evaluating whether the mean difference in gas prices is significantly different from 0 (two-tailed test). n <- 10 s <- 0.614278 se <- s/sqrt(n) myMean <- -0.606000 H0 <- 0 criticalT <- qt(1-0.05/2, df=n-1); names(criticalT) <- "t-value" #Obtain and name critical t-value associated with a 95% confidence interval error <- criticalT*se #Compute marginOfError lowerBound <- myMean-error #Compute lower CI boundary upperBound <- myMean+error #Compute upper CI boundary CI95 <- paste("(",round(lowerBound,4),", ",round(upperBound,4),")", sep=""); names(CI95) <- "CI_95%" #Concatenate and name the complete confidence interval tStat <- (myMean-H0)/(s/sqrt(n)) #Page 415. Note, there is an extraneous sqrt sign in the lecture notes. Values from page 419. twoSidedPValue <- 2 * pt(q=tStat, df=9) #Page 416. Get p-value for a two-sided t-test, evaluating whether mean difference is different from 0. ifelse(twoSidedPValue <= 0.05, paste("This test returns a statistically significant test statistic of: ", round(tStat,4),"with a p-value of: ", round(twoSidedPValue, 4), " and a confidence interval of: ", CI95), "This test does not return a statistically significant result") #.....Manual: One-tail (p. 419)---- #One-sided tests done manually using Gas in Cabs example on page 419 (two-sided tests shown above). Here we test to see if the difference between the two gases is LESS THAN OR EQUAL to 0. n <- 10; s <- 0.614278; se <- s/sqrt(10); myMean <- -0.606000; mu <- 0 tStat <- (myMean-mu)/(s/sqrt(n)) #Page 415. Note, there is an extraneous sqrt sign in the lecture notes. Values from page 419. oneSidedPValue <- pt(q=tStat, df=n-1) #Page 419. Get p-value for a one-sided t-test, evaluating whether mean is greater than or less than 0. criticalT <- qt(1-0.05, df=n-1) errorOneTail <- criticalT*se #Compute marginOfError upperBound <- myMean+error #Compute upper CI boundary #Print out the results CI95OneTail <- paste("(Inf, ", round(upperBound,4),")", sep=""); names(CI95) <- "CI_95%" #Compute confidence interval cat("This test produced a t-statistic of ", tStat, " with probability ", oneSidedPValue, "\nA 95% CI is ", CI95OneTail, " with a margin of error", errorOneTail) #Show results. Notice that the interval does not contain 0. #*************************************************************************** #*************************************************************************** # One-sample tests: One proportion (p. 428)---- #Both two-tailed and one-tailed examples are shown using the Treated Tick data #..... Automated: Two-tail (p. 423-428)---- prop.test(x=sum(fungusTicks), n=length(fungusTicks), p = 0.03, alternative = "two.sided", conf.level = 0.95, correct = T) #An optional p argument controls what you think the proportion should be (equivalent to "mu" when using t.test function), and performs a test between the sample proportion and your stipulated p. Continuity correction is TRUE (engaged) by default (Page 428). #.....Automated: One-tail (p. 427)---- prop.test(x=sum(fungusTicks), n=length(fungusTicks), p = 0.03, alternative = "less", conf.level = 0.95, correct = T) #As above, but test to see if sample proportion (pHat) is <= 0.03. #One-tailed, automated example: prop.test(x=sum(fungusTicks), n=length(fungusTicks), p = 0.03, alternative = "greater", conf.level = 0.95, correct = T) #As above, but test to see if sample proportion (pHat) is >= 0.03. #.....Manual: Two-tail (p. 423-428)---- n <- length(fungusTicks) #Number of records pHat <- sum(fungusTicks)/n #Compute estimated proportion pHat <- 0.52 p0 <- 0.03 #The usual death rate for ticks over a given time span (page 423) zStat <- (pHat-p0)/sqrt(p0*(1-p0)/n) #Compute the z statistic associated with the difference between observed and true proportions (page 424). pValue <- pnorm(abs(zStat), lower.tail = F) #What is the probability of observing a z-statistic GREATER THAN OR EQUAL TO zStat? About 0. Thus, the fungus death rate is significantly greater than the p0 death rate. (Page 427) cat("pValue is: ", pValue) #Compute confidence intervals as desired... criticalZTwoTail <- qnorm((1-0.95)/2, lower.tail = F) #Compute two-tailed critical z to match page 428. errorTwoTail <- criticalZTwoTail*sqrt(pHat*(1-pHat)/n)#Compute margin of error for two-tailed test CI95TwoTail <- paste("(",round(pHat-errorTwoTail,4),", ", round(pHat+errorTwoTail,4),")", sep=""); names(CI95) <- "CI_95%" #Compute confidence interval (Page 428) cat("CI for pHat (", pHat, ") = ", CI95TwoTail, " with margin of error", errorTwoTail) #Showr results #.....Manual: One-tail (p. 423-427)---- #Making use of values from the automated approach direclty above... criticalZOneTail <- qnorm((1-0.95), lower.tail = F) #Compute one-tailed critical z-value to use in margin of error computation using **95%** confidence (page 398). Note that the tick example in the notes (page 428) reports a two-tailed confidence interval even though the hypothesis test is directional. errorOneTail <- criticalZOneTail*sqrt(pHat*(1-pHat)/n) #Compute margin of error for one-tailed test CI95OneTail <- paste("(", round(pHat-errorOneTail,4),", Inf)", sep=""); names(CI95) <- "CI_95%" #Compute confidence interval cat("CI for pHat (", pHat, ") = ", CI95OneTail, " with margin of error", errorOneTail) #Showr results #*************************************************************************** #*************************************************************************** #Two-sample t-tests: Two means (p. 451)---- #Two-sample t-test to evaluate the difference in two means (i.e., determine whether two samples came from different distributions) #Example 1: Compare two concentrations of roundup for radishes using a one-tailed two-sample t-test #Prepare the data... roundup$Species <- as.character(roundup$Species) #Convert the "Species" variable of the "roundup" dataset from class "factor" to class "character" so we can employ conditional statements radish0 <- roundup[roundup$Species=="Radish" & roundup$Concentration==0, "Dry.Weight"] #Subset the roundup dataset to obtain only those records where Species= Radish and Concentration = 0%. radish100 <- roundup[roundup$Species=="Radish" & roundup$Concentration==1, "Dry.Weight"] #Subset the roundup dataset to obtain only those records where Species= Radish and Concentration = 100%. #Automated t-test... #.....Automated: Two-tailed (ex. 1, p. 445-449, 456)---- #Compare central confidence interval with non-central confidence interval t.test(radish0, radish100, alternative = "two.sided") #Page 455-456. Note that this interval contains 0 and is therefore not statistically significant given a stipulated alpha of 0.05. #.....Automated: One-tailed (ex. 1, p. 452-455)---- t.test(radish0, radish100, alternative = "greater") #(Page 448, 455-456) Here we use "greater" to perform a one-tailed test in which our alternative hypothesis is that the second sample (radish100) has a significantly greater mean than our first sample (radish0) #.....Manual: Two-tailed (ex. 1, p. 445-449, 456)---- tStat <- (mean(radish0)-mean(radish100)-0)/sqrt((var(radish0)/length(radish0))+(var(radish100)/length(radish100)))#Get t-statistic (page 447-448, 449). Here we subtract the hypothesized difference between the means (0) from the difference in the means pValue <- pt(tStat, df=min(length(radish0),length(radish100))-1, lower.tail = F)#Next get probability of obtaining that t-statistic. Here we look at the upper tail only, since we are interested in the probability of obtaining a test statistics >= tStat myDF=min(length(radish0), length(radish100))-1 criticalT <- qt(1-0.05/2, df=min(length(radish0), length(radish100))-1, lower.tail = F) s <- sqrt((var(radish0)/length(radish0))+(var(radish100)/length(radish100))) #Sample SD of the difference in two means, ASSUMING UNEQUAL VARIANCES. See the following for details on Minitab's compuations based on assuming equal vs unequal variances: http://support.minitab.com/en-us/minitab-express/1/help-and-how-to/basic-statistics/inference/how-to/two-samples/2-sample-t/methods-and-formulas/methods-and-formulas/ differenceInMeans <- mean(radish0)-mean(radish100) #Difference between sample means CI95TwoTail <- paste("(",round(differenceInMeans+criticalT*s,4),", ", round(differenceInMeans-criticalT*s,4),")", sep=""); names(CI95TwoTail) <- "CI_95%" #Compute confidence interval (Page 456) cat("The computed t-statistic = ", tStat, " with probability =", pValue, " and a two-sided CI of: ", CI95TwoTail) #Print results per page 456 #.....Manual: One-tailed (p. 452-455)---- #Repeat, but make a confidence interval that is one-sided criticalT <- qt(1-0.05, df=myDF, lower.tail = F) #Obtain critical t-value associated with a non-central 95% confidence interval, looking to test that difference between the two sample means is > 0. CI95OneTail <- paste("(",round(differenceInMeans+criticalT*s,4), ", Inf)",sep=""); names(CI95OneTail) <- "CI_95%" #Compute confidence interval (Page 428) cat("The computed t-statistic = ", tStat, " with probability =", pValue, " and a one-sided CI of: ", CI95OneTail) #Print results per page 452 #*************************************************************************** #Example 2: Compare the frequency with which people from two different income brackets contemplate their own deaths (Page 457) #DEATH = On a scale of 1 to 10, with 1 being never and 10 being all the time, how often would you say you think about your own death? #I-INCOME = I am going to read you a list of income categories. Please stop me when a category best describes your total household income before taxes. (Recoded for use with online analysis); RANGE: 1 to 7 lowIncome <- SFocus[SFocus$I.INCOME==1, "DEATH"] #Get the death contemplation ratings for those in income bracket 1. highIncome <- SFocus[SFocus$I.INCOME==7, "DEATH"] #Get the death contemplation ratings for those in income bracket 7. lowIncome <- na.omit(lowIncome) #Make your life easy by removing NA values now... highIncome <- na.omit(highIncome) #.....Automated: Two-tailed (ex. 2, p. 457)---- t.test(lowIncome, highIncome, alternative = "two.sided") #Perform a two-sided hypothesis test since we have no prior ideas about how the means of these two samples will differ (Page 457). #.....Manual: Two-tailed (ex. 2, p. 457)---- mean1 <- mean(lowIncome) #Get mean mean2 <- mean(highIncome) n1 <- length(lowIncome) #Get n n2 <- length(highIncome) var1 <- var(lowIncome) #Get variance var2 <- var(highIncome) tStat <- ((mean1-mean2)-0)/(sqrt((var1/n1)+(var2/n2))) #Get t-statistic (page 447-448, 449) #Get probability of obtaining that t-statistic. Here we rely on pooled degrees of freedom (page 457) #pooledDFv1 <- length(na.omit(lowIncome))+length(na.omit(highIncome)-2) #Per page 449 pooledDF = floor((var(lowIncome)+var(highIncome)^2)/((var(lowIncome)/((length(lowIncome)-1))) + ((var(highIncome)/(length(highIncome)-1))))) #DF with unequal variances according to Minitab pValue <- pt(tStat, df= pooledDF, lower.tail = F) #Use the pooled degrees of freedom, computed as n1 + n2 -2 s <- sqrt((var(lowIncome)/length(lowIncome))+(var(highIncome)/length(highIncome))) #Sample SD of the difference in two means, ASSUMING UNEQUAL VARIANCES. See the following for details on Minitab's compuations: http://support.minitab.com/en-us/minitab-express/1/help-and-how-to/basic-statistics/inference/how-to/two-samples/2-sample-t/methods-and-formulas/methods-and-formulas/ #s <- sqrt((var1/n1)+(var2/n2)) #Assuming euqal variances differenceInMeans <- mean(lowIncome)-mean(highIncome) #Difference between sample means criticalT <- qt(1-0.05/2, df=pooledDF, lower.tail = F) #Get critical t-statistic for two-tailed test CI95TwoTail <- paste("(",round(differenceInMeans+criticalT*s,4),", ", round(differenceInMeans-criticalT*s,4),")", sep=""); names(CI95) <- "CI_95%" #Compute confidence interval (Page 456) cat("The computed t-statistic = ", tStat, " with probability =", pValue, " and a two-sided CI of: ", CI95TwoTail) #Print results per page 457. NOTE: values differ slightly because the dataset has changed since slides were created or minor differences between this and Minitab's internal operations. #*************************************************************************** #Example 3: World Bank data from 2000 ##.....Automated: Two-tailed (ex. 3, p. 458-459)---- t.test(illitfem, illitmal, alternative = "two.sided", conf.level = 0.95) #NOTE: Values do not match lecture notes because dataset has changed. (page 458-459). Notice that had we chosen 99% confidence and alphs of 0.01, we would fail to reject the null hypothesis that the means of these two distributions are different. #*************************************************************************** #*************************************************************************** #Two-proportion z-tests (p. 469)---- #Video: Two Sample Tests: Binomials #.....Automated: Two-tailed (ex. 1, p. 468-472)---- #Example 1: Using Kelly's kmart handwashing data (Page 468-472) #An automated example first day1 <- 10 #Number of success on day 1 day2 <- 28 #Number of successon day 2 n1 <- n2 <- 40 #Number of trials on day 1 and on day 2 prop.test(x=c(day1,day2), n=c(n1,n2), correct=FALSE) #Compute relevant statistics for test of equality for two proportions (Page 470 and 472) #.....Manual: Two-tailed (ex. 1, p. 470-472)---- pHatDay1 <- day1/n1 #Estimated proportion for day 1 (Page 468) pHatDay2 <- day2/n2 #As above criticalZTwoTail <- qnorm((1-0.95)/2, lower.tail = F) #Obtain a critical z-score associated with a two-tailed test mySD <- sqrt( ((pHatDay1*(1-pHatDay1))/n1) + ((pHatDay2*(1-pHatDay2))/n2) ) #Compute the standard deviation for the sum of two independent binomials (Page 470) difference <- pHatDay1-pHatDay2 #Compute difference in estimated proportions (Page 470) lowerBound <- difference-criticalZTwoTail*mySD #Compute lower CI boundary (Page 470) upperBound <- difference+ criticalZTwoTail*mySD #Compute upper CI boundary (Page 470) pHat <- (day1+day2)/(n1+n2) #Compute joint proportion of successes (Page 471) Sp <- sqrt( pHat*(1-pHat)*((1/n1)+(1/n2))) #Pooled estimate of standard deviation for binomial (Page 471) zStat <- (difference-0)/Sp #Compute a z-statistic based on the estimated difference in proportions and the pooled standard deviation (Page 472) pValue <- pnorm(abs(zStat), lower.tail = F) #Obtain a pValue for the z-statistics from a standard normal distribution. If alpha is 0.05, we reject the null hypotheses that these two proportions come from the same distribution (Page 472) CI95TwoTail <- paste("(",round(lowerBound,4),", ",round(upperBound,4),")", sep="") cat("The computed z-statistic = ", zStat, " with probability =", pValue, " and a two-sided CI of: ", CI95TwoTail) #Print out results. (Page 469 and 472) #*************************************************************************** #.....Automated: Two-tailed (ex. 2. p. 473)---- #Example 2: Testing the equality of two proportions using the treated tick data (Page 473) #An automated example first... waterSuccess <- 3 #Number of success on day 1 fungusSuccess <- 40 #Number of successon day 2 n1 <- 92 #Number of trials on day 1 and on day 2 n2 <- 76 prop.test(x=c(waterSuccess,fungusSuccess), n=c(n1,n2), correct=FALSE) #Compute relevant statistics for test of equality for two proportions (Page 473) #.....Manual: Two-tailed (ex. 2, p. 473)---- pHatWater <- waterSuccess/n1 #Estimated proportion for day 1 (Page 468) pHatFungus <- fungusSuccess/n2 #As above criticalZTwoTail <- qnorm((1-0.95)/2, lower.tail = F) #Obtain a critical z-score associated with a two-tailed test mySD <- sqrt( ((pHatWater*(1-pHatWater))/n1) + ((pHatFungus*(1-pHatFungus))/n2) ) #Compute the standard deviation for the sum of two independent binomials (Page 470) difference <- pHatWater-pHatFungus #Compute difference in estimated proportions (Page 470) lowerBound <- difference-criticalZTwoTail*mySD #Compute lower CI boundary (Page 470) upperBound <- difference+ criticalZTwoTail*mySD #Compute upper CI boundary (Page 470) pHat <- (waterSuccess+fungusSuccess)/(n1+n2) #Compute joint proportion of successes (Page 471) Sp <- sqrt( pHat*(1-pHat)*((1/n1)+(1/n2))) #Pooled estimate of standard deviation for binomial (Page 471) zStat <- difference/Sp #Compute a z-statistic based on the estimated difference in proportions and the pooled standard deviation (Page 472) pValue <- pnorm(abs(zStat), lower.tail = F) #Obtain a pValue for the z-statistics from a standard normal distribution. If alpha is 0.05, we reject the null hypotheses that these two proportions come from the same distribution (Page 472) CI95TwoTail <- paste("(",round(lowerBound,4),", ",round(upperBound,4),")", sep="") cat("The computed z-statistic = ", zStat, " with probability =", pValue, " and a two-sided CI of: ", CI95TwoTail) #Print out results. (Page 473) #*************************************************************************** #*************************************************************************** #Pooled sample t-tests (p. 461)---- #Video: Two Sample Tests: Pooled Tests #Compute t-tests as above, but control for whether or not the variances of the population distributions are considered equal. #.....Automated: Two-sided (ex. 1, p. 458, 460-463)---- #Example 1: World Bank illiteracy data. These examples do not match lecture notes since the dataset has changed. Note also that the differences between the following two examples are minimal, given that the sample sizes are >50 (see page 462). t.test(illitfem, illitmal, alternative = "two.sided") #Page 460-463. By default this function treats the samples as coming from distributions with UNEQUAL VARIANCES. You can add "var.equal=F" if you want to be explicit. t.test(illitfem, illitmal, alternative = "two.sided", var.equal = T) #Page 460-463. As above, but treating samples as coming from distributions with EQUAL VARIANCES. Equal variances allows more flexibility, or greater degrees of freedom since fewer things are uncertain and need to be estimated. #.....Automated: Two-sided (ex. 2, p. 464-465)---- #Example 2, using the roundup data. This example draws on subsets of data created above logRadish0 <- log(radish0) #Log transform the dry weights to alter the distribution shape and improve ratio of standard deviations (page 463-464) logRadish100 <- log(radish100) #As above sd(radish0)/sd(radish100) #Ratio of original data is ~18 (page 463) sd(logRadish0)/sd(logRadish100) #Ratio of log-transformed data is close to 2 t.test(logRadish0, logRadish100, alternative="greater", var.equal=F) #One-sided two-sample t-test. Here we assume the variances of the two distirbutions are not equal and therefore do not pool the data when estimating sigma. Notice the p-value and compare with the following. (Page 464) t.test(logRadish0, logRadish100, alternative="greater", var.equal=T) #As above, but here we pool the data in estimating sigma. Notice the smaller p-value (page 465). #*************************************************************************** #*************************************************************************** #Paired sample t-test (p. 467)---- #An example using the World Bank illiteracy data #Here the differences between countries were overwhelming the differences between genders. So, we compute a difference between genders before computing a one-sample t-test genderDifference <- na.omit(WB$illitfem-WB$illitmal) #Subtract male illieracy rates from female on a country-by-country basis. Then drop missing values (Page 466) t.test(genderDifference, mu=0, alternative="two.sided", conf.level = 0.95) #Values differ from those reported in the notes since the dataset has changed. This shows that female illiteracy rates are significantly higher than male rates (Page 466-467) #*************************************************************************** #*************************************************************************** ##Chi-squared test (p. 489)---- #.....Automated (ex. 2, p. 474-490)---- #Automated example using the treated tick data (Page 474-490) rows <- 5 cols <- 2 observedData <- as.table(matrix(c(49,37,21,6,2,1,13,29,44,48), nrow=rows, ncol=cols, byrow = F, dimnames=list(c("Control", "4*10^6 spores/ml", "4*10^7 spores/ml", "4*10^8 spores/ml", "4*10^9 spores/ml"), c("Lived", "Dead")))) #Create a contingency table for the treated tick data. Note that this object must be of class "table" with named dimensions (Page 474). chisqResults <- chisq.test(observedData) #Perform a chi-squared test and store the restuls chisqResults #General test results (Page 484) chisqResults$expected #Access elements of results object, such as the expect counts(Page 481) chisqStat <- chisqResults$statistic #Get actual chi-squared test statistic (Page 484) myDF <- (rows-1)*(cols-1) #Compute degrees of freedom for hypothesis testing (Page 487) pValue <- pchisq(q=chisqStat, df=myDF, lower.tail = F) #Compute a p-value associated with obtaining our chi-squared test statistics (Page 488) cat("This test produced a chi-squared test statistics of: ", chisqStat, " with a probability of: ", pValue) #Return some results. The pValue speaks to the likelihood of seeing the observed data if there is in fact no relationship between fungus and tick mortality (Page 488-489). #*************************************************************************** #.....Manual (ex. 2, p. 474-490)---- #Manual example using the treated tick data (Page 474-490) observedData <- matrix(c(49,37,21,6,2,1,13,29,44,48), nrow=5, ncol=2, byrow = F, dimnames=list(c("Control", "4*10^6 spores/ml", "4*10^7 spores/ml", "4*10^8 spores/ml", "4*10^9 spores/ml"), c("Lived", "Dead"))) #Create a contingency table for the treated tick data. Here we make use of a matrix instead of a 'table' (Page 475). #Create a matrix of expected counts... expectedData <- matrix(NA, nrow=5, ncol=2) #Initialize an empty matrix for(i in 1:nrow(observedData)){ #For each row... for (j in 1:ncol(observedData)){ #For each column... expectedData[i,j] <- sum(observedData[i,])*sum(observedData[,j])/sum(observedData) #Compute the expect count per page 483 } } #Create a matrix of chi-squared statistics chisqData <- matrix(NA, nrow=5, ncol=2) #Initialize and empty matrix for(i in 1:nrow(expectedData)){ #For each row... for (j in 1:ncol(expectedData)){ #For each column... chisqData[i,j] <- ((observedData[i,j]-expectedData[i,j])^2)/expectedData[i,j] #Compute the chi-squared statistic using the observed and expected data } } cat("Chi-squared statistic = ", sum(chisqData)) #Obtain the chi-squared test statistic by summing across matrix entries (Page 484) #*************************************************************************** #*************************************************************************** ## END OF SCRIPT ##