#*************************************************************************** #*************************************************************************** ## ## Intro STATS: ## MAKING AND MANAGING DISTRIBUTIONS---- ## Henry Glick (JDRS) (Last updated Oct. 10, 2017) ## http://www.reuningscherer.net/stat10x/r/ ## #*************************************************************************** #*************************************************************************** ##Notes on handling distributions in R---- #Every distribution that R handles has four functions. There is a root name (e.g., norm, binom, etc.) prefixed by a letter (e.g., p, q, etc.). The letter dictates the function. #p for "probability", the cumulative distribution function (c. d. f.); provides direct lookup of intervals #q for "quantile", the inverse c. d. f.; provides inverse look-up. The inverse cumulative distribution function (ICDF) gives the value of the variable that is associated with a specific cumulative probability. #d for "density", the probability density function (p. f. or p. d. f.); provides direct look-up of points #r for "random", used for creating a random variable (data!) having the specified distribution #*************************************************************************** ## Binomial distributions (page 318+)---- dbinom(x=10, size=15, prob=0.83) #Example from page 320. This answers the question: What's the probability of exactly 10 people wearing seatbelts? This function can also take a vector for argument x. #Compare with manual method on page 320: nChooseK <- function(n, k){ factorial(n)/(factorial(k)*factorial(n-k)) } nChooseK(15,10)*0.83^10*0.17^5 #See also: (result <- pbinom(q=10,size=15,prob=0.83)) #Cumulative probability function; answers the question: What's the probability of 10 or fewer people wearing seatbelts? #Compare pbinom() with: dbinom(0, size=15, prob=0.83) + dbinom(1, size=15, prob=0.83) + dbinom(2, size=15, prob=0.83) + dbinom(3, size=15, prob=0.83) + dbinom(4, size=15, prob=0.83) + dbinom(5, size=15, prob=0.83) + dbinom(6, size=15, prob=0.83) + dbinom(7, size=15, prob=0.83) + dbinom(8, size=15, prob=0.83) + dbinom(9, size=15, prob=0.83) + dbinom(10, size=15, prob=0.83) #Note also, you can control the tail of the cumulative probability distribution that is return with the optional argument, lower = TRUE/FALSE. pbinom(q=12,size=15,prob=0.83, lower=FALSE) #Page 320. This answers the question: What's the probability of 13 or more people wearing seatbelts? Notice that q = 12, since we want 13 OR more. qbinom(p=result,size=15,prob=0.83) #This returns the quantile (percentile) of a binomial distribution with size = n and stated probability. Note the relationship between pbinom() and qbinom(). This function will return the random variable value based on the probability, where as pbinom() returns the probability based on the random variable value. rbinom(n= 15, size=20, p=0.83) #Generate data with a binomial distribution. We use this function to generate n independent random variables, each of which is the result of some number of trials (size) with the stated probability. rbinom(n= 15, size=20, p=0.83) means: perform a test 20 times where the probability of success is 0.83. Return the number of successes across the 20 trials. Repeat this process 15 times. rbinom(n= 15, size=1, p=0.5) #If the size = 1, this returns bernoulli trials. Using 0.5 as the probability is like simulating coin tosses. #*************************************************************************** ## Poisson distributions (page 342+)---- #Remember, the mean and variance of a Poisson distribution are the same, so only one is specified (here, lambda). dpois(x=5, lambda=7) #Page 343. Density function, Lamda = mean per unit time/space. What is the probability of exactly 5 deaths occuring in the next hour. #Page 343. What is the probability of four or more deaths in the next hour. Comapre the next two lines. 1-(dpois(0, lambda =7) + dpois(1, lambda=7) + dpois(2, lambda=7) + dpois(3, lambda=7)) ppois(q=3, lambda=7, lower=F) #Cumulative distribution function with upper tail returned dpois(x=15, lambda = 7*2) #Page 343. Probability of exactly 15 deaths in the next TWO hours. Note the extension of the unit time from 7 to 14. ppois(q=1175, lambda=1176, lower=F) #Page 344-346. What's the probability of 1176 OR more overdose deaths in one week? myProb <- ppois(q=3, lambda=7) qpois(p=myProb, lambda = 7) #Find the nth random variable value associated with a given probability. This returns the quantile (percentile) of a Poisson distribution with size = n and stated probability. Note the relationship between ppois() and qpois(). This function will return the random variable value based on the probability, where as ppois() returns the probability based on the random variable value. #Make some fake data with a Poisson distribution and mean/variance of 1176. fakeData <- rpois(100000, lambda = 1176) hist(fakeData, breaks=100); abline(v=mean(temp), col="red") mean(fakeData); var(fakeData) #As n approaches large, these values will more closely match the true stipulated lambda (compare with 10 million) #*************************************************************************** ## Exponential distributions (page(369+)---- pexp(6, rate=1/(3/1), lower.tail = F) #Page 368. What's the probability that you will have to wait more than 6 minutes at the checkout, given a mean rate of 3 minutes per one checkout? The rate argument is specified as 1/mean. qexp(0.75, rate = 1/3) #What's the third quartile of checkout waiting times given a mean rate of 3 minutes per one checkout? Answer is in minutes. pexp(1, rate=1/(7300/18000)) #Page 371. What's the probability that at least one vaccum tube fails in the first day of operation? dexp(1, rate = 1/(7300/18000)) #What's the probability that EXACTLY one vaccum tube will fail the first day? fakeData <- rexp(n=1000000, rate = 1/(3/1)) #Create fake supermarket data with an exponential distribution, having n observations and a super market checkout rate of three minutes per one person. hist(fakeData, breaks=100); abline(v=mean(fakeData), col="red") #Look at the data mean(fakeData); var(fakeData); sqrt(var(fakeData)) #The mean of exponential distribution is always mu and variance mu^2 #*************************************************************************** ## t distributions (page 395+)---- dt(x=0, df=592) #Returns the density function value for a particular point (not a range of values). This answers the question: What is the probability associated with value x? (result <- qt(1-0.05/2, df=592)) #Calculate critical t-value associated with the 95th percentile of the random variable. That is, the t-value at which the probability of the random variable is less than or equal to the stated probability. Here we use 0.975, or 1-0.05/2 (to split the 0.05 between two tails) to produce a critical value for a 95% confidence interval with 593 observations and n-1 degrees of freedom. pt(result,592) #Return the probability associated with a particular t random variable and n-1 degrees of freedom. Note how qt() and pt() are related. #Create fake data that has a t distribution with n-1 degrees of freedom n <- 1000 #Adjust this value (e.g., try 1000, 10000, 100000, 1000000) and compare the results of the following lines tData <- rt(n=n, df=n-1) #Create fake t distributed data hist(tData, breaks=100, prob=T) #histogram of t distribution lines(density(tData), lwd=2, lty=6) #Add a nnparametric density curve #*************************************************************************** ## END OF SCRIPT ##