--- title: 'Statistical Thinking using Randomisation and Simulation' subtitle: "Statistical distributions" author: Di Cook (dicook@monash.edu, @visnut) date: "W3.C1" output: xaringan::moon_reader: css: ["default", "myremark.css"] self_contained: false nature: highlightStyle: github highlightLines: true countIncrementalSlides: false --- ```{r setup, include = FALSE} library(knitr) opts_chunk$set( message = FALSE, warning = FALSE, cache = FALSE, echo=FALSE, fig.height = 2, fig.width = 5, collapse = TRUE, comment = "#>" ) options(digits=2) library(dplyr) library(tidyr) library(ggplot2) library(readr) library(gridExtra) ``` # Overview of this class - Random numbers - Mapping random numbers to events for simulation - Statistical distributions - Density functions --- # Random numbers - True random number generators: [Radioactive decay](https://www.fourmilab.ch/hotbits/), [electromagnetic field of a vacuum](https://qrng.anu.edu.au) - Computers only technically provide pseudo-random numbers, using deterministic process, e.g linear congruential, for large $a, b, m$ $$X_{n+1} = (aX_n + b) ~~mod ~~m$$ --- # RANDU - a bad PRNG - Used in the 60s and onwards $$X_{n+1} = 65539 X_n ~~mod ~~2^{31}$$ ```{r fig.width=3.5, fig.height=3.5, fig.align='center'} randu <- read.csv("randu.csv") ggplot(randu, aes(x=V1, y=V2)) + geom_point() + theme(aspect.ratio=1) ``` --- # Mersenne Twister - algorithm is a twisted generalised feedback shift register (TGFSR) - based on a Marsenne prime, $2^m-1$ - most commonly used today - each integer will occur the same number of times in a period --- # Using random numbers to estimate things - Suppose we want to estimate the proportion of the class in 2420 vs 5242 - Now we know that there are 36 5242 and 152 2420 students enrolled in the class. BUT SUPPOSE WE DON'T KNOW THIS! - We see a random sample of 20 students, and have to guess what the proportion for the whole class is. --- # Using random numbers - Random number tables (old fashioned) deliver single digits 0, 1, ..., 9 - When using these you need to ensure that you map these digits or combinations of the digits to match the probabilities of events - For example, use random numbers to sample students from class + There are 188 students in the class + Each student, or possible group of students, needs to have an equal chance of being selected + Need to use three sequential digits + BUT there are 1000 three digit numbers, so either we will throw away 822 of them, or we could map a person to multiple numbers (5) and throw away only 60 + If any person is selected more than once, throw out repeats ```{r eval=FALSE} etc2420 <- read_delim("../../admin/ETC2420.txt", delim=" ", col_names=c("first", "last", "id")) etc2420 <- etc2420 %>% select(-id) %>% mutate(section="2420") etc5242 <- read_delim("../../admin/ETC5242.txt", delim=" ", col_names=c("first", "last", "id")) etc5242 <- etc5242 %>% select(-id) %>% mutate(section="5242") class_all <- bind_rows(etc2420, etc5242) save(class_all, file="classlist.rda") ``` ```{r} load("classlist.rda") nums <- NULL for (i in 0:9) for (j in 0:9) for (k in 0:9) nums <- c(nums, paste0(i, j, k)) class_all$number <- nums[1:188] ``` --- # Assign every class member a number ```{r} class_all ``` --- # Generate random digits ```{r} set.seed(64) rn <- sample(0:9, 1002, replace=TRUE) rn ``` --- # Group in threes ```{r} rn_gp <- matrix(rn, ncol=3, byrow=TRUE) head(rn_gp, 20) ``` --- # Throw away numbers > 187 ```{r} rn_gp_filter <- as.numeric(paste0(rn_gp[,1], rn_gp[,2], rn_gp[,3])) rn_gp_filter <- rn_gp_filter[rn_gp_filter<188] rn_gp_filter[1:20] ``` --- # Find class members ```{r} mysample <- class_all[rn_gp_filter[1:20]+1,] mysample ``` --- # Compute proportion Estimated proportion is: ```{r} nrow(mysample[mysample$section=="2420",])/20 ``` True proportion is 156/188=0.81. --- # Simpler approach ```{r} class_all <- class_all %>% select(-number) ``` ```{r echo=TRUE} class_all %>% sample_n(20) ``` --- # Statistical distributions - Uniform - Normal - Exponential - Binomial - Pareto - Weibull - Gamma - Lognormal --- # Random numbers = Uniform ```{r fig.align='center', fig.width=8, fig.height=3, fig.show='hold'} df <- data.frame(x=sample(0:9, 10000, replace = TRUE), y=rep(0:9, 1000)) p1 <- ggplot(df, aes(x=y)) + geom_bar() + scale_x_continuous("", breaks=0:10, labels=0:10) + ggtitle("Theory") p2 <- ggplot(df, aes(x=x)) + geom_bar() + scale_x_continuous("", breaks=0:10, labels=0:10) + ggtitle("Sample") grid.arrange(p1, p2, ncol=2) ``` - symmetric, unimodal, uniform - e.g. $U\{0, ..., 9\}$ - e.g. $P(X=x) = f(x) = 1/10, ~~ x \in \{0, ..., 9\}$ --- # Normal distribution - Gaussian, bell-shaped - symmetric, unimodal - $N(\mu, \sigma)$ $$f(x~|~\mu, \sigma) = \frac{1}{\sqrt{2\pi}\sigma} e^{-\frac{(x-\mu)^2}{2\sigma^2}}, ~~~ -\infty 0") ifelse(q<1, 0, 1-1/q^c) } qpareto <- function(p,c){ if (c<=0) stop("c must be positive > 0") if (any(p<0)|any(p>1)) # Symbol | denotes logical OR stop("p must be between 0 and 1") q <- (1-p)^(-1/c) q } rpareto <- function(n, c){ if (c<=0) stop("c must be positive") rp <- runif(n)^(-1/c) rp } ``` $$ f(x~|~\alpha, \lambda) = \frac{\alpha\lambda^\alpha}{(\lambda+x)^{\alpha+1}} ~~~x>0, \alpha>0, \lambda>0 $$ - Used to describe allocation of wealth, sizes of human settlement - Heavier tailed than exponential distribution ```{r fig.align='center', fig.width=8, fig.height=3, fig.show='hold'} df <- data.frame(x=seq(1, 10, 0.1), d=dpareto(seq(1, 10, 0.1), 1)) p1 <- ggplot(df, aes(x=x, y=d)) + geom_line() + xlab("") + ylab("") + scale_x_continuous(breaks=seq(1,10,1), labels=seq(1,10,1)) + ggtitle("Theory") df2 <- data.frame(r=rpareto(1000, 1)) p2 <- ggplot(df2, aes(x=r)) + geom_histogram(binwidth=1) + scale_x_continuous(breaks=seq(1,10,1), labels=seq(1,10,1), limits=c(0.5,10.5)) + xlab("") + ylab("") + ggtitle("Sample") grid.arrange(p1, p2, ncol=2) ``` --- # Weibull $$f(x~|~\lambda, k) = \frac{k}{\lambda}\left(\frac{x}{\lambda}\right)^{k-1} e^{(-x/\lambda)^k}, ~~~ x\geq 0$$ - used for particle size distribution, failure analysis, delivery time, extreme value theory - shape changes considerably with different $k$ ```{r fig.align='center', fig.width=8, fig.height=3, fig.show='hold'} df <- data.frame(x=seq(0.0025, 2.5, 0.0025), r=rweibull(1000, shape=1.5), d=dweibull(seq(0.0025, 2.5, 0.0025), shape=1.5)) p1 <- ggplot(df, aes(x=x, y=d)) + geom_line() + xlab("") + ylab("") + geom_hline(yintercept=0, colour="grey70") + ggtitle("Theory") + xlim(c(0,4.5)) p2 <- ggplot(df, aes(x=r, y=..density..)) + geom_histogram(binwidth=0.5) + xlab("") + ylab("") + geom_line(aes(x=x, y=d), color="red") + ggtitle("Sample") + xlim(c(0,4.5)) grid.arrange(p1, p2, ncol=2) ``` --- # Gamma $$f(x~|~\alpha, \beta) = \frac{\beta^\alpha}{\Gamma(\alpha)} x^{\alpha-1}e^{-x\beta}, ~~~ x\geq 0 ~~\alpha, \beta > 0 $$ - Generalisation of exponential distribution, and also $\chi^2$ - $\alpha$ changes shape substantially - used to model size of insurance claims, rainfall ```{r fig.align='center', fig.width=8, fig.height=3, fig.show='hold'} df <- data.frame(x=seq(0.005, 5, 0.005), r=rgamma(1000, shape=2), d=dgamma(seq(0.005, 5, 0.005), shape=2)) p1 <- ggplot(df, aes(x=x, y=d)) + geom_line() + xlab("") + ylab("") + geom_hline(yintercept=0, colour="grey70") + ggtitle("Theory") + xlim(c(0,4.5)) p2 <- ggplot(df, aes(x=r, y=..density..)) + geom_histogram(binwidth=0.5) + xlab("") + ylab("") + geom_line(aes(x=x, y=d), color="red") + ggtitle("Sample") + xlim(c(0,4.5)) grid.arrange(p1, p2, ncol=2) ``` --- # Lognormal - Also called Galton's distribution - Generated when $Y\sim N(\mu, \sigma)$, and study $X=exp(Y)$ - used for modeling length of comments posted in internet discussion forums, users' dwell time on the online articles, size of living tissue, highly communicable epidemics ```{r fig.align='center', fig.width=8, fig.height=3, fig.show='hold'} df <- data.frame(x=seq(0.005, 5, 0.005), r=rlnorm(1000), d=dlnorm(seq(0.005, 5, 0.005))) p1 <- ggplot(df, aes(x=x, y=d)) + geom_line() + xlab("") + ylab("") + geom_hline(yintercept=0, colour="grey70") + ggtitle("Theory") + xlim(c(0,4.5)) p2 <- ggplot(df, aes(x=r, y=..density..)) + geom_histogram(binwidth=0.5) + xlab("") + ylab("") + geom_line(aes(x=x, y=d), color="red") + ggtitle("Sample") + xlim(c(0,4.5)) grid.arrange(p1, p2, ncol=2) ``` --- # Sampling variability ```{r fig.align='center', fig.width=8, fig.height=5} df <- data.frame(x=sample(0:9, 50, replace = TRUE)) p1 <- ggplot(df, aes(x=x)) + geom_bar() + scale_x_continuous("", breaks=0:10, labels=0:10) + ggtitle("n=50") df <- data.frame(x=sample(0:9, 100, replace = TRUE)) p2 <- ggplot(df, aes(x=x)) + geom_bar() + scale_x_continuous("", breaks=0:10, labels=0:10) + ggtitle("n=100") df <- data.frame(x=sample(0:9, 1000, replace = TRUE)) p3 <- ggplot(df, aes(x=x)) + geom_bar() + scale_x_continuous("", breaks=0:10, labels=0:10) + ggtitle("n=1000") df <- data.frame(x=sample(0:9, 100000, replace = TRUE)) p4 <- ggplot(df, aes(x=x)) + geom_bar() + scale_x_continuous("", breaks=0:10, labels=0:10) + ggtitle("n=100000") grid.arrange(p1, p2, p3, p4, ncol=2) ``` --- # Probability calculations - Probability density functions are useful for computing expected quantities - E.g. Gamma(2,1), what is the probability of seeing $X>3.2$, or $1.53.2") + xlim(c(0,4.5)) p2 <- ggplot(df, aes(x=x, y=d)) + geom_line() + xlab("") + ylab("") + geom_hline(yintercept=0, colour="grey70") + geom_vline(xintercept=c(1.5, 2.5), colour="red") + ggtitle("1.5Creative Commons License
This work is licensed under a Creative Commons Attribution 4.0 International License.