--- title: 'Statistical Thinking using Randomisation and Simulation' subtitle: "Statistical distributions" author: Di Cook (dicook@monash.edu, @visnut) date: "W3.C2" output: xaringan::moon_reader: css: ["default", "myremark.css"] self_contained: false nature: highlightStyle: github highlightLines: true countIncrementalSlides: false --- ```{r setup, include = FALSE} library(knitr) opts_chunk$set( message = FALSE, warning=FALSE, error=FALSE, cache = FALSE, echo=FALSE, fig.height = 2, fig.width = 5, collapse = TRUE, comment = "#>" ) options(digits=2) library(dplyr) library(tidyr) library(ggplot2) library(readr) library(gridExtra) ``` # Overview of this class - Random variables - Central limit theorem - Estimation - Quantiles - Goodness of fit --- # Random variables vs random samples - Conceptually we think about a **random variable** $(X)$ having a distribution $$X \sim N(\mu, \sigma)$$ - Once we collect data, or use simulation we will have a realisation from the distribution, a random sample, observed values: ```{r} x <- rnorm(5, 0, 1) x ``` ```{r fig.align='center', fig.width=3, fig.height=2, fig.show='hold'} df <- data.frame(x=seq(-4,4, 0.008), d=dnorm(seq(-4,4, 0.008))) df2 <- data.frame(x=x, y=0) ggplot(df, aes(x=x, y=d)) + geom_line() + xlab("") + ylab("") + geom_hline(yintercept=0, colour="grey70") + geom_point(data=df2, aes(x=x, y=0), colour="red") ``` ```{r eval=FALSE} x <- rnorm(50, 0, 1) df2 <- data.frame(x=x, y=0) ggplot(df, aes(x=x, y=d)) + geom_line() + xlab("") + ylab("") + geom_hline(yintercept=0, colour="grey70") + geom_point(data=df2, aes(x=x, y=0), colour="red") ``` --- # Central limit theorem - Why the normal model is so fundamental - Regardless what distribution $X$ has, the mean of a sample will have a normal distribution, if the sample size is large: "Let $\{X_1, ..., X_n\}$ be a random sample of size $n$ — that is, a sequence of independent and identically distributed random variables drawn from a distribution mean given by $\mu$ and finite variance given by $\sigma^2$. The sample average is defined $\bar{X} = \sum_{i=1}^{n} X_i/n$, then as $n$ gets large the distribution of $\bar{X}$ approximates $N(\mu, \sigma/\sqrt{n})$." --- # Example: Uniform parent ```{r fig.align='center', fig.width=8, fig.height=5} rs <- runif(10000000) x2 <- apply(matrix(rs, ncol=2), 1, mean) x20 <- apply(matrix(rs, ncol=20), 1, mean) x50 <- apply(matrix(rs, ncol=50), 1, mean) x100 <- apply(matrix(rs, ncol=100), 1, mean) df <- data.frame(pop=rs[1:1000], x2=x2[1:1000], x20=x20[1:1000], x50=x50[1:1000], x100=x100[1:1000]) p1 <- ggplot(data.frame(rs), aes(x=rs)) + geom_histogram(breaks=seq(0,1,0.1)) + ggtitle("POPULATION") + xlab("") p2 <- ggplot(df, aes(x=x2)) + geom_histogram(binwidth=0.1) + ggtitle("n=2") + xlab("") + xlim(c(0,1)) p3 <- ggplot(df, aes(x=x20)) + geom_histogram(binwidth=0.05) + ggtitle("n=20") + xlab("") + xlim(c(0,1)) p4 <- ggplot(df, aes(x=x100)) + geom_histogram(binwidth=0.02) + ggtitle("n=100") + xlab("") + xlim(c(0,1)) grid.arrange(p1, p2, p3, p4, ncol=2) ``` --- # Example: Weibull parent ```{r fig.align='center', fig.width=8, fig.height=5} rs <- rweibull(1000000, 1.5) x2 <- apply(matrix(rs, ncol=2), 1, mean) x20 <- apply(matrix(rs, ncol=20), 1, mean) x50 <- apply(matrix(rs, ncol=50), 1, mean) x100 <- apply(matrix(rs, ncol=100), 1, mean) df <- data.frame(pop=rs[1:1000], x2=x2[1:1000], x20=x20[1:1000], x50=x50[1:1000], x100=x100[1:1000]) p1 <- ggplot(data.frame(rs), aes(x=rs)) + geom_histogram() + ggtitle("POPULATION") + xlab("") + xlim(c(0,6)) p2 <- ggplot(df, aes(x=x2)) + geom_histogram(binwidth=0.2) + ggtitle("n=2") + xlab("") + xlim(c(0,6)) p3 <- ggplot(df, aes(x=x20)) + geom_histogram(binwidth=0.1) + ggtitle("n=20") + xlab("") + xlim(c(0,6)) p4 <- ggplot(df, aes(x=x100)) + geom_histogram(binwidth=0.05) + ggtitle("n=100") + xlab("") + xlim(c(0,6)) grid.arrange(p1, p2, p3, p4, ncol=2) ``` --- # Estimation - Estimate parameters of a distribution from the sample data - Common approach is maximum likelihood estimation - Requires assuming we know the basic functional form --- # Maximum likelihood estimate (MLE) - Estimate the unknown parameter $\theta$ using the value that maximises the probability (i.e. likelihood) of getting the observed sample - Likelihood function $$\begin{aligned} L(\theta) &=P(X_1=x_1,X_2=x_2, ... ,X_n=x_n~|~\theta) \\ &=f(x_1|\theta)f(x_2|\theta)\cdots f(x_n|\theta) \\ &=\prod_{i=1}^n f(x_i;\theta) \end{aligned}$$ - This is now a function of $\theta$. - Use function maximisation to solve. --- # Example - Mean of normal distribution, assume variance is 1 - MLE estimate of the population mean for a normal model is the sample mean - Run this numerically - Suppose we have a sample of two: $x_1=1.0, x_2=3.0$ - Likelihood $$ \begin{aligned} L(\mu) &= \frac{1}{\sqrt{2\pi}}e^{-\frac{(1.0-\mu)^2}{2}}\times \frac{1}{\sqrt{2\pi}}e^{-\frac{(3.0-\mu)^2}{2}}\\ &= \frac{1}{2\pi}e^{-\frac{(1-\mu)^2+(3-\mu)^2}{2}} \end{aligned} $$ --- # Plot it ```{r fig.align='center', fig.width=5, fig.height=3} nmle <- function(mu) { f <- 1/(2*pi) * exp(-((1-mu)^2+(3-mu)^2)/2) return(f) } df <- data.frame(x=seq(1, 3, 0.1), f=nmle(seq(1, 3, 0.1))) ggplot(df, aes(x=x, y=f)) + geom_line() + xlab(expression(mu)) + ylab("L") ``` - The maximum is at 2.0. This is the sample mean, which we can prove algebraically is the MLE. --- # Estimate mean and variance Sample ```{r} set.seed(1031) x <- rnorm(22, 3, 5) x ``` We know it comes from a normal distribution. What are the best guesses for the $\mu, \sigma$? --- Compute the likelihood for a range of values of both parameters. ```{r fig.align='center', fig.width=5, fig.height=5} nmle <- function(x, mu, sigma) { f <- prod(dnorm(x, mu, sigma)) return(f) } mu <- seq(1, 5, 0.1) sig <- seq(2, 7, 0.1) g <- expand.grid(x=mu, y=sig) g$f <- 0 for (i in 1:nrow(g)) { g$f[i] <- nmle(x, g$x[i], g$y[i]) } ggplot(g, aes(x=x, y=y, fill=f)) + geom_tile() + xlab(expression(mu)) + ylab(expression(sigma)) + theme_bw() + scale_fill_continuous("L") + theme(aspect.ratio=1) ``` --- # Quantiles - `quantiles` are cutpoints dividing the range of a probability distribution into contiguous intervals with equal probabilities - 2-quantile is the median (divides the population into two equal halves) - 4-quantile are quartiles, $Q_1, Q_2, Q_3$, dividing the population into four equal chunks - quantiles are values of the random variable $X$ - useful for comparing distributions --- # Example: - 12-quantiles for a $N(0,1)$ ```{r echo=TRUE} qnorm(seq(1/12,11/12,1/12)) ``` - 23-quantiles from a $Gamma(2,1)$ ```{r echo=TRUE} qgamma(seq(1/23,22/23,1/23), 2) ``` --- # Percentiles - indicate the value of $X$ below which a given percentage of observations fall, e.g. 20th percentile is the value that has 20% of values below it - 17th percentile from $N(0,1)$ ```{r echo=TRUE} qnorm(0.17) ``` - 78th percentile from $Gamma(2,1)$ ```{r echo=TRUE} qgamma(0.78, 2) ``` --- # Goodness of fit - Quantile-quantile plot (QQplot) plots theoretical vs sample quantiles - Lets check the distribution of PISA math scores ```{r fig.align='center', fig.width=8, fig.height=4} st <- readRDS("../data/student_sub.rds") aus <- st %>% filter(CNT == "AUS") n <- nrow(aus) aus_math <- data.frame(PV1MATH = sort(scale(aus$PV1MATH)), q=qnorm(c(1 - 0.5^(1/n), (2:(n-1) - 0.3175) / (n + 0.365),0.5^(1/n)))) p1 <- ggplot(aus_math, aes(x=PV1MATH)) + geom_histogram(binwidth=0.5) + xlab("Standardised math score") p2 <- ggplot(aus_math, aes(x=PV1MATH, y=q)) + geom_abline(intercept=0, slope=1) + geom_point() + xlim(c(-5, 5)) + ylim(c(-5, 5)) + xlab("Sample quantiles") + ylab("Theoretical quantile") + theme(aspect.ratio=1) grid.arrange(p1, p2, ncol=2) ``` --- # QQ-Plot computation - Sort and standardise the sample values from low to high - Compute theoretical quantiles (formula below), $n=$ sample size - Plot the theoretical vs sample quantiles $$\begin{aligned} & 1 - 0.5^{(1/n)} ~~~ i=1 \\ ~~~~~~~~~~~~~& \frac{i - 0.3175}{n + 0.365} ~~~ i=2, ..., n-1 \\ & 0.5^{(1/n)} ~~~~~~ ~~~ i=n \end{aligned}$$ --- # Reading QQ-plots - The points should lie along the $X=Y$ line, for the sample to be consistent with the distribution. - How close is good enough? - It depends on the sample size. - Simulate some samples of the same size from the target distribution, and make QQ-plots of these, to compare with the actual data --- ```{r fig.align='center', fig.width=8, fig.height=5} aus_math <- data.frame(PV1MATH = sort(scale(aus$PV1MATH)), q=qnorm(c(1 - 0.5^(1/n), (2:(n-1) - 0.3175) / (n + 0.365),0.5^(1/n))), s1=sort(rnorm(n)), s2=sort(rnorm(n)), s3=sort(rnorm(n)), s4=sort(rnorm(n)), s5=sort(rnorm(n))) p1 <- ggplot(aus_math, aes(x=PV1MATH, y=q)) + geom_abline(intercept=0, slope=1) + geom_point() + xlim(c(-5, 5)) + ylim(c(-5, 5)) + xlab("Sample quantiles") + ylab("Theoretical quantile") + theme(aspect.ratio=1) + ggtitle("(a)") p2 <- ggplot(aus_math, aes(x=s1, y=q)) + geom_abline(intercept=0, slope=1) + geom_point() + xlim(c(-5, 5)) + ylim(c(-5, 5)) + xlab("Sample quantiles") + ylab("Theoretical quantile") + theme(aspect.ratio=1) + ggtitle("(b)") p3 <- ggplot(aus_math, aes(x=s2, y=q)) + geom_abline(intercept=0, slope=1) + geom_point() + xlim(c(-5, 5)) + ylim(c(-5, 5)) + xlab("Sample quantiles") + ylab("Theoretical quantile") + theme(aspect.ratio=1) + ggtitle("(c)") p4 <- ggplot(aus_math, aes(x=s3, y=q)) + geom_abline(intercept=0, slope=1) + geom_point() + xlim(c(-5, 5)) + ylim(c(-5, 5)) + xlab("Sample quantiles") + ylab("Theoretical quantile") + theme(aspect.ratio=1) + ggtitle("(d)") p5 <- ggplot(aus_math, aes(x=s4, y=q)) + geom_abline(intercept=0, slope=1) + geom_point() + xlim(c(-5, 5)) + ylim(c(-5, 5)) + xlab("Sample quantiles") + ylab("Theoretical quantile") + theme(aspect.ratio=1) + ggtitle("(e)") p6 <- ggplot(aus_math, aes(x=s5, y=q)) + geom_abline(intercept=0, slope=1) + geom_point() + xlim(c(-5, 5)) + ylim(c(-5, 5)) + xlab("Sample quantiles") + ylab("Theoretical quantile") + theme(aspect.ratio=1) + ggtitle("(f)") grid.arrange(p1, p2, p3, p4, p5, p6, ncol=3) ``` --- # How different is exponential from Pareto? Exponential: $$f(x~|~\lambda) = e^{-\lambda x} ~~ x\geq 0$$ Pareto: $$ f(x~|~\alpha, \lambda) = \frac{\alpha\lambda^\alpha}{(\lambda+x)^{\alpha+1}} ~~~x>0, \alpha>0, \lambda>0 $$ ```{r fig.align='center', fig.width=8, fig.height=3, fig.show='hold'} rpareto <- function(n, c){ if (c<=0) stop("c must be positive") rp <- runif(n)^(-1/c) rp } df <- data.frame(re=sort(scale(rexp(1000))), rp=sort(scale(rpareto(1000, 1)))) p1 <- ggplot(df, aes(x=re)) + geom_histogram() + ggtitle("Exp") p2 <- ggplot(df, aes(x=rp)) + geom_histogram() + ggtitle("Pareto") p3 <- ggplot(df, aes(x=re, y=rp)) + geom_abline(intercept=0, slope=1) + geom_point() + xlab("exp") + ylab("Pareto") + ggtitle("QQ-plot") + theme(aspect.ratio=1) grid.arrange(p1, p2, p3, ncol=3) ``` --- # How different can exponentials be? One of these is an exponential vs pareto, the rest are samples from exponentials. Can you pick the pareto vs exponential plot? ```{r fig.align='center', fig.width=8, fig.height=5, fig.show='hold'} df <- data.frame(re=sort(scale(rexp(100))), rp=sort(scale(rpareto(100, 1))), re2=sort(scale(rexp(100))), re3=sort(scale(rexp(100))), re4=sort(scale(rexp(100))), re5=sort(scale(rexp(100)))) p1 <- ggplot(df, aes(x=re, y=rp)) + geom_abline(intercept=0, slope=1) + geom_point() + xlab("") + ylab("") + theme(aspect.ratio=1) + ggtitle("(a)") p2 <- ggplot(df, aes(x=re, y=re2)) + geom_abline(intercept=0, slope=1) + geom_point() + xlab("") + ylab("") + theme(aspect.ratio=1) + ggtitle("(b)") p3 <- ggplot(df, aes(x=re, y=re3)) + geom_abline(intercept=0, slope=1) + geom_point() + xlab("") + ylab("") + theme(aspect.ratio=1) + ggtitle("(c)") p4 <- ggplot(df, aes(x=re, y=re3)) + geom_abline(intercept=0, slope=1) + geom_point() + xlab("") + ylab("") + theme(aspect.ratio=1) + ggtitle("(d)") p5 <- ggplot(df, aes(x=re, y=re4)) + geom_abline(intercept=0, slope=1) + geom_point() + xlab("") + ylab("") + theme(aspect.ratio=1) + ggtitle("(e)") p6 <- ggplot(df, aes(x=re, y=re5)) + geom_abline(intercept=0, slope=1) + geom_point() + xlab("") + ylab("") + theme(aspect.ratio=1) + ggtitle("(f)") grid.arrange(p1, p2, p3, p4, p5, p6, ncol=3) ``` --- # Resources - [wikipedia](https://en.wikipedia.org/wiki/Quantile) - [PSU 414](https://onlinecourses.science.psu.edu/stat414/node/191) --- class: inverse middle # Share and share alike Creative Commons License
This work is licensed under a Creative Commons Attribution 4.0 International License.