---
title: 'Statistical Thinking using Randomisation and Simulation'
subtitle: "Statistical distributions"
author: Di Cook (dicook@monash.edu, @visnut)
date: "W3.C2"
output:
xaringan::moon_reader:
css: ["default", "myremark.css"]
self_contained: false
nature:
highlightStyle: github
highlightLines: true
countIncrementalSlides: false
---
```{r setup, include = FALSE}
library(knitr)
# Global chunk defaults: small figures, no code/message noise on the slides.
opts_chunk$set(
  message = FALSE,
  warning = FALSE,
  error = FALSE,
  cache = FALSE,
  echo = FALSE,
  fig.height = 2,
  fig.width = 5,
  collapse = TRUE,
  comment = "#>"
)
# Compact numeric printing for slide output.
options(digits = 2)
library(dplyr)
library(tidyr)
library(ggplot2)
library(readr)
library(gridExtra)
```
# Overview of this class
- Random variables
- Central limit theorem
- Estimation
- Quantiles
- Goodness of fit
---
# Random variables vs random samples
- Conceptually we think about a **random variable** $(X)$ having a distribution
$$X \sim N(\mu, \sigma)$$
- Once we collect data, or use simulation we will have a realisation from the distribution, a random sample, observed values:
```{r}
# A realisation of five values from the standard normal, N(0, 1).
x <- rnorm(n = 5, mean = 0, sd = 1)
x
```
```{r fig.align='center', fig.width=3, fig.height=2, fig.show='hold'}
# N(0, 1) density curve with the sampled values marked in red on the axis.
# `df` and `df2` are reused by the (unevaluated) n = 50 chunk below.
grid_x <- seq(-4, 4, 0.008)
df <- data.frame(x = grid_x, d = dnorm(grid_x))
df2 <- data.frame(x = x, y = 0)
ggplot(df, aes(x = x, y = d)) +
  geom_line() +
  geom_hline(yintercept = 0, colour = "grey70") +
  geom_point(data = df2, aes(x = x, y = 0), colour = "red") +
  xlab("") + ylab("")
```
```{r eval=FALSE}
# Same picture with a larger sample (n = 50). This chunk has eval = FALSE,
# so it is shown for reference but not run when the slides are built.
# It reuses `df` (the N(0, 1) density grid) from the previous chunk.
x <- rnorm(50, 0, 1)
df2 <- data.frame(x=x, y=0)
ggplot(df, aes(x=x, y=d)) + geom_line() + xlab("") + ylab("") +
geom_hline(yintercept=0, colour="grey70") +
geom_point(data=df2, aes(x=x, y=0), colour="red")
```
---
# Central limit theorem
- Why the normal model is so fundamental
- Regardless what distribution $X$ has, the mean of a sample will have a normal distribution, if the sample size is large:
"Let $\{X_1, ..., X_n\}$ be a random sample of size $n$ — that is, a sequence of independent and identically distributed random variables drawn from a distribution with mean $\mu$ and finite variance $\sigma^2$. Define the sample average as $\bar{X} = \sum_{i=1}^{n} X_i/n$. Then, as $n$ gets large, the distribution of $\bar{X}$ approximates $N(\mu, \sigma/\sqrt{n})$."
---
# Example: Uniform parent
```{r fig.align='center', fig.width=8, fig.height=5}
# CLT demo with a Uniform(0, 1) parent: histograms of sample means for
# increasing sample size n.
rs <- runif(10000000)
# rowMeans() is the vectorised (and far faster) equivalent of
# apply(matrix(...), 1, mean). The unused n = 50 means (`x50`) computed in
# the original were never plotted, so they are dropped.
x2 <- rowMeans(matrix(rs, ncol = 2))
x20 <- rowMeans(matrix(rs, ncol = 20))
x100 <- rowMeans(matrix(rs, ncol = 100))
# The first 1000 means of each size are plenty for the histograms.
df <- data.frame(x2 = x2[1:1000], x20 = x20[1:1000], x100 = x100[1:1000])
p1 <- ggplot(data.frame(rs), aes(x = rs)) +
  geom_histogram(breaks = seq(0, 1, 0.1)) +
  ggtitle("POPULATION") + xlab("")
p2 <- ggplot(df, aes(x = x2)) + geom_histogram(binwidth = 0.1) +
  ggtitle("n=2") + xlab("") + xlim(c(0, 1))
p3 <- ggplot(df, aes(x = x20)) + geom_histogram(binwidth = 0.05) +
  ggtitle("n=20") + xlab("") + xlim(c(0, 1))
p4 <- ggplot(df, aes(x = x100)) + geom_histogram(binwidth = 0.02) +
  ggtitle("n=100") + xlab("") + xlim(c(0, 1))
grid.arrange(p1, p2, p3, p4, ncol = 2)
```
---
# Example: Weibull parent
```{r fig.align='center', fig.width=8, fig.height=5}
# CLT demo with a right-skewed Weibull(shape = 1.5) parent: the sampling
# distribution of the mean still approaches normality as n grows.
rs <- rweibull(1000000, 1.5)
# rowMeans() replaces the slow apply(matrix(...), 1, mean) idiom; the
# unused `x50` computation from the original has been removed.
x2 <- rowMeans(matrix(rs, ncol = 2))
x20 <- rowMeans(matrix(rs, ncol = 20))
x100 <- rowMeans(matrix(rs, ncol = 100))
df <- data.frame(x2 = x2[1:1000], x20 = x20[1:1000], x100 = x100[1:1000])
p1 <- ggplot(data.frame(rs), aes(x = rs)) + geom_histogram() +
  ggtitle("POPULATION") + xlab("") + xlim(c(0, 6))
p2 <- ggplot(df, aes(x = x2)) + geom_histogram(binwidth = 0.2) +
  ggtitle("n=2") + xlab("") + xlim(c(0, 6))
p3 <- ggplot(df, aes(x = x20)) + geom_histogram(binwidth = 0.1) +
  ggtitle("n=20") + xlab("") + xlim(c(0, 6))
p4 <- ggplot(df, aes(x = x100)) + geom_histogram(binwidth = 0.05) +
  ggtitle("n=100") + xlab("") + xlim(c(0, 6))
grid.arrange(p1, p2, p3, p4, ncol = 2)
```
---
# Estimation
- Estimate parameters of a distribution from the sample data
- Common approach is maximum likelihood estimation
- Requires assuming we know the basic functional form
---
# Maximum likelihood estimate (MLE)
- Estimate the unknown parameter $\theta$ using the value that maximises the probability (i.e. likelihood) of getting the observed sample
- Likelihood function
$$\begin{aligned}
L(\theta) &=P(X_1=x_1,X_2=x_2, ... ,X_n=x_n~|~\theta) \\ &=f(x_1|\theta)f(x_2|\theta)\cdots f(x_n|\theta) \\
&=\prod_{i=1}^n f(x_i;\theta)
\end{aligned}$$
- This is now a function of $\theta$.
- Use function maximisation to solve.
---
# Example - Mean of normal distribution, assume variance is 1
- MLE estimate of the population mean for a normal model is the sample mean
- Run this numerically
- Suppose we have a sample of two: $x_1=1.0, x_2=3.0$
- Likelihood
$$
\begin{aligned}
L(\mu) &= \frac{1}{\sqrt{2\pi}}e^{-\frac{(1.0-\mu)^2}{2}}\times
\frac{1}{\sqrt{2\pi}}e^{-\frac{(3.0-\mu)^2}{2}}\\
&= \frac{1}{2\pi}e^{-\frac{(1-\mu)^2+(3-\mu)^2}{2}}
\end{aligned}
$$
---
# Plot it
```{r fig.align='center', fig.width=5, fig.height=3}
nmle <- function(mu) {
  # Likelihood of mu for the observed sample {1.0, 3.0} under N(mu, 1):
  # the product of the two normal densities, written in closed form.
  # Vectorised over mu so it can be evaluated on a grid.
  exp(-((1 - mu)^2 + (3 - mu)^2) / 2) / (2 * pi)
}
# Trace the likelihood over a grid of candidate means; the peak at 2.0
# is the sample mean of {1, 3}.
mu_grid <- seq(1, 3, 0.1)
df <- data.frame(x = mu_grid, f = nmle(mu_grid))
ggplot(df, aes(x = x, y = f)) + geom_line() +
  xlab(expression(mu)) + ylab("L")
```
- The maximum is at 2.0. This is the sample mean, which we can prove algebraically is the MLE.
---
# Estimate mean and variance
Sample
```{r}
# Fixed seed so the sample (and the likelihood surface on the next slide)
# is reproducible across builds.
set.seed(1031)
x <- rnorm(n = 22, mean = 3, sd = 5)
x
```
We know it comes from a normal distribution. What are the best guesses for the $\mu, \sigma$?
---
Compute the likelihood for a range of values of both parameters.
```{r fig.align='center', fig.width=5, fig.height=5}
nmle <- function(x, mu, sigma) {
  # Joint likelihood of (mu, sigma) for sample x under a normal model:
  # the product of the per-observation densities.
  prod(dnorm(x, mean = mu, sd = sigma))
}
# Evaluate the likelihood of the sample `x` on a (mu, sigma) grid and
# show it as a heatmap; the brightest tile marks the joint MLE.
mu <- seq(1, 5, 0.1)
sig <- seq(2, 7, 0.1)
g <- expand.grid(x = mu, y = sig)
# mapply() replaces the original element-wise `for (i in 1:nrow(g))` loop
# (1:nrow is unsafe for empty grids, and the loop form is slower).
g$f <- mapply(function(m, s) nmle(x, m, s), g$x, g$y)
ggplot(g, aes(x = x, y = y, fill = f)) + geom_tile() +
  xlab(expression(mu)) + ylab(expression(sigma)) + theme_bw() +
  scale_fill_continuous("L") +
  theme(aspect.ratio = 1)
```
---
# Quantiles
- `quantiles` are cutpoints dividing the range of a probability distribution into contiguous intervals with equal probabilities
- 2-quantile is the median (divides the population into two equal halves)
- 4-quantile are quartiles, $Q_1, Q_2, Q_3$, dividing the population into four equal chunks
- quantiles are values of the random variable $X$
- useful for comparing distributions
---
# Example:
- 12-quantiles for a $N(0,1)$
```{r echo=TRUE}
qnorm(seq(1/12,11/12,1/12))
```
- 23-quantiles from a $Gamma(2,1)$
```{r echo=TRUE}
qgamma(seq(1/23,22/23,1/23), 2)
```
---
# Percentiles
- indicate the value of $X$ below which a given percentage of observations fall, e.g. 20th percentile is the value that has 20% of values below it
- 17th percentile from $N(0,1)$
```{r echo=TRUE}
qnorm(0.17)
```
- 78th percentile from $Gamma(2,1)$
```{r echo=TRUE}
qgamma(0.78, 2)
```
---
# Goodness of fit
- A quantile-quantile plot (QQ-plot) plots theoretical vs sample quantiles
- Let's check the distribution of PISA math scores
```{r fig.align='center', fig.width=8, fig.height=4}
# Load the PISA subset, standardise Australia's math scores, and pair the
# sorted scores with normal theoretical quantiles for a QQ-plot.
# (`aus`, `n` and `aus_math` are reused by the lineup chunk further on.)
st <- readRDS("../data/student_sub.rds")
aus <- st %>% filter(CNT == "AUS")
n <- nrow(aus)
# Theoretical plotting positions: special-cased endpoints, and
# (i - 0.3175) / (n + 0.365) for i = 2, ..., n - 1 (cf. stats::ppoints).
probs <- c(1 - 0.5^(1 / n),
           ((2:(n - 1)) - 0.3175) / (n + 0.365),
           0.5^(1 / n))
aus_math <- data.frame(PV1MATH = sort(scale(aus$PV1MATH)),
                       q = qnorm(probs))
p1 <- ggplot(aus_math, aes(x = PV1MATH)) + geom_histogram(binwidth = 0.5) +
  xlab("Standardised math score")
p2 <- ggplot(aus_math, aes(x = PV1MATH, y = q)) +
  geom_abline(intercept = 0, slope = 1) +
  geom_point() + xlim(c(-5, 5)) + ylim(c(-5, 5)) +
  xlab("Sample quantiles") + ylab("Theoretical quantile") +
  theme(aspect.ratio = 1)
grid.arrange(p1, p2, ncol = 2)
```
---
# QQ-Plot computation
- Sort and standardise the sample values from low to high
- Compute theoretical quantiles (formula below), $n=$ sample size
- Plot the theoretical vs sample quantiles
$$\begin{aligned}
& 1 - 0.5^{(1/n)} ~~~ i=1 \\
~~~~~~~~~~~~~& \frac{i - 0.3175}{n + 0.365} ~~~ i=2, ..., n-1 \\
& 0.5^{(1/n)} ~~~~~~ ~~~ i=n
\end{aligned}$$
---
# Reading QQ-plots
- The points should lie along the $X=Y$ line, for the sample to be consistent with the distribution.
- How close is good enough?
- It depends on the sample size.
- Simulate some samples of the same size from the target distribution, and make QQ-plots of these, to compare with the actual data
---
```{r fig.align='center', fig.width=8, fig.height=5}
# Lineup of QQ-plots: the real data, panel (a), shown alongside five
# reference samples simulated from N(0, 1), so the viewer can judge how
# much wobble is consistent with normal sampling variability.
aus_math <- data.frame(PV1MATH = sort(scale(aus$PV1MATH)),
  q=qnorm(c(1 - 0.5^(1/n), (2:(n-1) - 0.3175) / (n + 0.365),0.5^(1/n))),
  s1 = sort(rnorm(n)), s2 = sort(rnorm(n)), s3 = sort(rnorm(n)),
  s4 = sort(rnorm(n)), s5 = sort(rnorm(n)))
# One QQ panel of sample quantiles vs theoretical quantiles. Factored out
# of six near-identical copy-pasted plot definitions in the original.
qq_panel <- function(smp, title) {
  ggplot(data.frame(s = smp, q = aus_math$q), aes(x = s, y = q)) +
    geom_abline(intercept = 0, slope = 1) +
    geom_point() + xlim(c(-5, 5)) + ylim(c(-5, 5)) +
    xlab("Sample quantiles") + ylab("Theoretical quantile") +
    theme(aspect.ratio = 1) + ggtitle(title)
}
panels <- Map(qq_panel,
  list(aus_math$PV1MATH, aus_math$s1, aus_math$s2,
       aus_math$s3, aus_math$s4, aus_math$s5),
  c("(a)", "(b)", "(c)", "(d)", "(e)", "(f)"))
grid.arrange(grobs = panels, ncol = 3)
```
---
# How different is exponential from Pareto?
Exponential:
$$f(x~|~\lambda) = \lambda e^{-\lambda x} ~~ x\geq 0, \lambda>0$$
Pareto:
$$
f(x~|~\alpha, \lambda) = \frac{\alpha\lambda^\alpha}{(\lambda+x)^{\alpha+1}} ~~~x>0, \alpha>0, \lambda>0
$$
```{r fig.align='center', fig.width=8, fig.height=3, fig.show='hold'}
rpareto <- function(n, c) {
  # Draw n values by inverse-CDF sampling: if U ~ Uniform(0, 1) then
  # U^(-1/c) follows a Pareto distribution with minimum 1 and shape c.
  # NOTE(review): this is Pareto Type I; the slide's density is the
  # shifted (Lomax) form — equivalent here up to location, which the
  # later scale() standardisation removes.
  if (c <= 0) stop("c must be positive")
  runif(n)^(-1 / c)
}
# Standardised exponential and Pareto samples: marginal histograms plus a
# sample-vs-sample QQ-plot. The Pareto's heavy tail pulls the QQ points
# away from the identity line.
df <- data.frame(re = sort(scale(rexp(1000))),
                 rp = sort(scale(rpareto(1000, 1))))
p1 <- ggplot(df, aes(x = re)) + geom_histogram() + ggtitle("Exp")
p2 <- ggplot(df, aes(x = rp)) + geom_histogram() + ggtitle("Pareto")
p3 <- ggplot(df, aes(x = re, y = rp)) +
  geom_abline(intercept = 0, slope = 1) +
  geom_point() +
  xlab("exp") + ylab("Pareto") + ggtitle("QQ-plot") +
  theme(aspect.ratio = 1)
grid.arrange(p1, p2, p3, ncol = 3)
```
---
# How different can exponentials be?
One of these panels compares an exponential sample with a Pareto sample; the rest compare pairs of exponential samples. Can you pick the exponential-vs-Pareto plot?
```{r fig.align='center', fig.width=8, fig.height=5, fig.show='hold'}
# Lineup: one panel is exponential-vs-Pareto, the other five compare
# independent exponential samples.
# BUG FIX: the original reused `re3` in two panels, so panels (c) and (d)
# plotted identical data and only four of the five exponential
# comparisons were distinct. A sixth sample `re6` restores panel (d).
df <- data.frame(re = sort(scale(rexp(100))), rp = sort(scale(rpareto(100, 1))),
  re2 = sort(scale(rexp(100))), re3 = sort(scale(rexp(100))),
  re4 = sort(scale(rexp(100))), re5 = sort(scale(rexp(100))),
  re6 = sort(scale(rexp(100))))
# One unlabelled QQ panel comparing two standardised samples.
lineup_panel <- function(v, title) {
  ggplot(data.frame(a = df$re, b = v), aes(x = a, y = b)) +
    geom_abline(intercept = 0, slope = 1) +
    geom_point() + xlab("") + ylab("") +
    theme(aspect.ratio = 1) + ggtitle(title)
}
panels <- Map(lineup_panel,
  list(df$rp, df$re2, df$re3, df$re6, df$re4, df$re5),
  c("(a)", "(b)", "(c)", "(d)", "(e)", "(f)"))
grid.arrange(grobs = panels, ncol = 3)
```
---
# Resources
- [wikipedia](https://en.wikipedia.org/wiki/Quantile)
- [PSU 414](https://onlinecourses.science.psu.edu/stat414/node/191)
---
class: inverse middle
# Share and share alike
This work is licensed under a Creative Commons Attribution 4.0 International License.