Continuous Probability Distributions

1
2
3
4
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use("seaborn")
1
2
3
4
5
6
heights = np.loadtxt("https://raw.githubusercontent.com/gagolews/" +
"teaching-data/master/marek/nhanes_adult_female_height_2020.txt")
sns.histplot(heights, stat="density", kde=True)
plt.show()

import scipy.stats

Normal Distribution

Normal Distribution

1
2
3
4
5
6
7
8
μ = np.mean(heights)  # an estimator of expected value
σ = np.std(heights, ddof=1) # an estimator of standard deviation
μ, σ

sns.histplot(heights, stat="density", kde=True)
x = np.linspace(np.min(heights), np.max(heights), 1000)
plt.plot(x, scipy.stats.norm.pdf(x, μ, σ), "r:") # a dotted red curve
plt.show()

Comparing Cumulative Distribution Functions

1
2
3
4
5
6
7
8
9
10
11
n = len(heights)
x = np.linspace(np.min(heights), np.max(heights), 1001)
probs = scipy.stats.norm.cdf(x, μ, σ) # sample the CDF at many points
plt.plot(x, probs, "r--", label="Theoretical CDF")
heights_sorted = np.sort(heights)
plt.plot(heights_sorted, np.arange(1, n+1)/n,
drawstyle="steps-post", label="Empirical CDF")
plt.xlabel("$x$")
plt.ylabel("Prob(height $\\leq$ x)")
plt.legend()
plt.show()

Q-Q plots

Q-Q plots

1
2
3
4
5
6
7
8
9
n = len(heights)
q = np.arange(1, n+1)/(n+1) # 1/(n+1), 2/(n+2), ..., n/(n+1)
heights_sorted = np.sort(heights) # theoretical quantiles
quantiles = scipy.stats.norm.ppf(q, μ, σ) # sample quantiles
plt.plot(quantiles, heights_sorted, "o")
plt.axline((heights_sorted[n//2], heights_sorted[n//2]), slope=1)
plt.xlabel("Theoretical quantiles")
plt.ylabel("Sample quantiles")
plt.show()

Log-normal Distribution

Log-normal Distribution

1
2
3
4
5
6
7
8
income = np.loadtxt("https://raw.githubusercontent.com/gagolews/" +
"teaching-data/master/marek/uk_income_simulated_2020.txt")
sns.histplot(income, stat="density", kde=True, log_scale=True)
plt.show()

μ = np.mean(np.log(income))
σ = np.std(np.log(income), ddof=1)
μ, σ

1
2
3
4
sns.histplot(income, stat="density", kde=True)
x = np.linspace(np.min(income), np.max(income), 1000)
plt.plot(x, scipy.stats.lognorm.pdf(x, s=σ, scale=np.exp(μ)), "r:")
plt.show()
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
n = len(income)
q = np.arange(1, n+1)/(n+1)
income_sorted = np.sort(income)

plt.subplot(121)
quantiles = scipy.stats.lognorm.ppf(q, s=σ, scale=np.exp(μ))
plt.plot(quantiles, income_sorted, "o")
plt.axline((income_sorted[n//2], income_sorted[n//2]), slope=1)
plt.xlabel("Log-normal theoretical quantiles")
plt.ylabel("Sample quantiles")
plt.xscale("log")
plt.yscale("log")

plt.subplot(122)
quantiles2 = scipy.stats.norm.ppf(
q, np.mean(income), np.std(income_sorted, ddof=1)
)
plt.plot(quantiles2, income_sorted, "o")
plt.axline((income_sorted[n//2], income_sorted[n//2]), slope=1)
plt.xlabel("Normal theoretical quantiles")

plt.show()

Pareto Distribution

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
cities = np.loadtxt("https://raw.githubusercontent.com/gagolews/" +
"teaching-data/master/other/us_cities_2000.txt")
len(cities), sum(cities) # number of cities, total population

sns.histplot(cities, bins=20, log_scale=True)
plt.show()

min_size = 10_000
large_cities = cities[cities >= min_size]
len(large_cities), sum(large_cities) # number of cities, total population

sns.histplot(large_cities, bins=20, log_scale=(True, True))
plt.show()

α = 1/np.mean(np.log(large_cities/min_size))
α

b = np.geomspace(min_size, np.max(large_cities), 21) # bins' boundaries
c = np.histogram(large_cities, bins=b)[0] # apply binning
c = c/np.sum(c) # normalise so that it adds to 1

p = np.diff(scipy.stats.pareto.cdf(b, α, scale=min_size))
p = p/np.sum(p) # normalise so that it adds to 1

midb = 0.5*(b[:-1]+b[1:]) # mid-bins
plt.bar(midb, width=np.diff(b), height=c, edgecolor="black")
plt.plot(midb, p, "r:")
plt.xscale("log")
plt.yscale("log")
plt.show()


n = len(large_cities)
q = np.arange(1, n+1)/(n+1)
cities_sorted = np.sort(large_cities)
quantiles = scipy.stats.pareto.ppf(q, α, scale=min_size)
plt.plot(quantiles, cities_sorted, "o")
plt.axline((quantiles[n//2], cities_sorted[n//2]), slope=1)
plt.xlabel("Theoretical quantiles")
plt.ylabel("Sample quantiles")
plt.xscale("log")
plt.yscale("log")
plt.show()

Uniform Distribution

1
2
3
4
5
6
7
8
9
lotto = np.loadtxt("https://raw.githubusercontent.com/gagolews/" +
"teaching-data/master/marek/lotto_table.txt")
plt.bar(np.arange(1, 50), width=1, height=lotto, edgecolor="black")
plt.show()

x = np.arange(1, 50)
plt.bar(x, width=1, height=lotto/np.sum(lotto), edgecolor="black")
plt.plot(x, scipy.stats.uniform.pdf(x, 1, 49), "r:")
plt.show()

Generating Pseudorandom Numbers

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
np.random.rand(5)
scipy.stats.uniform.rvs(-10, 25, size=5) # from -10 to -10+25

np.random.seed(123) # set seed
np.random.rand(5)

np.random.seed(123) # set seed
np.random.rand(5)

scipy.stats.uniform.rvs(size=5, random_state=123)

sample = scipy.stats.norm.rvs(100, 16, size=250, random_state=12641)
sns.histplot(sample, stat="density", kde=True)
x = np.linspace(36, 164, 1001)
plt.plot(x, scipy.stats.norm.pdf(x, 100, 16), "r:")
plt.show()

Distribution Mixtures

1
2
3
4
5
6
7
8
9
10
11
12
13
peds = np.loadtxt("https://raw.githubusercontent.com/gagolews/" +
"teaching-data/master/marek/southern_cross_station_peds_2019_dec.txt")
plt.bar(np.arange(0, 24), width=1, height=peds, edgecolor="black")
plt.show()

plt.bar(np.arange(0, 24), width=1, height=peds/np.sum(peds), edgecolor="black")
x = np.arange(0, 25, 0.1)
p1 = scipy.stats.norm.pdf(x, 8, 1)
p2 = scipy.stats.norm.pdf(x, 12, 1)
p3 = scipy.stats.norm.pdf(x, 17, 2)
p = 0.35*p1 + 0.1*p2 + 0.55*p3 # weighted (convex) combination of 3 densities
plt.plot(x, p, "r:")
plt.show()