Continuous Probability Distributions 1 2 3 4 import numpy as npimport matplotlib.pyplot as pltimport seaborn as snsplt.style.use("seaborn" )
1 2 3 4 5 6 heights = np.loadtxt("https://raw.githubusercontent.com/gagolews/" + "teaching-data/master/marek/nhanes_adult_female_height_2020.txt" ) sns.histplot(heights, stat="density" , kde=True ) plt.show() import scipy.stats
Normal Distribution Normal Distribution 1 2 3 4 5 6 7 8 μ = np.mean(heights) σ = np.std(heights, ddof=1 ) μ, σ sns.histplot(heights, stat="density" , kde=True ) x = np.linspace(np.min (heights), np.max (heights), 1000 ) plt.plot(x, scipy.stats.norm.pdf(x, μ, σ), "r:" ) plt.show()
Comparing Cumulative Distribution Functions 1 2 3 4 5 6 7 8 9 10 11 n = len (heights) x = np.linspace(np.min (heights), np.max (heights), 1001 ) probs = scipy.stats.norm.cdf(x, μ, σ) plt.plot(x, probs, "r--" , label="Theoretical CDF" ) heights_sorted = np.sort(heights) plt.plot(heights_sorted, np.arange(1 , n+1 )/n, drawstyle="steps-post" , label="Empirical CDF" ) plt.xlabel("$x$" ) plt.ylabel("Prob(height $\\leq$ x)" ) plt.legend() plt.show()
Q-Q plots Q-Q plots 1 2 3 4 5 6 7 8 9 n = len (heights) q = np.arange(1 , n+1 )/(n+1 ) heights_sorted = np.sort(heights) quantiles = scipy.stats.norm.ppf(q, μ, σ) plt.plot(quantiles, heights_sorted, "o" ) plt.axline((heights_sorted[n//2 ], heights_sorted[n//2 ]), slope=1 ) plt.xlabel("Theoretical quantiles" ) plt.ylabel("Sample quantiles" ) plt.show()
Log-normal Distribution Log-normal Distribution 1 2 3 4 5 6 7 8 income = np.loadtxt("https://raw.githubusercontent.com/gagolews/" + "teaching-data/master/marek/uk_income_simulated_2020.txt" ) sns.histplot(income, stat="density" , kde=True , log_scale=True ) plt.show() μ = np.mean(np.log(income)) σ = np.std(np.log(income), ddof=1 ) μ, σ
1 2 3 4 sns.histplot(income, stat="density" , kde=True ) x = np.linspace(np.min (income), np.max (income), 1000 ) plt.plot(x, scipy.stats.lognorm.pdf(x, s=σ, scale=np.exp(μ)), "r:" ) plt.show()
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 n = len (income) q = np.arange(1 , n+1 )/(n+1 ) income_sorted = np.sort(income) plt.subplot(121 ) quantiles = scipy.stats.lognorm.ppf(q, s=σ, scale=np.exp(μ)) plt.plot(quantiles, income_sorted, "o" ) plt.axline((income_sorted[n//2 ], income_sorted[n//2 ]), slope=1 ) plt.xlabel("Log-normal theoretical quantiles" ) plt.ylabel("Sample quantiles" ) plt.xscale("log" ) plt.yscale("log" ) plt.subplot(122 ) quantiles2 = scipy.stats.norm.ppf( q, np.mean(income), np.std(income_sorted, ddof=1 ) ) plt.plot(quantiles2, income_sorted, "o" ) plt.axline((income_sorted[n//2 ], income_sorted[n//2 ]), slope=1 ) plt.xlabel("Normal theoretical quantiles" ) plt.show()
Pareto Distribution 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 cities = np.loadtxt("https://raw.githubusercontent.com/gagolews/" + "teaching-data/master/other/us_cities_2000.txt" ) len (cities), sum (cities) sns.histplot(cities, bins=20 , log_scale=True ) plt.show() min_size = 10_000 large_cities = cities[cities >= min_size] len (large_cities), sum (large_cities) sns.histplot(large_cities, bins=20 , log_scale=(True , True )) plt.show() α = 1 /np.mean(np.log(large_cities/min_size)) α b = np.geomspace(min_size, np.max (large_cities), 21 ) c = np.histogram(large_cities, bins=b)[0 ] c = c/np.sum (c) p = np.diff(scipy.stats.pareto.cdf(b, α, scale=min_size)) p = p/np.sum (p) midb = 0.5 *(b[:-1 ]+b[1 :]) plt.bar(midb, width=np.diff(b), height=c, edgecolor="black" ) plt.plot(midb, p, "r:" ) plt.xscale("log" ) plt.yscale("log" ) plt.show() n = len (large_cities) q = np.arange(1 , n+1 )/(n+1 ) cities_sorted = np.sort(large_cities) quantiles = scipy.stats.pareto.ppf(q, α, scale=min_size) plt.plot(quantiles, cities_sorted, "o" ) plt.axline((quantiles[n//2 ], cities_sorted[n//2 ]), slope=1 ) plt.xlabel("Theoretical quantiles" ) plt.ylabel("Sample quantiles" ) plt.xscale("log" ) plt.yscale("log" ) plt.show()
1 2 3 4 5 6 7 8 9 lotto = np.loadtxt("https://raw.githubusercontent.com/gagolews/" + "teaching-data/master/marek/lotto_table.txt" ) plt.bar(np.arange(1 , 50 ), width=1 , height=lotto, edgecolor="black" ) plt.show() x = np.arange(1 , 50 ) plt.bar(x, width=1 , height=lotto/np.sum (lotto), edgecolor="black" ) plt.plot(x, scipy.stats.uniform.pdf(x, 1 , 49 ), "r:" ) plt.show()
Generating Pseudorandom Numbers 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 np.random.rand(5 ) scipy.stats.uniform.rvs(-10 , 25 , size=5 ) np.random.seed(123 ) np.random.rand(5 ) np.random.seed(123 ) np.random.rand(5 ) scipy.stats.uniform.rvs(size=5 , random_state=123 ) sample = scipy.stats.norm.rvs(100 , 16 , size=250 , random_state=12641 ) sns.histplot(sample, stat="density" , kde=True ) x = np.linspace(36 , 164 , 1001 ) plt.plot(x, scipy.stats.norm.pdf(x, 100 , 16 ), "r:" ) plt.show()
Distribution Mixtures 1 2 3 4 5 6 7 8 9 10 11 12 13 peds = np.loadtxt("https://raw.githubusercontent.com/gagolews/" + "teaching-data/master/marek/southern_cross_station_peds_2019_dec.txt" ) plt.bar(np.arange(0 , 24 ), width=1 , height=peds, edgecolor="black" ) plt.show() plt.bar(np.arange(0 , 24 ), width=1 , height=peds/np.sum (peds), edgecolor="black" ) x = np.arange(0 , 25 , 0.1 ) p1 = scipy.stats.norm.pdf(x, 8 , 1 ) p2 = scipy.stats.norm.pdf(x, 12 , 1 ) p3 = scipy.stats.norm.pdf(x, 17 , 2 ) p = 0.35 *p1 + 0.1 *p2 + 0.55 *p3 plt.plot(x, p, "r:" ) plt.show()