Inspecting the Distribution of Numeric Data

1
2
3
4
5
import numpy as np
# Read the height sample from the teaching-data repository into a NumPy array
# (per the file name: NHANES adult female heights, 2020 -- units not shown here).
heights = np.loadtxt(
    "https://raw.githubusercontent.com/gagolews/"
    "teaching-data/master/marek/nhanes_adult_female_height_2020.txt"
)

# Peek at 24 observations drawn at random without replacement.
np.random.choice(heights, size=24, replace=False)

Histograms

1
2
3
4
5
6
7
import matplotlib.pyplot as plt
import seaborn as sns
# The "seaborn" style sheet bundled with matplotlib was deprecated in
# matplotlib 3.6 (renamed to "seaborn-v0_8") and the old name was later
# removed, so select whichever name this installation actually provides.
plt.style.use(
    "seaborn-v0_8" if "seaborn-v0_8" in plt.style.available else "seaborn"
)
sns.__version__ # FYI

# Histogram of the heights using 11 bins.
sns.histplot(heights, bins=11)
plt.show()
1
2
3
4
# Read the income sample (per the file name: simulated UK incomes, 2020).
income = np.loadtxt(
    "https://raw.githubusercontent.com/gagolews/"
    "teaching-data/master/marek/uk_income_simulated_2020.txt"
)
# 20-bin histogram with bar heights expressed as percentages, not raw counts.
sns.histplot(income, bins=20, stat="percent")
plt.show()
1
2
3
4
5
6
7
# Read the marathon results (the file name suggests finish times in minutes).
marathon = np.loadtxt(
    "https://raw.githubusercontent.com/gagolews/"
    "teaching-data/master/marek/37_pzu_warsaw_marathon_mins.txt"
)

marathon[:5] # preview top 5 (data are sorted increasingly)

# Plot only the observations strictly below 180; the number of bins is left
# to seaborn's default rule.
sns.histplot(marathon[marathon < 180])
plt.show()

Binning

1
2
3
4
5
6
# Same data, two very different bin counts, shown side by side
# on a grid of 1 row and 2 columns.
plt.subplot(1, 2, 1)  # left panel: only 5 bins
sns.histplot(income, bins=5)
plt.subplot(1, 2, 2)  # right panel: 200 bins
sns.histplot(income, bins=200)
plt.ylabel(None)  # clear the y-axis label on the right panel
plt.show()
1
2
3
4
5
6
# Read the pedestrian data (per the file name: Southern Cross Station,
# December 2019); the values are used below as ready-made bar heights.
peds = np.loadtxt(
    "https://raw.githubusercontent.com/gagolews/"
    "teaching-data/master/marek/southern_cross_station_peds_2019_dec.txt"
)
peds

# One unit-width bar per position 0..23 -- presumably one per hour of the day.
plt.bar(np.arange(24), height=peds, width=1, edgecolor="black")
plt.show()
1
2
3
4
# Read the exam data (per the file name: 2019 matura, Polish); the values
# are used below as ready-made bar heights.
matura = np.loadtxt(
    "https://raw.githubusercontent.com/gagolews/"
    "teaching-data/master/marek/matura_2019_polish.txt"
)
# One unit-width bar per position 0..70 -- presumably one per possible score.
plt.bar(np.arange(71), height=matura, width=1, edgecolor="black")
plt.show()

Cumulative Counts

1
2
# Cumulative histogram: each bar accumulates the percentages of all the
# bins up to and including its own.
sns.histplot(heights, cumulative=True, stat="percent")
plt.show()
1
2
3
4
5
6
# Empirical cumulative distribution function of the heights: at the i-th
# smallest observation the curve steps up to i/n and stays there until
# the next observation ("steps-post").
heights_sorted = np.sort(heights)
n = len(heights)
plt.plot(
    heights_sorted,
    np.arange(1, n + 1) / n,
    drawstyle="steps-post",
)
plt.xlabel("$x$")
plt.ylabel(r"$\hat{F}_n(x)$, i.e., Prob(height $\leq$ x)")
plt.show()

Log-scale

1
2
3
4
5
# Read the city data (per the file name: US cities, 2000 -- presumably
# population counts).
cities = np.loadtxt(
    "https://raw.githubusercontent.com/gagolews/"
    "teaching-data/master/other/us_cities_2000.txt"
)

# Keep only the entries with a value of at least 10,000.
large_cities = cities[cities >= 10_000]
large_cities[-5:] # data are sorted
1
2
3
4
5
# First on the ordinary (linear) scale ...
sns.histplot(large_cities, bins=20)
plt.show()

# ... then the same 20-bin histogram with seaborn's log-scale option enabled.
sns.histplot(large_cities, log_scale=True, bins=20)
plt.show()