Visualising Multidimensional Data and Measuring Correlation
1 2 3 4 5 6 7 8 9
import numpy as np
import pandas as pd

# Download the body-measurements CSV (lines starting with "#" are comments
# in the file) and convert it straight to a plain NumPy matrix.
body = pd.read_csv(
    "https://raw.githubusercontent.com/gagolews/"
    "teaching-data/master/marek/nhanes_adult_female_bmx_2020.csv",
    comment="#",
).to_numpy()  # data frames will be covered later

body.shape  # (number of rows, number of columns)
# Preview the matrix: slicing only the first axis keeps every column.
body[:6]  # 6 first rows, all columns
Scatterplots
2D Data
1 2 3 4 5 6 7 8 9 10 11
import matplotlib.pyplot as plt
import seaborn as sns

# matplotlib 3.6 renamed its bundled seaborn style sheets to "seaborn-v0_8*"
# and later releases dropped the old "seaborn" alias, so pick whichever name
# this installation actually provides instead of hard-coding the old one.
plt.style.use(
    "seaborn-v0_8" if "seaborn-v0_8" in plt.style.available else "seaborn"
)

# Scatter plot of two columns of the `body` matrix (labelled below as
# standing height and upper leg length).
plt.scatter(body[:, 1], body[:, 3])  # x=body[:, 1], y=body[:, 3]
plt.xlabel("standing height (cm)")
plt.ylabel("upper leg length (cm)")
plt.show()
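The Pearson linear correlation coefficient is given by $$r(x, y) = \frac{1}{n-1} \sum_{i=1}^{n} \frac{x_i - \bar{x}}{s_x} \, \frac{y_i - \bar{y}}{s_y},$$ with $s_x$, $s_y$ denoting the (sample) standard deviations and $\bar{x}$, $\bar{y}$ being the means of $x = (x_1, \dots, x_n)$ and $y = (y_1, \dots, y_n)$, respectively.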
1 2 3 4 5
x = body[:, 4]  # arm circumference
y = body[:, 5]  # hip circumference

# Standardise each variable: centre it at its mean and scale it by the
# sample standard deviation (ddof=1, i.e. the n-1 denominator).
x_std = (x - x.mean()) / x.std(ddof=1)
y_std = (y - y.mean()) / y.std(ddof=1)

# Pearson's r computed by hand: sum of the products of the standardised
# values, divided by n-1.
np.sum(x_std * y_std) / (x.size - 1)
1 2
import scipy.stats

# pearsonr returns more than we ask for; the coefficient itself is the
# first element of the result.
pearson_result = scipy.stats.pearsonr(x, y)
pearson_result[0]
1 2 3 4 5
def plot_corr(x, y):
    """Draw a scatter plot of y against x, annotated with two correlations.

    The legend entry shows Pearson's r and Spearman's ρ for the two samples,
    each formatted to three significant digits.

    Parameters
    ----------
    x, y
        Paired samples accepted by ``scipy.stats.pearsonr``,
        ``scipy.stats.spearmanr`` and ``plt.scatter``.

    Returns
    -------
    None
        The points are drawn on the current matplotlib axes.
    """
    # Both SciPy functions return more than the coefficient; element 0 is
    # the coefficient itself.
    pearson_r = scipy.stats.pearsonr(x, y)[0]
    spearman_rho = scipy.stats.spearmanr(x, y)[0]
    plt.scatter(x, y, label=f"r = {pearson_r:.3}\nρ = {spearman_rho:.3}")
    plt.legend()
x = np.random.rand(100)
y = 0.5*x
e = np.random.randn(len(x))  # random white noise (of mean 0)

# One panel per noise level, laid out on a 2x2 grid (subplots 221..224):
# the exact linear relationship first, then increasingly noisy versions.
noisy_series = [
    y,             # no noise
    y + 0.05*e,    # add some noise
    y + 0.1*e,     # more noise
    y + 0.25*e,    # even more noise
]
for slot, y_noisy in enumerate(noisy_series, start=221):
    plt.subplot(slot)
    plot_corr(x, y_noisy)
plt.show()
1 2 3 4 5 6 7 8 9
# Four response variables plotted against the same x on a 2x2 grid
# (subplots 221..224), each with its correlation coefficients in the legend.
responses = [
    np.random.rand(100),         # independent (not correlated)
    (2*x - 1)**2 - 1,            # quadratic dependence
    np.abs(2*x - 1),             # another form of dependence
    0.25*np.sin(8*np.pi*x),      # another
]
for slot, response in enumerate(responses, start=221):
    plt.subplot(slot)
    plot_corr(x, response)
plt.show()