Visualising Multidimensional Data and Measuring Correlation

1
2
3
4
5
6
7
8
9
import numpy as np
import pandas as pd
body = pd.read_csv("https://raw.githubusercontent.com/gagolews/" +
"teaching-data/master/marek/nhanes_adult_female_bmx_2020.csv",
comment="#")
body = body.to_numpy() # data frames will be covered later
body.shape

body[:6, :] # 6 first rows, all columns

Scatterplots

2D Data

1
2
3
4
5
6
7
8
9
10
11
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use("seaborn")
plt.scatter(body[:, 1], body[:, 3]) # x=body[:, 1], y=body[:, 3]
plt.xlabel("standing height (cm)")
plt.ylabel("upper leg length (cm)")
plt.show()

body[np.argmin(body[:, 1]), [1, 3]]

body[np.argmax(body[:, 3]), [1, 3]]
1
2
3
4
5
6
plt.scatter(body[:, 1], body[:, 3], c="#00000011")
plt.plot(np.mean(body[:, 1]), np.mean(body[:, 3]), "rX") # the centroid
plt.xlabel("standing height (cm)")
plt.ylabel("upper leg length (cm)")
plt.axis("equal")
plt.show()

3D Data and Beyond

1
2
3
4
5
6
7
8
fig = plt.figure()
ax = fig.add_subplot(projection="3d")
ax.scatter(body[:, 1], body[:, 3], body[:, 0], color="#00000011")
ax.view_init(elev=30, azim=20, vertical_axis="y")
ax.set_xlabel("standing height (cm)")
ax.set_ylabel("upper leg length (cm)")
ax.set_zlabel("weight (kg)")
plt.show()
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
from matplotlib import cm
plt.scatter(
body[:, 4], # x
body[:, 5], # y
c=body[:, 0], # "z" - colours
cmap=cm.get_cmap("copper"), # colour map
alpha=0.5 # opaqueness level between 0 and 1
)
plt.xlabel("arm circumference (cm)")
plt.ylabel("hip circumference (cm)")
plt.axis("equal")
plt.rcParams["axes.grid"] = False
cbar = plt.colorbar()
plt.rcParams["axes.grid"] = True
cbar.set_label("weight (kg)")
plt.show()

Scatterplot Matrix

1
2
3
4
5
6
7
8
sns.pairplot(data=pd.DataFrame(
body[:, [0, 1, 4, 5]],
columns=[
"weight (kg)", "standing height (cm)",
"arm circumference (cm)", "hip circumference (cm)"
]
))
plt.show()

Measuring Correlation

Pearson’s Linear Correlation Coefficient

with $s_x$, $s_y$ denoting the standard deviations and $\bar{x}$, $\bar{y}$ being the means of $x = (x_1, \cdots, x_n)$ and $y=(y_1, \cdots, y_n)$, respectively.

1
2
3
4
5
x = body[:, 4]  # arm circumference
y = body[:, 5] # hip circumference
x_std = (x-np.mean(x))/np.std(x, ddof=1)
y_std = (y-np.mean(y))/np.std(y, ddof=1)
np.sum(x_std*y_std)/(len(x)-1)
1
2
import scipy.stats
scipy.stats.pearsonr(x, y)[0] # returns more than we ask for
1
2
3
4
5
def plot_corr(x, y):
r = scipy.stats.pearsonr(x, y)[0]
ρ = scipy.stats.spearmanr(x, y)[0]
plt.scatter(x, y, label=f"r = {r:.3}\nρ = {ρ:.3}")
plt.legend()
1
2
3
4
5
6
7
8
9
np.random.seed(123)
x = np.random.rand(100)
plt.subplot(121)
plot_corr(x, -0.5*x+3) # negative slope
plt.axis("equal")
plt.subplot(122)
plot_corr(x, 3*x+10) # positive slope
plt.axis("equal")
plt.show()
1
2
3
4
5
6
7
8
9
10
11
12
x = np.random.rand(100)
y = 0.5*x
e = np.random.randn(len(x)) # random white noise (of mean 0)
plt.subplot(221)
plot_corr(x, y)
plt.subplot(222)
plot_corr(x, y+0.05*e) # add some noise
plt.subplot(223)
plot_corr(x, y+0.1*e) # more noise
plt.subplot(224)
plot_corr(x, y+0.25*e) # even more noise
plt.show()
1
2
3
4
5
6
7
8
9
plt.subplot(221)
plot_corr(x, np.random.rand(100)) # independent (not correlated)
plt.subplot(222)
plot_corr(x, (2*x-1)**2-1) # quadratic dependence
plt.subplot(223)
plot_corr(x, np.abs(2*x-1)) # another form of dependence
plt.subplot(224)
plot_corr(x, 0.25*np.sin(8*np.pi*x)) # another
plt.show()
1
2
3
4
5
6
7
8
9
plt.subplot(221)
plot_corr(x, 0.7*np.sin(2*np.pi*x))
plt.subplot(222)
plot_corr(x, np.log(x+1))
plt.subplot(223)
plot_corr(x, np.exp(x**2))
plt.subplot(224)
plot_corr(x, 1/x)
plt.show()

Correlation Heatmap

1
2
3
4
5
6
7
8
9
10
11
order = [4, 5, 6, 0, 2, 1, 3]
cols = np.array(["weight", "height", "arm len",
"leg len", "arm circ", "hip circ", "waist circ"])
C = np.corrcoef(body.T)
sns.heatmap(
C[np.ix_(order, order)],
xticklabels=cols[order],
yticklabels=cols[order],
annot=True, fmt=".2f", cmap=cm.get_cmap("copper")
)
plt.show()

Linear Correlation Coefficients on Transformed Data

1
2
3
4
5
6
7
8
9
10
11
12
13
14
world = pd.read_csv("https://raw.githubusercontent.com/gagolews/" +
"teaching-data/master/marek/world_factbook_2020_subset1.csv",
comment="#")
world = world.to_numpy()
world[:6, :] # preview

scipy.stats.pearsonr(world[:, 0], world[:, 1])[0]
scipy.stats.pearsonr(np.log(world[:, 0]), world[:, 1])[0]

plt.subplot(121)
plot_corr(world[:, 0], world[:, 1])
plt.subplot(122)
plot_corr(np.log(world[:, 0]), world[:, 1])
plt.show()

Spearman’s Rank Correlation Coefficient

Spearman’s Rank Correlation Coefficient

1
2
3
4
5
6
7
8
scipy.stats.spearmanr(world[:, 0], world[:, 1])[0]

scipy.stats.pearsonr(
scipy.stats.rankdata(world[:, 0]),
scipy.stats.rankdata(world[:, 1])
)[0]

scipy.stats.spearmanr(np.log(world[:, 0]), -np.sqrt(world[:, 1]))[0]