Multivariate Categorical and Relational Data

1
2
3
4
5
import numpy as np
marathon = np.loadtxt("https://raw.githubusercontent.com/gagolews/" +
"teaching-data/master/marek/37_pzu_warsaw_marathon_3groups_top1000.txt",
delimiter=",", dtype=str)
marathon[:6, :] # preview

Two-Way Contingency Tables

1
2
3
np.unique(marathon[:, 0])

np.unique(marathon[:, 1])
1
2
3
4
5
6
import scipy.stats
l, v = scipy.stats.contingency.crosstab(marathon[:, 0], marathon[:, 1])
l, v

import marek
marek.print_labelled_array(l, v)
1
2
3
marek.print_labelled_array(l[0], np.sum(v, axis=1))

marek.print_labelled_array(l[1], np.sum(v, axis=0))

Visualising

Heat Map

1
2
3
4
5
6
7
8
9
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use("seaborn")
from matplotlib import cm
sns.heatmap(
v, xticklabels=l[1], yticklabels=l[0],
annot=True, fmt="d", cmap=cm.get_cmap("copper")
)
plt.show()

Bar Plot

1
2
3
4
5
6
ind = np.arange(len(l[1]))
plt.bar(ind-0.25, height=v[0, :], width=0.4, label=l[0][0])
plt.bar(ind+0.25, height=v[1, :], width=0.4, label=l[0][1])
plt.xticks(ind, l[1])
plt.legend()
plt.show()
1
2
3
4
5
6
7
8
ind = np.arange(len(l[0]))
p_sex = v/np.sum(v, axis=1).reshape(-1, 1)*100.0 # normalise each row (100%)
c_sex = np.insert(np.cumsum(p_sex, axis=1), 0, 0, axis=1) # prepend column [0]
for j in range(p_sex.shape[1]):
plt.barh(ind, left=c_sex[:, j], width=p_sex[:, j], label=l[1][j])
plt.yticks(ind, l[0])
plt.legend(ncol=len(l[1]))
plt.show()

Higher-Order Contingency Tables

1
2
3
4
5
marathon = np.column_stack((
marathon,
np.where(marathon[:, 2] == "PL", "PL",
np.where(marathon[:, 2] == "KE", "KE", "XX"))
))
1
2
3
4
5
6
7
8
l, v = scipy.stats.contingency.crosstab(marathon[:, 3], marathon[:, 0], marathon[:, 1])
l, v

marek.print_labelled_array(l, v)

marek.print_labelled_array((l[0], l[1]), np.sum(v, axis=2))

marek.print_labelled_array(l[0], np.sum(v, axis=(1, 2)))

Measuring Association

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
l = [
["Arthritis", "Asthma", "Back problems", "Cancer (malignant neoplasms)",
"Chronic obstructive pulmonary disease", "Diabetes mellitus",
"Heart, stroke and vascular disease", "Kidney disease",
"Mental and behavioural conditions", "Osteoporosis"],
["15-44", "45-64", "65+"]
]
v = 1000*np.array([
[ 360.2, 1489.0, 1772.2],
[1069.7, 741.9, 433.7],
[1469.6, 1513.3, 955.3],
[ 28.1, 162.7, 237.5],
[ 103.8, 207.0, 251.9],
[ 135.4, 427.3, 607.7],
[ 94.0, 344.4, 716.0],
[ 29.6, 67.7, 123.3],
[2218.9, 1390.6, 725.0],
[ 36.1, 312.3, 564.7],
]).astype(int)
marek.print_labelled_array(l, v)
1
2
3
4
5
scipy.stats.contingency.association(v)

e = np.sum(v, axis=1).reshape(-1, 1)*np.sum(v, axis=0).reshape(1, -1)/np.sum(v)
chisq = np.sum((v - e)**2/e)
np.sqrt(chisq/np.sum(v)/(np.min(v.shape)-1))