Multivariate Categorical and Relational Data

import numpy as np
marathon = np.loadtxt("https://raw.githubusercontent.com/gagolews/" +
    "teaching-data/master/marek/37_pzu_warsaw_marathon_3groups_top1000.txt",
    delimiter=",", dtype=str)
marathon[:6, :]  # preview

Two-Way Contingency Tables

1
2
3

np.unique(marathon[:, 0])

np.unique(marathon[:, 1])

import scipy.stats
l, v = scipy.stats.contingency.crosstab(marathon[:, 0], marathon[:, 1])
l, v

import marek
marek.print_labelled_array(l, v)

1
2
3

marek.print_labelled_array(l[0], np.sum(v, axis=1))

marek.print_labelled_array(l[1], np.sum(v, axis=0))

Visualising

Heat Map

import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use("seaborn")
from matplotlib import cm
sns.heatmap(
    v, xticklabels=l[1], yticklabels=l[0],
    annot=True, fmt="d", cmap=cm.get_cmap("copper")
)
plt.show()

Bar Plot

ind = np.arange(len(l[1]))
plt.bar(ind-0.25, height=v[0, :], width=0.4, label=l[0][0])
plt.bar(ind+0.25, height=v[1, :], width=0.4, label=l[0][1])
plt.xticks(ind, l[1])
plt.legend()
plt.show()

ind = np.arange(len(l[0]))
p_sex = v/np.sum(v, axis=1).reshape(-1, 1)*100.0  # normalise each row (100%)
c_sex = np.insert(np.cumsum(p_sex, axis=1), 0, 0, axis=1)  # prepend column [0]
for j in range(p_sex.shape[1]):
    plt.barh(ind, left=c_sex[:, j], width=p_sex[:, j], label=l[1][j])
plt.yticks(ind, l[0])
plt.legend(ncol=len(l[1]))
plt.show()

Higher-Order Contingency Tables

marathon = np.column_stack((
    marathon,
    np.where(marathon[:, 2] == "PL", "PL",
        np.where(marathon[:, 2] == "KE", "KE", "XX"))
))

l, v = scipy.stats.contingency.crosstab(marathon[:, 3], marathon[:, 0], marathon[:, 1])
l, v

marek.print_labelled_array(l, v)

marek.print_labelled_array((l[0], l[1]), np.sum(v, axis=2))

marek.print_labelled_array(l[0], np.sum(v, axis=(1, 2)))

Measuring Association

l = [
    ["Arthritis", "Asthma", "Back problems", "Cancer (malignant neoplasms)",
     "Chronic obstructive pulmonary disease", "Diabetes mellitus",
     "Heart, stroke and vascular disease", "Kidney disease",
     "Mental and behavioural conditions", "Osteoporosis"],
    ["15-44", "45-64", "65+"]
]
v = 1000*np.array([
    [ 360.2,     1489.0,      1772.2],
    [1069.7,      741.9,       433.7],
    [1469.6,     1513.3,       955.3],
    [  28.1,      162.7,       237.5],
    [ 103.8,      207.0,       251.9],
    [ 135.4,      427.3,       607.7],
    [  94.0,      344.4,       716.0],
    [  29.6,       67.7,       123.3],
    [2218.9,     1390.6,       725.0],
    [  36.1,      312.3,       564.7],
]).astype(int)
marek.print_labelled_array(l, v)

scipy.stats.contingency.association(v)

e = np.sum(v, axis=1).reshape(-1, 1)*np.sum(v, axis=0).reshape(1, -1)/np.sum(v)
chisq = np.sum((v - e)**2/e)
np.sqrt(chisq/np.sum(v)/(np.min(v.shape)-1))