Handling Categorical Data

Representing Categorical Data

Two common ways to represent a categorical variable with k distinct levels are to store it as:
a vector of strings,
a vector of integers between 0 (inclusive) and k (exclusive).
# Load a vector of strings from a remote text file (judging by the file name,
# one country label per marathon participant) and inspect its first 16 items.
import numpy as np

countries = np.loadtxt(
    "https://raw.githubusercontent.com/gagolews/"
    + "teaching-data/master/marek/37_pzu_warsaw_marathon_country.txt",
    dtype="str",
)

x = countries[:16]  # a short prefix keeps the examples readable
x                   # notebook-style display of the sample
np.unique(x)        # the sorted distinct labels present in the sample
# Encoding and Decoding Factors
#
# Encode the string vector `x` as integer codes indexing into the sorted
# array of its unique values, then decode the codes back into labels.
categories, codes = np.unique(x, return_inverse=True)
categories, codes

# Decoding: indexing the category array with the codes reconstructs `x`.
categories[codes]

# The same codes can index any other array of the same length as
# `categories`, here a set of full names used as replacement labels.
np.array(["Ethiopia", "Israel", "Kenya", "Morocco", "Poland"])[codes]
# Binning Numeric Data
#
# Load numeric data from a remote text file (judging by the file name,
# marathon finish times in minutes) and bin the first 16 values.
marathon = np.loadtxt(
    "https://raw.githubusercontent.com/gagolews/"
    + "teaching-data/master/marek/37_pzu_warsaw_marathon_mins.txt"
)
t = marathon[:16]
t

# With the default side="left", searchsorted returns i such that
# bins[i-1] < t <= bins[i], i.e. right-closed intervals; values above the
# last boundary get the code len(bins).
bins = [130, 140, 150]
codes = np.searchsorted(bins, t)
codes

# Pad the boundaries with -inf/+inf so that every code has a matching
# "(lower, upper]" label, built from consecutive pairs of boundaries.
bins2 = np.r_[-np.inf, bins, np.inf]
categories = np.array(
    [f"({lower}, {upper}]" for lower, upper in zip(bins2[:-1], bins2[1:])]
)
categories
categories[codes]
# Generating Pseudorandom Labels
#
# Draw 16 labels with replacement from four categories using the given
# (unequal) probabilities. Seeding the generator makes the draw reproducible.
np.random.seed(123)
labels = np.random.choice(
    ["spam", "bacon", "eggs", "tempeh"],
    p=[0.7, 0.1, 0.15, 0.05],
    replace=True,
    size=16,
)
labels
# Frequency Distributions
# Counting
#
# Two equivalent ways of counting how often each label occurs.
x = countries[:16]

# (a) let np.unique return the counts alongside the unique values;
np.unique(x, return_counts=True)

# (b) encode the labels as integer codes and tabulate the codes:
# counts[i] is the number of occurrences of categories[i].
categories, codes = np.unique(x, return_inverse=True)
counts = np.bincount(codes)
counts
# Print a two-column table of categories and their counts; the format specs
# pad the label to width 4 and the count to width 5.
for category, count in zip(categories, counts):
    print(f"{category:4}: {count:5}")

# Relative frequencies: counts normalised so that they sum to 1.
counts/np.sum(counts)
# Visualising
# Bar Plots
#
# Draw one bar per category, with bar heights given by `counts`.
import matplotlib.pyplot as plt

# The bundled seaborn styles were renamed to "seaborn-v0_8" in Matplotlib 3.6
# and the old name was later removed; prefer the new name and fall back to
# the old one on older Matplotlib versions.
_style = "seaborn-v0_8" if "seaborn-v0_8" in plt.style.available else "seaborn"
plt.style.use(_style)

ind = np.arange(len(categories))  # bar positions 0, 1, ..., k-1
plt.bar(ind, height=counts)
plt.xticks(ind, categories)       # label each bar with its category
plt.show()
# Bar plot of two percentages with the y-axis restricted to 48.9-51.1, so the
# bars do not start at zero and the visual difference between them is
# magnified.
plt.bar([1, 2], height=[51.03, 48.97])
plt.xticks([1, 2], ["Duda", "Trzaskowski"])
plt.ylabel("%")
plt.ylim(48.9, 51.1)
plt.show()
# The same two percentages drawn with the y-axis stretched to 0-250 and tick
# marks only at 0 and 100, which makes the two bars look almost identical.
plt.bar([1, 2], height=[51.03, 48.97])
plt.xticks([1, 2], ["Duda", "Trzaskowski"])
plt.ylabel("%")
plt.ylim(0, 250)
plt.yticks([0, 100])
plt.show()
# Error Bars
#
# Bar plot of two complementary proportions, c/n and (n-c)/n, with error bars
# of a constant half-width 0.031 drawn on top of both bars.
c, n = 516, 1017
plt.bar([1, 2], height=[c/n, (n-c)/n])
plt.errorbar([1, 2], [c/n, (n-c)/n], yerr=0.031, fmt="r")
plt.xticks([1, 2], ["Duda", "Trzaskowski"])
# NOTE(review): the axis is labelled "%" but the plotted values are
# proportions in [0, 1]; confirm whether they should be scaled by 100.
plt.ylabel("%")
plt.show()
# Pareto Charts
#
# Example data: twelve categories and the number of occurrences of each
# (counts[i] belongs to categories[i]).
categories = np.array([
    "Unauthorised drug", "Wrong IV rate", "Wrong patient", "Dose missed",
    "Under dose", "Wrong calculation", "Wrong route", "Wrong drug",
    "Wrong time", "Technique error", "Duplicated drugs", "Over dose"
])
counts = np.array([1, 4, 53, 92, 7, 16, 27, 76, 83, 3, 9, 59])
np.sum(counts)  # total number of occurrences
# Reorder both arrays by decreasing count: argsort yields the ascending
# ordering permutation, and reversing it gives the descending one.
o = np.argsort(counts)[::-1]
categories = categories[o]
counts = counts[o]

# Print the sorted table; labels are padded to width 20, counts to width 2.
for category, count in zip(categories, counts):
    print(f"{category:20}: {count:2}")
# Pareto chart: bars showing each category's percentage share (left axis)
# overlaid with a line showing the cumulative percentage (right axis).
x = np.arange(len(categories))
p = 100.0*counts/np.sum(counts)  # percentage share of each category

fig, ax1 = plt.subplots()
# Tick labels are rotated by 60 degrees; the ticks are placed at x-0.5,
# i.e. shifted half a unit to the left of the bar positions.
ax1.set_xticks(x-0.5, categories, rotation=60)
ax1.set_ylabel("%")
ax1.bar(x, height=p)

# Second y-axis sharing the same x-axis, used for the cumulative curve.
ax2 = ax1.twinx()
ax2.plot(x, np.cumsum(p), "ro-")
ax2.grid(visible=False)  # avoid a second grid drawn over the first one
ax2.set_ylabel("cumulative %")

fig.tight_layout()
plt.show()
# Print the cumulative percentages, rounded to one decimal place, next to
# the corresponding category labels (padded to width 20).
for category, cumprob in zip(categories, np.round(np.cumsum(p), 1)):
    print(f"{category:20}: {cumprob:5}%")
# Aggregating
#
# The mode: the most frequently occurring category.
categories, counts = np.unique(countries, return_counts=True)
categories[np.argmax(counts)]  # argmax returns only the first maximum

# When several categories may tie for the highest count, select all of them
# by comparing every count against the maximum.
categories2, counts2 = np.unique(countries[:22], return_counts=True)
categories2[np.where(counts2 == np.max(counts2))]
# Binary Data and Logical Vectors
#
# Casting between logical and numeric vectors.
np.array([True, False, True, True, False]).astype(int)

# Casting numbers to bool and comparing them against 0 are written here as
# two alternative ways of obtaining a logical vector from the same data.
np.array([-2, -0.326, -0.000001, 0.0, 0.1, 1, 7643]).astype(bool)
np.array([-2, -0.326, -0.000001, 0.0, 0.1, 1, 7643]) != 0

x = countries[:16]
x

# Summing a logical vector counts its True elements; the mean gives the
# proportion of True elements.
np.sum(x == "KE")
np.mean(x == "KE")