Basic Usage of Data Analysis

Getting Started with JupyterLab

JupyterLab is a web-based development environment supporting numerous programming languages, including, of course, Python.

jupyterlab

Scalar Types in Python

Basic Operations on Data Frames

doc

Prerequisites

1
2
3
# Prelude shared by the examples below: NumPy for numeric arrays,
# pandas for data frames; plain-text output keeps listings readable.
import numpy as np
import pandas as pd
pd.set_option("display.notebook_repr_html", False) # disable "rich" output

Aggregating

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
np.random.seed(123)  # fix the RNG state so the sample is reproducible
d = pd.DataFrame(dict(
    u = np.round(np.random.rand(5), 2),   # uniform on [0, 1)
    v = np.round(np.random.randn(5), 2),  # standard normal
    w = ["spam", "bacon", "spam", "eggs", "sausage"]
), index=["a", "b", "c", "d", "e"]) # create a sample data frame
d


np.quantile(d.loc[:, "u"], [0, 0.5, 1]) # minimum, median, and maximum of u

# NOTE: the `interpolation` keyword was deprecated in numpy 1.22 and later
# removed; `method` is the current name of this parameter.
np.quantile(d.loc[:, "u"], [0, 0.5, 1], method="nearest") # quantiles of u snapped to observed values

np.quantile(d.loc[:, ["u", "v"]], [0, 0.5, 1], axis=0) # columnwise quantiles of u and v

np.mean(d.loc[:, ["u", "v"]], axis=0) # columnwise means of u and v

# numeric_only=True is only meaningful on the whole frame: it skips the
# string column w (on a purely numeric selection it would be a no-op).
d.mean(numeric_only=True) # mean of every numeric column

d.describe() # summary statistics of the numeric columns

Transforming

1
2
3
4
5
6
7
8
9
10
11
12
13
np.exp(d.loc[:, "u"]) # apply exponential function to u

np.exp(d.loc[:, ["u", "v"]])

u = d.loc[:, "u"]
(u-np.mean(u))/np.std(u) # standardize u

d.loc[:, "u"] > d.loc[:, "v"]

uv = d.loc[:, ["u", "v"]].values # convert to numpy array
uv = (uv-np.mean(uv, axis=0))/np.std(uv, axis=0) # standardize u and v
uv

Filtering

Series

1
2
3
4
5
6
7
8
# A Series with a shuffled integer index: label order differs from position order.
np.random.seed(123)
b = pd.Series(np.round(np.random.uniform(0, 1, 10), 2))
b.index = np.random.permutation(np.arange(10))  # relabel rows in random order
b

# The same data, relabelled with letters instead of integers.
c = b.copy()
c.index = [*"abcdefghij"]
c

DataFrame

1
2
3
4
5
6
7
8
9
# A reproducible sample frame mixing numeric, string, and boolean columns.
np.random.seed(123)
d = pd.DataFrame(
    {
        "u": np.round(np.random.rand(5), 2),
        "v": np.round(np.random.randn(5), 2),
        "w": ["spam", "bacon", "spam", "eggs", "sausage"],
        "x": [True, False, True, False, True],
    },
    index=["a", "b", "c", "d", "e"],
)

# Boolean row filter combined with a label slice of columns:
# rows where u exceeds 0.5, columns from u up to and including w.
d.loc[d.loc[:, "u"] > 0.5, "u":"w"]

Adding Rows and Columns

1
2
# Adding a derived column in place: y is the square of u.
d["y"] = d.loc[:, "u"] ** 2
d

Random Sampling

1
2
3
4
5
6
7
8
9
10
11
12
13
14
# Body measurements of adult females (NHANES 2020 subset); requires
# internet access to fetch the CSV.
body = pd.read_csv("https://raw.githubusercontent.com/gagolews/" +
"teaching-data/master/marek/nhanes_adult_female_bmx_2020.csv",
comment="#")
body.sample(5, random_state=123) # 5 random rows without replacement

x = body.head(10) # this is just an example
x

np.random.seed(123) # reproducibility matters
idx = np.random.permutation(x.shape[0])  # a random reordering of the row positions
idx

# An 80/20-style split: take the first 80% of the shuffled positions.
k = int(x.shape[0]*0.8) # 80% of the data
x.iloc[idx[:k], :] # 80% of the data

Hierarchical Indices

1
2
3
4
5
6
7
8
9
10
11
# A frame indexed by a two-level (hierarchical) key: (year, quarter).
np.random.seed(123)
d = pd.DataFrame(dict(
    year = [y for y in (2023, 2024, 2025) for _ in range(4)],  # each year four times
    quarter = ["Q1", "Q2", "Q3", "Q4"] * 3,
    data = np.round(np.random.rand(12), 2)
)).set_index(["year", "quarter"])
d

d.loc[2023, :]                            # all quarters of a single year
d.loc[[(2023, "Q1"), (2024, "Q3")], :]    # individual (year, quarter) pairs

Reshaping and Fusing Datasets

1
2
3
4
5
6
7
8
9
10
11
12
13
# Independent prelude for this section (it can be run on its own).
import numpy as np
import pandas as pd
pd.set_option("display.notebook_repr_html", False) # disable "rich" output

# 2018 mean air-quality readings per station; requires internet access.
air = pd.read_csv("https://raw.githubusercontent.com/gagolews/" +
"teaching-data/master/marek/air_quality_2018_means.csv",
comment="#")
# Keep only three parameters of interest; renumber the rows from 0.
air = (
air.
loc[air.param_id.isin(["BPM2.5", "NO2", "CO"]), :].
reset_index(drop=True)
)
air

Sorting

1
2
3
air.sort_values("value", ascending=False)  # largest mean readings first
# Sort by parameter name (A-Z) and, within each parameter, by value (largest first).
air.sort_values(["param_id", "value"], ascending=[True, False])

Stacking and Unstacking

1
2
3
# Long -> wide: one row per station (sp_name), one column per parameter.
air_wide = air.set_index(["sp_name", "param_id"]).unstack().loc[:, "value"]

# Wide -> long again: transpose, then stack the columns back into rows.
air_wide.T.rename_axis(index="location", columns="param").stack().rename("value").reset_index()

Set-Theoretic Operations

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
# Two lists of people sharing the Name column; requires internet access.
A = pd.read_csv("https://raw.githubusercontent.com/gagolews/" +
"teaching-data/master/marek/some_birth_dates1.csv",
comment="#")
A

B = pd.read_csv("https://raw.githubusercontent.com/gagolews/" +
"teaching-data/master/marek/some_birth_dates2.csv",
comment="#")
B

A.loc[A.Name.isin(B.Name), :]  # intersection: names present in both A and B

A.loc[~A.Name.isin(B.Name), :]  # difference: in A but not in B

# Union without duplicates: all of A plus the rows of B not already in A.
pd.concat((A, B.loc[~B.Name.isin(A.Name), :]))

Joining (Merging)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
# Metadata about the measured air-quality parameters; requires internet access.
param = pd.read_csv("https://raw.githubusercontent.com/gagolews/" +
"teaching-data/master/marek/air_quality_2018_param.csv",
comment="#")
# NOTE(review): rename returns a NEW frame that is merely displayed here;
# `param` itself keeps the original long column name.
param.rename(dict(param_std_unit_of_measure="unit"), axis=1)

# Two small frames sharing the key column x; some keys match, some do not.
A = pd.DataFrame({
    "x": ["a0", "a1", "a2", "a3"],
    "y": ["b0", "b1", "b2", "b3"]
})
A

B = pd.DataFrame({
    "x": ["a0", "a2", "a2", "a4"],
    "z": ["c0", "c1", "c2", "c3"]
})
B

A.merge(B, on="x")                  # inner join: keys present in both frames

A.merge(B, how="left", on="x")      # keep every row of A, fill gaps with NaN
A.merge(B, how="right", on="x")     # keep every row of B
A.merge(B, how="outer", on="x")     # keep the rows of both frames

Observation Grouping

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
# Prelude for the grouping section: add matplotlib + seaborn for plotting.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
pd.set_option("display.notebook_repr_html", False) # disable "rich" output
# The "seaborn" style sheet was deprecated in matplotlib 3.6 and removed
# in 3.8; "seaborn-v0_8" is the compatible replacement name.
plt.style.use("seaborn-v0_8")

# An urban-forest (tree inventory) data set; requires internet access.
trees = pd.read_csv("https://raw.githubusercontent.com/gagolews/" +
    "teaching-data/master/marek/urban_forest_subset2.csv",
    comment="#")
# Shorter column names; keep only the columns analysed below.
trees = trees.rename({
    "Common Name": "Species",
    "Diameter Breast Height": "Diameter",
    "Year Planted": "Year",
    "Located In": "Location",
    "Age Description": "Age",
}, axis=1).loc[:, ["Species", "Location", "Age", "Diameter", "Year"]]
trees.head()

Manual Splitting Into Subgroups

1
2
3
4
5
6
7
8
# np.split cuts an array at the given positions: [0:3], [3:7], [7:].
np.split(np.arange(10)*10, [3, 7])

# A stable sort keeps the original relative order within equal keys.
trees_srt = trees.sort_values("Location", kind="stable")

# Unique group labels plus the position where each group starts in the sorted frame.
levels, where = np.unique(trees_srt.loc[:, "Location"], return_index=True)
levels, where

# Split the sorted frame into one sub-frame per Location level.
trees_grp = np.split(trees_srt, where[1:])
1
2
3
4
5
6
7
8
9
10
11
12
13
14
# Preview the first and last row of every group. (Loop-body indentation was
# lost in the original listing and is restored here; zip pairs each level
# with its sub-frame directly.)
for level, grp in zip(levels, trees_grp):
    print(f"level='{level}'; preview:")
    print(grp.iloc[[0, -1], :])
    print("")

# Manual aggregation: build one dict of summary statistics per group.
trees_agg = [
    dict(
        level=t.loc[:, "Location"].iloc[0],   # the group's Location label
        diam_mean=np.mean(t.loc[:, "Diameter"]),
        year_mean=np.mean(t.loc[:, "Year"])
    )
    for t in trees_grp
]
trees_agg

The groupby Methods

1
2
3
4
5
# groupby returns lazy helper objects; nothing is computed until an
# aggregation/transformation method is called on them.
type(trees.groupby("Location"))
type(trees.groupby("Location")["Diameter"]) # or (...).Diameter

trees.groupby("Location").size()  # number of rows in each group
# Counts for every (Species, Location) pair, as an ordinary data frame.
trees.groupby(["Species", "Location"]).size().rename("Counts").reset_index()

Aggregating Data in Groups

1
2
3
# Per-group summaries; numeric_only skips the string columns.
trees.groupby("Location").mean(numeric_only=True).reset_index()
trees.groupby("Location")["Diameter"].describe().reset_index()
# Passing raw numpy functions (np.mean, np.median) to aggregate is deprecated
# in pandas >= 2.0; the equivalent string names are future-proof and produce
# the same result columns.
trees.groupby("Age")[["Diameter", "Year"]].aggregate(["mean", "median"]).reset_index()
1
2
3
4
5
6
7
8
# Custom aggregate: half the range, (max-min)/2, of each numeric column per Age.
(trees.loc[:, ["Age", "Diameter", "Year"]]
    .groupby("Age").aggregate(lambda x: (np.max(x)-np.min(x))/2)
    .reset_index())

# A named function labels the result column directly -- clearer than the
# lambda-plus-__name__ trick (PEP 8 discourages assigning lambdas).
# NOTE(review): the statistic computed is the half-range, although the
# original text calls it "midrange"; the name is kept for output compatibility.
def midrange(x):
    """Return half of the range of x, i.e. (max - min) / 2."""
    return (np.max(x) - np.min(x)) / 2

(trees.loc[:, ["Age", "Diameter", "Year"]]
    .groupby("Age").aggregate(["mean", midrange]).reset_index())

Transforming Data in Groups

1
2
3
4
5
6
7
def standardise(x):
    """Return the z-scores of x: subtract the mean, divide by the
    (population, ddof=0) standard deviation. Body indentation restored --
    it was lost in the original listing."""
    return (x-np.mean(x))/np.std(x)

# Group-wise standardisation: each diameter becomes a z-score relative to
# its own species.
trees["Diameter_Species_Std"] = (trees.loc[:, ["Species", "Diameter"]]
    .groupby("Species").transform(standardise))
# Sanity check: within each species the new column has mean ~0.
# String names "mean"/"std" replace np.mean/np.std, which are deprecated as
# aggregate arguments in pandas >= 2.0 (pandas already mapped np.std to its
# own ddof=1 "std", so the output is unchanged).
(trees.loc[:, ["Species", "Diameter", "Diameter_Species_Std"]]
    .groupby("Species").aggregate(["mean", "std"]))

Plotting Data in Groups with seaborn

1
2
3
4
5
6
7
8
9
10
# Distribution of diameters within each species, one box per species.
sns.boxplot(x="Diameter", y="Species", data=trees)
plt.show()

# Grouped bar plot: counts per species, one bar colour per location.
sns.barplot(
y="Counts",
x="Species",
hue="Location",
data=trees.groupby(["Species", "Location"]).size().rename("Counts").reset_index()
)
plt.show()

Database Access

1
2
3
4
5
6
7
# Prelude for the database section: sqlite3 from the standard library plus
# the data-analysis stack used earlier.
import sqlite3
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
pd.set_option("display.notebook_repr_html", False) # disable "rich" output
# The "seaborn" style sheet was deprecated in matplotlib 3.6 and removed
# in 3.8; "seaborn-v0_8" is the compatible replacement name.
plt.style.use("seaborn-v0_8")
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
# Tables from a Travel Stack Exchange dump (2017); requires internet access.
# Each is a gzip-compressed CSV read straight from the URL.
Tags = pd.read_csv("https://raw.githubusercontent.com/gagolews/" +
"teaching-data/master/travel_stackexchange_com_2017/Tags.csv.gz",
comment="#")
Tags.head(3)

Users = pd.read_csv("https://raw.githubusercontent.com/gagolews/" +
"teaching-data/master/travel_stackexchange_com_2017/Users.csv.gz",
comment="#")
Users.head(3)

Badges = pd.read_csv("https://raw.githubusercontent.com/gagolews/" +
"teaching-data/master/travel_stackexchange_com_2017/Badges.csv.gz",
comment="#")

Posts = pd.read_csv("https://raw.githubusercontent.com/gagolews/" +
"teaching-data/master/travel_stackexchange_com_2017/Posts.csv.gz",
comment="#")
Posts.head(3)

Votes = pd.read_csv("https://raw.githubusercontent.com/gagolews/" +
"teaching-data/master/travel_stackexchange_com_2017/Votes.csv.gz",
comment="#")
Votes.head(3)

Exporting Data

1
2
3
4
5
6
7
8
9
10
# Create a throw-away SQLite database file in a fresh temporary directory.
import tempfile, os.path
dbfile = os.path.join(tempfile.mkdtemp(), "travel.db")
print(dbfile)

# Export each data frame as a database table; index=False skips the row labels.
conn = sqlite3.connect(dbfile)
Tags.to_sql("Tags", conn, index=False)
Users.to_sql("Users", conn, index=False)
Badges.to_sql("Badges", conn, index=False)
Posts.to_sql("Posts", conn, index=False)
Votes.to_sql("Votes", conn, index=False)

Example SQL Queries

1
2
3
4
5
6
7
8
9
10
# A first query; LIMIT keeps the displayed result small.
pd.read_sql_query("""
SELECT * FROM Tags LIMIT 3
""", conn)

res1a = pd.read_sql_query("""
SELECT * FROM Tags LIMIT 3
""", conn)

# The data round-trips: the SQL result equals the head of the original frame.
res1b = Tags.head(3)
res1a.equals(res1b)

Filtering

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
# LIKE with % wildcards: tags whose name contains 'europe' anywhere.
pd.read_sql_query("""
SELECT * FROM Tags WHERE TagName LIKE '%europe%'
""", conn)

# IN tests membership in an explicit list of values.
pd.read_sql_query("""
SELECT TagName, Count
FROM Tags
WHERE TagName IN ('poland', 'australia', 'china')
""", conn)

# Compound condition; BETWEEN is inclusive at both ends.
pd.read_sql_query("""
SELECT Title, Score, ViewCount, FavoriteCount
FROM Posts
WHERE PostTypeId=1 AND
ViewCount>=10000 AND
FavoriteCount BETWEEN 35 AND 100
""", conn)

Ordering

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
# Top-scoring posts with a title; ORDER BY ... DESC sorts largest first.
pd.read_sql_query("""
SELECT Title, Score
FROM Posts
WHERE ParentId IS NULL AND Title IS NOT NULL
ORDER BY Score DESC
LIMIT 5
""", conn)

# DISTINCT removes duplicate rows from the result set.
pd.read_sql_query("""
SELECT DISTINCT Name
FROM Badges
WHERE UserId=23
""", conn)

# strftime('%Y', ...) extracts the year part of the date as text.
pd.read_sql_query("""
SELECT DISTINCT Name, strftime('%Y', Date) AS Year
FROM Badges
WHERE UserId=23
""", conn)

Grouping and Aggregating

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
# Aggregates over one user's badges, grouped by badge name.
# NOTE(review): MIN/AVG/MAX here operate on the text strftime produces;
# SQLite coerces it numerically for AVG -- confirm this is intended.
pd.read_sql_query("""
SELECT
Name,
COUNT(*) AS Count,
MIN(strftime('%Y', Date)) AS MinYear,
AVG(strftime('%Y', Date)) AS MeanYear,
MAX(strftime('%Y', Date)) AS MaxYear
FROM Badges
WHERE UserId=23
GROUP BY Name
ORDER BY Count DESC
LIMIT 4
""", conn)

# HAVING filters whole groups after aggregation (WHERE filters rows before).
pd.read_sql_query("""
SELECT
Name,
strftime('%Y', Date) AS Year,
COUNT(*) AS Count
FROM Badges
WHERE UserId=23
GROUP BY Name, Year
HAVING Count > 1
ORDER BY Count DESC
""", conn)

Joining

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
# Three-way join: tag metadata linked to the owner of each tag's wiki post.
pd.read_sql_query("""
SELECT Tags.TagName, Tags.Count, Posts.OwnerUserId,
Users.Age, Users.Location, Users.DisplayName
FROM Tags
JOIN Posts ON Posts.Id=Tags.WikiPostId
JOIN Users ON Users.AccountId=Posts.OwnerUserId
WHERE OwnerUserId != -1
ORDER BY Tags.Count DESC LIMIT 5
""", conn)

# The subquery counts upvotes (VoteTypeId=2) per post; the outer query
# attaches the post titles and keeps questions (PostTypeId=1) only.
pd.read_sql_query("""
SELECT UpVotesTab.*, Posts.Title FROM
(
SELECT PostId, COUNT(*) AS UpVotes
FROM Votes
WHERE VoteTypeId=2
GROUP BY PostId
) AS UpVotesTab
JOIN Posts ON UpVotesTab.PostId=Posts.Id
WHERE Posts.PostTypeId=1
ORDER BY UpVotesTab.UpVotes DESC LIMIT 5
""", conn)

Closing the Database Connection

1
conn.close()