Basic Usage of Data Analysis

Getting Started with JupyterLab

JupyterLab is a web-based development environment supporting numerous programming languages, including, of course, Python.

jupyterlab

Scalar Types in Python

Basic Operations on Data Frames

doc

Prerequisites

1
2
3
# Prelude shared by the examples below: NumPy for numeric arrays,
# pandas for data frames; plain-text output keeps listings readable.
import numpy as np
import pandas as pd
pd.set_option("display.notebook_repr_html", False) # disable "rich" output

Aggregating

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
np.random.seed(123)  # fix the RNG state so the sample is reproducible
d = pd.DataFrame(dict(
    u = np.round(np.random.rand(5), 2),   # uniform on [0, 1)
    v = np.round(np.random.randn(5), 2),  # standard normal
    w = ["spam", "bacon", "spam", "eggs", "sausage"]
), index=["a", "b", "c", "d", "e"]) # create a sample data frame
d


np.quantile(d.loc[:, "u"], [0, 0.5, 1]) # minimum, median, and maximum of u

# NOTE: the `interpolation` keyword was deprecated in numpy 1.22 and later
# removed; `method` is the current name of this parameter.
np.quantile(d.loc[:, "u"], [0, 0.5, 1], method="nearest") # quantiles of u snapped to observed values

np.quantile(d.loc[:, ["u", "v"]], [0, 0.5, 1], axis=0) # columnwise quantiles of u and v

np.mean(d.loc[:, ["u", "v"]], axis=0) # columnwise means of u and v

# numeric_only=True is only meaningful on the whole frame: it skips the
# string column w (on a purely numeric selection it would be a no-op).
d.mean(numeric_only=True) # mean of every numeric column

d.describe() # summary statistics of the numeric columns

Transforming

1
2
3
4
5
6
7
8
9
10
11
12
13
np.exp(d.loc[:, "u"]) # apply exponential function to u

np.exp(d.loc[:, ["u", "v"]])

u = d.loc[:, "u"]
(u-np.mean(u))/np.std(u) # standardize u

d.loc[:, "u"] > d.loc[:, "v"]

uv = d.loc[:, ["u", "v"]].values # convert to numpy array
uv = (uv-np.mean(uv, axis=0))/np.std(uv, axis=0) # standardize u and v
uv

Filtering

Series

1
2
3
4
5
6
7
8
# A Series with a shuffled integer index: label order differs from position order.
np.random.seed(123)
b = pd.Series(np.round(np.random.uniform(0, 1, 10), 2))
b.index = np.random.permutation(np.arange(10))  # relabel rows in random order
b

# The same data, relabelled with letters instead of integers.
c = b.copy()
c.index = [*"abcdefghij"]
c

DataFrame

1
2
3
4
5
6
7
8
9
# A reproducible sample frame mixing numeric, string, and boolean columns.
np.random.seed(123)
d = pd.DataFrame(
    {
        "u": np.round(np.random.rand(5), 2),
        "v": np.round(np.random.randn(5), 2),
        "w": ["spam", "bacon", "spam", "eggs", "sausage"],
        "x": [True, False, True, False, True],
    },
    index=["a", "b", "c", "d", "e"],
)

# Boolean row filter combined with a label slice of columns:
# rows where u exceeds 0.5, columns from u up to and including w.
d.loc[d.loc[:, "u"] > 0.5, "u":"w"]

Adding Rows and Columns

1
2
# Adding a derived column in place: y is the square of u.
d["y"] = d.loc[:, "u"] ** 2
d

Random Sampling

1
2
3
4
5
6
7
8
9
10
11
12
13
14
# Body measurements of adult females (NHANES 2020 subset); requires
# internet access to fetch the CSV.
body = pd.read_csv("https://raw.githubusercontent.com/gagolews/" +
"teaching-data/master/marek/nhanes_adult_female_bmx_2020.csv",
comment="#")
body.sample(5, random_state=123) # 5 random rows without replacement

x = body.head(10) # this is just an example
x

np.random.seed(123) # reproducibility matters
idx = np.random.permutation(x.shape[0])  # a random reordering of the row positions
idx

# An 80/20-style split: take the first 80% of the shuffled positions.
k = int(x.shape[0]*0.8) # 80% of the data
x.iloc[idx[:k], :] # 80% of the data

Hierarchical Indices

1
2
3
4
5
6
7
8
9
10
11
# A frame indexed by a two-level (hierarchical) key: (year, quarter).
np.random.seed(123)
d = pd.DataFrame(dict(
    year = [y for y in (2023, 2024, 2025) for _ in range(4)],  # each year four times
    quarter = ["Q1", "Q2", "Q3", "Q4"] * 3,
    data = np.round(np.random.rand(12), 2)
)).set_index(["year", "quarter"])
d

d.loc[2023, :]                            # all quarters of a single year
d.loc[[(2023, "Q1"), (2024, "Q3")], :]    # individual (year, quarter) pairs

Reshaping and Fusing Datasets

1
2
3
4
5
6
7
8
9
10
11
12
13
# Independent prelude for this section (it can be run on its own).
import numpy as np
import pandas as pd
pd.set_option("display.notebook_repr_html", False) # disable "rich" output

# 2018 mean air-quality readings per station; requires internet access.
air = pd.read_csv("https://raw.githubusercontent.com/gagolews/" +
"teaching-data/master/marek/air_quality_2018_means.csv",
comment="#")
# Keep only three parameters of interest; renumber the rows from 0.
air = (
air.
loc[air.param_id.isin(["BPM2.5", "NO2", "CO"]), :].
reset_index(drop=True)
)
air

Sorting

1
2
3
air.sort_values("value", ascending=False)  # largest mean readings first
# Sort by parameter name (A-Z) and, within each parameter, by value (largest first).
air.sort_values(["param_id", "value"], ascending=[True, False])

Stacking and Unstacking

1
2
3
# Long -> wide: one row per station (sp_name), one column per parameter.
air_wide = air.set_index(["sp_name", "param_id"]).unstack().loc[:, "value"]

# Wide -> long again: transpose, then stack the columns back into rows.
air_wide.T.rename_axis(index="location", columns="param").stack().rename("value").reset_index()

Set-Theoretic Operations

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
# Two lists of people sharing the Name column; requires internet access.
A = pd.read_csv("https://raw.githubusercontent.com/gagolews/" +
"teaching-data/master/marek/some_birth_dates1.csv",
comment="#")
A

B = pd.read_csv("https://raw.githubusercontent.com/gagolews/" +
"teaching-data/master/marek/some_birth_dates2.csv",
comment="#")
B

A.loc[A.Name.isin(B.Name), :]  # intersection: names present in both A and B

A.loc[~A.Name.isin(B.Name), :]  # difference: in A but not in B

# Union without duplicates: all of A plus the rows of B not already in A.
pd.concat((A, B.loc[~B.Name.isin(A.Name), :]))

Joining (Merging)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
# Metadata about the measured air-quality parameters; requires internet access.
param = pd.read_csv("https://raw.githubusercontent.com/gagolews/" +
"teaching-data/master/marek/air_quality_2018_param.csv",
comment="#")
# NOTE(review): rename returns a NEW frame that is merely displayed here;
# `param` itself keeps the original long column name.
param.rename(dict(param_std_unit_of_measure="unit"), axis=1)

# Two small frames sharing the key column x; some keys match, some do not.
A = pd.DataFrame({
    "x": ["a0", "a1", "a2", "a3"],
    "y": ["b0", "b1", "b2", "b3"]
})
A

B = pd.DataFrame({
    "x": ["a0", "a2", "a2", "a4"],
    "z": ["c0", "c1", "c2", "c3"]
})
B

A.merge(B, on="x")                  # inner join: keys present in both frames

A.merge(B, how="left", on="x")      # keep every row of A, fill gaps with NaN
A.merge(B, how="right", on="x")     # keep every row of B
A.merge(B, how="outer", on="x")     # keep the rows of both frames

Observation Grouping

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
# Prelude for the grouping section: add matplotlib + seaborn for plotting.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
pd.set_option("display.notebook_repr_html", False) # disable "rich" output
# The "seaborn" style sheet was deprecated in matplotlib 3.6 and removed
# in 3.8; "seaborn-v0_8" is the compatible replacement name.
plt.style.use("seaborn-v0_8")

# An urban-forest (tree inventory) data set; requires internet access.
trees = pd.read_csv("https://raw.githubusercontent.com/gagolews/" +
    "teaching-data/master/marek/urban_forest_subset2.csv",
    comment="#")
# Shorter column names; keep only the columns analysed below.
trees = trees.rename({
    "Common Name": "Species",
    "Diameter Breast Height": "Diameter",
    "Year Planted": "Year",
    "Located In": "Location",
    "Age Description": "Age",
}, axis=1).loc[:, ["Species", "Location", "Age", "Diameter", "Year"]]
trees.head()

Manual Splitting Into Subgroups

1
2
3
4
5
6
7
8
# np.split cuts an array at the given positions: [0:3], [3:7], [7:].
np.split(np.arange(10)*10, [3, 7])

# A stable sort keeps the original relative order within equal keys.
trees_srt = trees.sort_values("Location", kind="stable")

# Unique group labels plus the position where each group starts in the sorted frame.
levels, where = np.unique(trees_srt.loc[:, "Location"], return_index=True)
levels, where

# Split the sorted frame into one sub-frame per Location level.
trees_grp = np.split(trees_srt, where[1:])
1
2
3
4
5
6
7
8
9
10
11
12
13
14
# Preview the first and last row of every group. (Loop-body indentation was
# lost in the original listing and is restored here; zip pairs each level
# with its sub-frame directly.)
for level, grp in zip(levels, trees_grp):
    print(f"level='{level}'; preview:")
    print(grp.iloc[[0, -1], :])
    print("")

# Manual aggregation: build one dict of summary statistics per group.
trees_agg = [
    dict(
        level=t.loc[:, "Location"].iloc[0],   # the group's Location label
        diam_mean=np.mean(t.loc[:, "Diameter"]),
        year_mean=np.mean(t.loc[:, "Year"])
    )
    for t in trees_grp
]
trees_agg

The groupby Methods

1
2
3
4
5
# groupby returns lazy helper objects; nothing is computed until an
# aggregation/transformation method is called on them.
type(trees.groupby("Location"))
type(trees.groupby("Location")["Diameter"]) # or (...).Diameter

trees.groupby("Location").size()  # number of rows in each group
# Counts for every (Species, Location) pair, as an ordinary data frame.
trees.groupby(["Species", "Location"]).size().rename("Counts").reset_index()

Aggregating Data in Groups

1
2
3
# Per-group summaries; numeric_only skips the string columns.
trees.groupby("Location").mean(numeric_only=True).reset_index()
trees.groupby("Location")["Diameter"].describe().reset_index()
# Passing raw numpy functions (np.mean, np.median) to aggregate is deprecated
# in pandas >= 2.0; the equivalent string names are future-proof and produce
# the same result columns.
trees.groupby("Age")[["Diameter", "Year"]].aggregate(["mean", "median"]).reset_index()
1
2
3
4
5
6
7
8
# Custom aggregate: half the range, (max-min)/2, of each numeric column per Age.
(trees.loc[:, ["Age", "Diameter", "Year"]]
    .groupby("Age").aggregate(lambda x: (np.max(x)-np.min(x))/2)
    .reset_index())

# A named function labels the result column directly -- clearer than the
# lambda-plus-__name__ trick (PEP 8 discourages assigning lambdas).
# NOTE(review): the statistic computed is the half-range, although the
# original text calls it "midrange"; the name is kept for output compatibility.
def midrange(x):
    """Return half of the range of x, i.e. (max - min) / 2."""
    return (np.max(x) - np.min(x)) / 2

(trees.loc[:, ["Age", "Diameter", "Year"]]
    .groupby("Age").aggregate(["mean", midrange]).reset_index())

Transforming Data in Groups

1
2
3
4
5
6
7
def standardise(x):
    """Return the z-scores of x: subtract the mean, divide by the
    (population, ddof=0) standard deviation. Body indentation restored --
    it was lost in the original listing."""
    return (x-np.mean(x))/np.std(x)

# Group-wise standardisation: each diameter becomes a z-score relative to
# its own species.
trees["Diameter_Species_Std"] = (trees.loc[:, ["Species", "Diameter"]]
    .groupby("Species").transform(standardise))
# Sanity check: within each species the new column has mean ~0.
# String names "mean"/"std" replace np.mean/np.std, which are deprecated as
# aggregate arguments in pandas >= 2.0 (pandas already mapped np.std to its
# own ddof=1 "std", so the output is unchanged).
(trees.loc[:, ["Species", "Diameter", "Diameter_Species_Std"]]
    .groupby("Species").aggregate(["mean", "std"]))

Plotting Data in Groups with seaborn

1
2
3
4
5
6
7
8
9
10
# Distribution of diameters within each species, one box per species.
sns.boxplot(x="Diameter", y="Species", data=trees)
plt.show()

# Grouped bar plot: counts per species, one bar colour per location.
sns.barplot(
y="Counts",
x="Species",
hue="Location",
data=trees.groupby(["Species", "Location"]).size().rename("Counts").reset_index()
)
plt.show()

Database Access

1
2
3
4
5
6
7
# Prelude for the database section: sqlite3 from the standard library plus
# the data-analysis stack used earlier.
import sqlite3
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
pd.set_option("display.notebook_repr_html", False) # disable "rich" output
# The "seaborn" style sheet was deprecated in matplotlib 3.6 and removed
# in 3.8; "seaborn-v0_8" is the compatible replacement name.
plt.style.use("seaborn-v0_8")
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
# Tables from a Travel Stack Exchange dump (2017); requires internet access.
# Each is a gzip-compressed CSV read straight from the URL.
Tags = pd.read_csv("https://raw.githubusercontent.com/gagolews/" +
"teaching-data/master/travel_stackexchange_com_2017/Tags.csv.gz",
comment="#")
Tags.head(3)

Users = pd.read_csv("https://raw.githubusercontent.com/gagolews/" +
"teaching-data/master/travel_stackexchange_com_2017/Users.csv.gz",
comment="#")
Users.head(3)

Badges = pd.read_csv("https://raw.githubusercontent.com/gagolews/" +
"teaching-data/master/travel_stackexchange_com_2017/Badges.csv.gz",
comment="#")

Posts = pd.read_csv("https://raw.githubusercontent.com/gagolews/" +
"teaching-data/master/travel_stackexchange_com_2017/Posts.csv.gz",
comment="#")
Posts.head(3)

Votes = pd.read_csv("https://raw.githubusercontent.com/gagolews/" +
"teaching-data/master/travel_stackexchange_com_2017/Votes.csv.gz",
comment="#")
Votes.head(3)

Exporting Data

1
2
3
4
5
6
7
8
9
10
# Create a throw-away SQLite database file in a fresh temporary directory.
import tempfile, os.path
dbfile = os.path.join(tempfile.mkdtemp(), "travel.db")
print(dbfile)

# Export each data frame as a database table; index=False skips the row labels.
conn = sqlite3.connect(dbfile)
Tags.to_sql("Tags", conn, index=False)
Users.to_sql("Users", conn, index=False)
Badges.to_sql("Badges", conn, index=False)
Posts.to_sql("Posts", conn, index=False)
Votes.to_sql("Votes", conn, index=False)

Example SQL Queries

1
2
3
4
5
6
7
8
9
10
# A first query; LIMIT keeps the displayed result small.
pd.read_sql_query("""
SELECT * FROM Tags LIMIT 3
""", conn)

res1a = pd.read_sql_query("""
SELECT * FROM Tags LIMIT 3
""", conn)

# The data round-trips: the SQL result equals the head of the original frame.
res1b = Tags.head(3)
res1a.equals(res1b)

Filtering

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
# LIKE with % wildcards: tags whose name contains 'europe' anywhere.
pd.read_sql_query("""
SELECT * FROM Tags WHERE TagName LIKE '%europe%'
""", conn)

# IN tests membership in an explicit list of values.
pd.read_sql_query("""
SELECT TagName, Count
FROM Tags
WHERE TagName IN ('poland', 'australia', 'china')
""", conn)

# Compound condition; BETWEEN is inclusive at both ends.
pd.read_sql_query("""
SELECT Title, Score, ViewCount, FavoriteCount
FROM Posts
WHERE PostTypeId=1 AND
ViewCount>=10000 AND
FavoriteCount BETWEEN 35 AND 100
""", conn)

Ordering

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
# Top-scoring posts with a title; ORDER BY ... DESC sorts largest first.
pd.read_sql_query("""
SELECT Title, Score
FROM Posts
WHERE ParentId IS NULL AND Title IS NOT NULL
ORDER BY Score DESC
LIMIT 5
""", conn)

# DISTINCT removes duplicate rows from the result set.
pd.read_sql_query("""
SELECT DISTINCT Name
FROM Badges
WHERE UserId=23
""", conn)

# strftime('%Y', ...) extracts the year part of the date as text.
pd.read_sql_query("""
SELECT DISTINCT Name, strftime('%Y', Date) AS Year
FROM Badges
WHERE UserId=23
""", conn)

Grouping and Aggregating

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
# Aggregates over one user's badges, grouped by badge name.
# NOTE(review): MIN/AVG/MAX here operate on the text strftime produces;
# SQLite coerces it numerically for AVG -- confirm this is intended.
pd.read_sql_query("""
SELECT
Name,
COUNT(*) AS Count,
MIN(strftime('%Y', Date)) AS MinYear,
AVG(strftime('%Y', Date)) AS MeanYear,
MAX(strftime('%Y', Date)) AS MaxYear
FROM Badges
WHERE UserId=23
GROUP BY Name
ORDER BY Count DESC
LIMIT 4
""", conn)

# HAVING filters whole groups after aggregation (WHERE filters rows before).
pd.read_sql_query("""
SELECT
Name,
strftime('%Y', Date) AS Year,
COUNT(*) AS Count
FROM Badges
WHERE UserId=23
GROUP BY Name, Year
HAVING Count > 1
ORDER BY Count DESC
""", conn)

Joining

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
# Three-way join: tag metadata linked to the owner of each tag's wiki post.
pd.read_sql_query("""
SELECT Tags.TagName, Tags.Count, Posts.OwnerUserId,
Users.Age, Users.Location, Users.DisplayName
FROM Tags
JOIN Posts ON Posts.Id=Tags.WikiPostId
JOIN Users ON Users.AccountId=Posts.OwnerUserId
WHERE OwnerUserId != -1
ORDER BY Tags.Count DESC LIMIT 5
""", conn)

# The subquery counts upvotes (VoteTypeId=2) per post; the outer query
# attaches the post titles and keeps questions (PostTypeId=1) only.
pd.read_sql_query("""
SELECT UpVotesTab.*, Posts.Title FROM
(
SELECT PostId, COUNT(*) AS UpVotes
FROM Votes
WHERE VoteTypeId=2
GROUP BY PostId
) AS UpVotesTab
JOIN Posts ON UpVotesTab.PostId=Posts.Id
WHERE Posts.PostTypeId=1
ORDER BY UpVotesTab.UpVotes DESC LIMIT 5
""", conn)

Closing the Database Connection

1
conn.close()