# Build a small reproducible data frame: two rounded numeric columns
# (uniform and normal draws) plus one string column, with letter labels.
np.random.seed(123)
d = pd.DataFrame(
    {
        "u": np.round(np.random.rand(5), 2),   # uniforms on [0, 1)
        "v": np.round(np.random.randn(5), 2),  # standard normals
        "w": ["spam", "bacon", "spam", "eggs", "sausage"],
    },
    index=list("abcde"),
)
d
# Aggregation examples on the sample frame `d` (defined earlier).
# Quantiles (minimum, median, maximum) of column u.
np.quantile(d.loc[:, "u"], [0, 0.5, 1])
# Same quantiles, but snapped to the nearest actual data point.
# FIX: the `interpolation=` keyword was deprecated in NumPy 1.22 and
# removed in NumPy 2.0; `method=` is the current name of this parameter.
np.quantile(d.loc[:, "u"], [0, 0.5, 1], method="nearest")
# Column-wise quantiles of u and v (axis=0 aggregates down the rows).
np.quantile(d.loc[:, ["u", "v"]], [0, 0.5, 1], axis=0)
# Column-wise means of u and v.
np.mean(d.loc[:, ["u", "v"]], axis=0)
# Means of numeric columns only (the string column w would otherwise fail).
d.loc[:, ["u", "v"]].mean(numeric_only=True)
# Summary statistics (count, mean, std, quartiles) of the numeric columns.
d.describe()
Transforming
1 2 3 4 5 6 7 8 9 10 11 12 13
# Elementwise transformations and standardisation on the sample frame `d`.
np.exp(d.loc[:, "u"])          # e**u applied elementwise -> Series
np.exp(d.loc[:, ["u", "v"]])   # same, over a two-column data frame
# Standardise u: subtract the mean, divide by the (population) std.
u = d.loc[:, "u"]
(u - np.mean(u)) / np.std(u)
# Elementwise comparison of two columns -> boolean Series.
d.loc[:, "u"] > d.loc[:, "v"]
# Standardise u and v jointly by working on the underlying NumPy array.
uv = d.loc[:, ["u", "v"]].to_numpy()
uv = (uv - np.mean(uv, axis=0)) / np.std(uv, axis=0)
uv
Filtering
Series
1 2 3 4 5 6 7 8
# A Series of ten rounded uniform draws, labelled by a shuffled 0..9 index.
np.random.seed(123)
b = pd.Series(np.round(np.random.uniform(0, 1, 10), 2))
b.index = np.random.permutation(np.arange(10))
b
# The same values, re-labelled with consecutive letters.
c = b.copy()
c.index = list("abcdefghij")
c
DataFrame
1 2 3 4 5 6 7 8 9
# Rebuild the sample frame, now with an extra boolean column x.
np.random.seed(123)
d = pd.DataFrame(
    {
        "u": np.round(np.random.rand(5), 2),
        "v": np.round(np.random.randn(5), 2),
        "w": ["spam", "bacon", "spam", "eggs", "sausage"],
        "x": [True, False, True, False, True],
    },
    index=list("abcde"),
)
# Boolean-mask row selection combined with a label slice of columns:
# rows where u > 0.5, columns from u through w (inclusive).
mask = d["u"] > 0.5
d.loc[mask, "u":"w"]
Adding Rows and Columns
1 2
# Add a derived column in place: y holds u squared.
d.loc[:, "y"] = d.loc[:, "u"] ** 2
d  # preview the extended frame
Random Sampling
1 2 3 4 5 6 7 8 9 10 11 12 13 14
# Fetch the NHANES adult-female body-measurements data set;
# lines starting with '#' in the CSV are metadata, hence comment="#".
body = pd.read_csv(
    "https://raw.githubusercontent.com/gagolews/"
    "teaching-data/master/marek/nhanes_adult_female_bmx_2020.csv",
    comment="#",
)
# Draw 5 random rows without replacement (fixed seed for reproducibility).
body.sample(5, random_state=123)
# Environment setup for the examples that follow.
import numpy as np
import pandas as pd

# Render data frames as plain text rather than HTML tables.
pd.set_option("display.notebook_repr_html", False)
# Fetch the 2018 air-quality yearly means and keep only three pollutants.
air = pd.read_csv(
    "https://raw.githubusercontent.com/gagolews/"
    "teaching-data/master/marek/air_quality_2018_means.csv",
    comment="#",
)
keep = air["param_id"].isin(["BPM2.5", "NO2", "CO"])
air = air.loc[keep, :].reset_index(drop=True)  # renumber rows from 0
air
# SQL row-filtering examples, executed through pandas.
# NOTE(review): `conn` (an open database connection) is created elsewhere
# in the document — confirm it is in scope before running these.
# Tags whose name contains the substring "europe".
pd.read_sql_query(""" SELECT * FROM Tags WHERE TagName LIKE '%europe%' """, conn)
# Look up a fixed set of tags by exact name with IN.
pd.read_sql_query(""" SELECT TagName, Count FROM Tags WHERE TagName IN ('poland', 'australia', 'china') """, conn)
# Heavily viewed, frequently favourited posts; BETWEEN is inclusive on both
# ends.  PostTypeId=1 presumably selects questions — verify against the schema.
pd.read_sql_query(""" SELECT Title, Score, ViewCount, FavoriteCount FROM Posts WHERE PostTypeId=1 AND ViewCount>=10000 AND FavoriteCount BETWEEN 35 AND 100 """, conn)
Ordering
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19
# SQL ordering, grouping, and joining examples, executed through pandas.
# NOTE(review): `conn` (an open database connection) is created elsewhere
# in the document — confirm it is in scope before running these.
# Five highest-scoring titled posts with no parent (ParentId IS NULL).
pd.read_sql_query(""" SELECT Title, Score FROM Posts WHERE ParentId IS NULL AND Title IS NOT NULL ORDER BY Score DESC LIMIT 5 """, conn)
# Distinct badge names earned by user 23.
pd.read_sql_query(""" SELECT DISTINCT Name FROM Badges WHERE UserId=23 """, conn)
# Distinct (badge name, year) pairs; strftime('%Y', ...) extracts the year.
pd.read_sql_query(""" SELECT DISTINCT Name, strftime('%Y', Date) AS Year FROM Badges WHERE UserId=23 """, conn)
# Per-badge counts plus earliest/average/latest award year, top 4 by count.
pd.read_sql_query(""" SELECT Name, COUNT(*) AS Count, MIN(strftime('%Y', Date)) AS MinYear, AVG(strftime('%Y', Date)) AS MeanYear, MAX(strftime('%Y', Date)) AS MaxYear FROM Badges WHERE UserId=23 GROUP BY Name ORDER BY Count DESC LIMIT 4 """, conn)
# HAVING filters groups after aggregation: badges awarded more than once
# within a single year.
pd.read_sql_query(""" SELECT Name, strftime('%Y', Date) AS Year, COUNT(*) AS Count FROM Badges WHERE UserId=23 GROUP BY Name, Year HAVING Count > 1 ORDER BY Count DESC """, conn)
# Three-table join: tags -> their wiki posts -> those posts' owners;
# five most frequently used tags with a valid owner.
pd.read_sql_query(""" SELECT Tags.TagName, Tags.Count, Posts.OwnerUserId, Users.Age, Users.Location, Users.DisplayName FROM Tags JOIN Posts ON Posts.Id=Tags.WikiPostId JOIN Users ON Users.AccountId=Posts.OwnerUserId WHERE OwnerUserId != -1 ORDER BY Tags.Count DESC LIMIT 5 """, conn)
# Subquery counts upvotes (VoteTypeId=2, aliased UpVotes) per post, then
# joins in the post titles; five most-upvoted posts of type 1.
pd.read_sql_query(""" SELECT UpVotesTab.*, Posts.Title FROM ( SELECT PostId, COUNT(*) AS UpVotes FROM Votes WHERE VoteTypeId=2 GROUP BY PostId ) AS UpVotesTab JOIN Posts ON UpVotesTab.PostId=Posts.Id WHERE Posts.PostTypeId=1 ORDER BY UpVotesTab.UpVotes DESC LIMIT 5 """, conn)