0.1 Setting up environment
library(RSQLite)
library(DBI)
library(tidyverse)
library(nycflights13)
library(reticulate)
reticulate::use_condaenv("r-reticulate",required = TRUE)
np = reticulate::import('numpy')
pd = reticulate::import('pandas')
plt = reticulate::import('matplotlib.pyplot')
con <- dbConnect(RSQLite::SQLite(), ":memory:")
dbWriteTable(con, "flights", flights)
dbWriteTable(con, "airports", airports)
dbWriteTable(con, "planes", planes)
knitr::opts_chunk$set(connection = con)
glimpse(flights)
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
import numpy as np
import pandas as pd
df = r.flights.copy()
0.2 Quick reference
Select columns |
select v1, v2 from df; |
select(df,v1,v2) |
df[[‘v1’,‘v2’]] |
Filter |
select v1,v2 from df where v1 = 1 and v2 = 2; |
filter(df,(v1 == 1 & v2 == 2)) |
df.query(‘v1 == 1 & v2 == 2’) |
Distinct rows |
select distinct v1 from df; |
unique(df$v1) |
df[[‘v1’]].drop_duplicates() |
Sampling |
select * from df sample 10; |
sample_n(df,10) |
df.sample(n=10) |
Sort |
select * from df order by v1, v2 desc; |
arrange(df,v1, desc(v2)) |
df.sort_values([‘v1’,‘v2’],ascending = [True,False]) |
Columns transaformation |
select v1 as col1, a+b as col2 from df |
df %>% rename(col1=v1) %>% mutate(col2=a+b) |
df.rename(colmns={‘v1’:col1}).assign(col2=df[‘a’]+df[‘b’]) |
Quick summary |
NA |
summary(df) |
df.describe() |
Group by |
select grp, avg(v1) as avg from df group by grp; |
df %>% group_by(grp) %>% summarise(avg=mean(v1,na.rm=TRUE)) |
df.groupby(‘grp’).agg({‘avg’:np.mean}) |
|
select grp1, grp2, count(*) from df group by grp; |
df %>% count(grp1,grp2) |
df.groupby(‘grp1’)[‘grp2’].value_counts() |
Pivot |
|
df %>% count(grp1,grp2) %>% spread(grp1,value=n,fill=0) |
df.groupby(‘grp1’)[‘grp2’].value_counts().unstack().fillna(0) |
|
|
|
|
1. SELECT
select distinct year,month,origin,dest,arr_delay
from flights
order by arr_delay desc,dest asc
limit 5;
Table 1: 5 records
2013 |
1 |
JFK |
HNL |
1272 |
2013 |
6 |
JFK |
CMH |
1127 |
2013 |
1 |
EWR |
ORD |
1109 |
2013 |
9 |
JFK |
SFO |
1007 |
2013 |
7 |
JFK |
CVG |
989 |
flights %>%
select(year,month,origin,dest,arr_delay) %>%
arrange(desc(arr_delay),dest) %>%
unique() %>%
head(5)
#
#
#
#
#
#
#
#
cols = ['year','month','origin','dest','arr_delay']
df[cols].sort_values(['arr_delay','dest'],ascending=[False,True]).drop_duplicates().head(5)
#
#
#
#
#
#
2. WHERE
select distinct year,month,origin,dest,arr_delay
from flights
where dest='IAH' and arr_delay >=300
order by arr_delay desc
limit 5;
Table 2: 5 records
2013 |
4 |
JFK |
IAH |
783 |
2013 |
4 |
LGA |
IAH |
435 |
2013 |
6 |
LGA |
IAH |
390 |
2013 |
7 |
EWR |
IAH |
374 |
2013 |
9 |
EWR |
IAH |
373 |
flights %>%
select(year,month,origin,dest,arr_delay) %>%
filter(dest=='IAH' & arr_delay >=300) %>%
arrange(desc(arr_delay)) %>%
unique() %>%
head(5)
#
#
#
#
#
#
#
#
filt = (df['dest']=='IAH') & (df['arr_delay']>=300)
df.loc[filt,cols].sort_values("arr_delay",ascending=False).head(5)
#
#
#
#
#
#
3. GROUPBY
select origin,dest,max(arr_delay),avg(arr_delay),count(*),count(distinct flight) as n_flights
from flights
where dest='IAH'
group by origin,dest;
Table 3: 3 records
EWR |
IAH |
374 |
5.406577 |
3973 |
453 |
JFK |
IAH |
783 |
17.132841 |
274 |
3 |
LGA |
IAH |
435 |
1.450363 |
2951 |
381 |
flights %>%
select(origin,dest,arr_delay,flight) %>%
filter(dest=='IAH') %>%
group_by(origin,dest) %>%
summarise(max_delay=max(arr_delay,na.rm = TRUE)
,mean=mean(arr_delay,na.rm = TRUE)
,count=n()
,n_flight=n_distinct(flight))
#
#
#
#
#
#
#
cols = ['origin','dest','arr_delay','dep_delay','flight']
filt = (df['dest']=='IAH')
grp = ['origin','dest']
df_summ = df.loc[filt,cols].groupby(grp) \
.agg({'arr_delay':[np.max], \
'dep_delay':[np.max,np.mean], \
'flight':pd.Series.nunique})
df_summ.columns = ['_'.join(tup).rstrip('_') for tup in df_summ.columns.values]
df_summ
#
#
#
#
#
4. JOIN
select distinct origin,dest,arr_delay,a.tailnum,b.seats,c.name
from flights a
left join planes b on a.tailnum=b.tailnum
left join airports c on a.origin=c.faa
where dest='IAH' and arr_delay >=300
order by arr_delay desc
limit 5;
Table 4: 5 records
JFK |
IAH |
783 |
N3DGAA |
NA |
John F Kennedy Intl |
LGA |
IAH |
435 |
N811UA |
179 |
La Guardia |
LGA |
IAH |
390 |
N829UA |
179 |
La Guardia |
EWR |
IAH |
374 |
N569UA |
178 |
Newark Liberty Intl |
EWR |
IAH |
373 |
N465UA |
200 |
Newark Liberty Intl |
flights %>%
select(origin,dest,arr_delay,tailnum) %>%
filter(dest=='IAH') %>%
left_join(planes,by = c('tailnum'='tailnum')) %>%
left_join(airports,by = c('origin'='faa')) %>%
select(origin,dest,arr_delay,tailnum,seats,name) %>%
arrange(desc(arr_delay)) %>%
head(5)
#
#
#
#
#
#
#
#
cols = ['origin','dest','arr_delay','tailnum']
filt = (df['dest']=='IAH')
df_base = df.loc[filt,cols].sort_values("arr_delay",ascending=False).head(5)
df_planes = r.planes
df_airports = r.airports
cols2=['origin','dest','arr_delay','tailnum','seats','name']
df_base.merge(df_planes,how='left', on='tailnum') \
.merge(df_airports,how='left',left_on='origin',right_on='faa')[cols2]
#
#
#
#
#
#
5. UNION
select tailnum,model,seats
from planes
where tailnum in ('N10156','N110UW')
union
select tailnum,model,seats
from planes
where tailnum in ('N281AT','N381AA')
Table 5: 4 records
N10156 |
EMB-145XR |
55 |
N110UW |
A320-214 |
182 |
N281AT |
A340-313 |
375 |
N381AA |
DC-7BF |
102 |
df1 <- planes %>% filter(tailnum %in% c('N10156','N110UW')) %>%
select(tailnum,model,seats)
df2 <- planes %>% filter(tailnum %in% c('N281AT','N381AA')) %>%
select(tailnum,model,seats)
df1 %>% union(df2)
#
#
#
#
#
#
#
cols=['tailnum','model','seats']
filt = df_planes['tailnum'].isin(['N10156','N110UW'])
filt2 = df_planes['tailnum'].isin(['N281AT','N381AA'])
pd.concat([df_planes.loc[filt,cols],df_planes.loc[filt2,cols]])
#
#
#
#
#
6. Analytics and aggregate functions
6.1 counts and percentage
select origin,dest,count(*) as cnt
,100 * count(*)/sum(count(*)) over() as prop
from flights
where dest='IAH'
group by origin,dest;
Table 6: 3 records
EWR |
IAH |
3973 |
55 |
JFK |
IAH |
274 |
3 |
LGA |
IAH |
2951 |
40 |
flights %>%
select(origin,dest) %>%
filter(dest=='IAH') %>%
count(origin,dest) %>%
mutate(prop=100*round(prop.table(n),2))
#
#
#
#
#
#
cols = ['origin','dest','flight']
filt = (df['dest']=='IAH')
grp = ['origin','dest']
df_p = df.loc[filt,cols].groupby(grp).agg({'flight':np.size}).reset_index().rename(columns={'flight':'cnt'})
df_p['prop'] = 100 * df_p['cnt']/df_p['cnt'].sum()
print(df_p)
#
#
#
#
6.2 Top N rows per group
select *
from
(
select origin
,dest
,arr_delay
,ROW_NUMBER () OVER (PARTITION BY origin,dest ORDER BY arr_delay DESC) as rank_delay
from flights
) a
where a.rank_delay = 1
limit 10;
Table 7: Displaying records 1 - 10
EWR |
ALB |
328 |
1 |
EWR |
ANC |
39 |
1 |
EWR |
ATL |
796 |
1 |
EWR |
AUS |
349 |
1 |
EWR |
AVL |
228 |
1 |
EWR |
BDL |
266 |
1 |
EWR |
BNA |
364 |
1 |
EWR |
BOS |
422 |
1 |
EWR |
BQN |
208 |
1 |
EWR |
BTV |
306 |
1 |
flights %>%
group_by(origin,dest) %>%
mutate(rank_delay = dplyr::dense_rank(desc(arr_delay))) %>%
select(origin,dest,arr_delay,rank_delay) %>%
arrange(origin,dest,rank_delay) %>%
filter(rank_delay==1) %>%
head(10)
#
#
#
#
#
#
#
#
#
#
#
#
#
#
cols = ['origin','dest','arr_delay','rank_delay']
df.assign(rank_delay = df.sort_values(['arr_delay'], ascending=False) \
.groupby(['origin','dest']) \
.cumcount()+1) \
.query('rank_delay == 1')[cols] \
.sort_values(['origin','dest']).head(10)
#
#
#
#
#
#
#
#
#
#
#
cols = ['origin','dest','arr_delay','rank_delay']
df.assign(rank_delay=df.groupby(['origin','dest'])['arr_delay'] \
.rank(method='first',ascending=False)) \
.query('rank_delay == 1')[cols] \
.sort_values(['origin','dest']).head(10)
#
#
#
#
#
#
#
#
#
#
#
6.3 Find duplicates
select origin,dest,count(*) as cnt
from flights
group by origin,dest
having count(*) >1
limit 5
;
Table 8: 5 records
EWR |
ALB |
439 |
EWR |
ANC |
8 |
EWR |
ATL |
5022 |
EWR |
AUS |
968 |
EWR |
AVL |
265 |
flights %>%
count(origin,dest) %>%
filter(n>1) %>%
head()
#
#
#
#
#
#
#
#
#
cols = ['origin','dest']
df.groupby(cols).size().head()
#
#
#
#
#
#
#