0.1 Setting up environment
library(RSQLite)
library(DBI)
library(tidyverse)
library(nycflights13)
library(reticulate)
reticulate::use_condaenv("r-reticulate",required = TRUE)
np = reticulate::import('numpy')
pd = reticulate::import('pandas')
plt = reticulate::import('matplotlib.pyplot')
con <- dbConnect(RSQLite::SQLite(), ":memory:")
dbWriteTable(con, "flights", flights)
dbWriteTable(con, "airports", airports)
dbWriteTable(con, "planes", planes)
knitr::opts_chunk$set(connection = con)
glimpse(flights)
## Observations: 336,776
## Variables: 19
## $ year <int> 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, …
## $ month <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ day <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ dep_time <int> 517, 533, 542, 544, 554, 554, 555, 557, 557, 558, 558,…
## $ sched_dep_time <int> 515, 529, 540, 545, 600, 558, 600, 600, 600, 600, 600,…
## $ dep_delay <dbl> 2, 4, 2, -1, -6, -4, -5, -3, -3, -2, -2, -2, -2, -2, -…
## $ arr_time <int> 830, 850, 923, 1004, 812, 740, 913, 709, 838, 753, 849…
## $ sched_arr_time <int> 819, 830, 850, 1022, 837, 728, 854, 723, 846, 745, 851…
## $ arr_delay <dbl> 11, 20, 33, -18, -25, 12, 19, -14, -8, 8, -2, -3, 7, -…
## $ carrier <chr> "UA", "UA", "AA", "B6", "DL", "UA", "B6", "EV", "B6", …
## $ flight <int> 1545, 1714, 1141, 725, 461, 1696, 507, 5708, 79, 301, …
## $ tailnum <chr> "N14228", "N24211", "N619AA", "N804JB", "N668DN", "N39…
## $ origin <chr> "EWR", "LGA", "JFK", "JFK", "LGA", "EWR", "EWR", "LGA"…
## $ dest <chr> "IAH", "IAH", "MIA", "BQN", "ATL", "ORD", "FLL", "IAD"…
## $ air_time <dbl> 227, 227, 160, 183, 116, 150, 158, 53, 140, 138, 149, …
## $ distance <dbl> 1400, 1416, 1089, 1576, 762, 719, 1065, 229, 944, 733,…
## $ hour <dbl> 5, 5, 5, 5, 6, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 6, 6, …
## $ minute <dbl> 15, 29, 40, 45, 0, 58, 0, 0, 0, 0, 0, 0, 0, 0, 0, 59, …
## $ time_hour <dttm> 2013-01-01 05:00:00, 2013-01-01 05:00:00, 2013-01-01 …
import numpy as np
import pandas as pd
df = r.flights.copy()
0.2 Quick reference
Select columns |
select v1, v2 from df; |
select(df,v1,v2) |
df[[‘v1’,‘v2’]] |
Filter |
select v1,v2 from df where v1 = 1 and v2 = 2; |
filter(df,(v1 == 1 & v2 == 2)) |
df.query(‘v1 == 1 & v2 == 2’) |
Distinct rows |
select distinct v1 from df; |
unique(df$v1) |
df[[‘v1’]].drop_duplicates() |
Sampling |
select * from df sample 10; |
sample_n(df,10) |
df.sample(n=10) |
Sort |
select * from df order by v1, v2 desc; |
arrange(df,v1, desc(v2)) |
df.sort_values([‘v1’,‘v2’],ascending = [True,False]) |
Columns transaformation |
select v1 as col1, a+b as col2 from df |
df %>% rename(col1=v1) %>% mutate(col2=a+b) |
df.rename(colmns={‘v1’:col1}).assign(col2=df[‘a’]+df[‘b’]) |
Quick summary |
NA |
summary(df) |
df.describe() |
Group by |
select grp, avg(v1) as avg from df group by grp; |
df %>% group_by(grp) %>% summarise(avg=mean(v1,na.rm=TRUE)) |
df.groupby(‘grp’).agg({‘avg’:np.mean}) |
|
select grp1, grp2, count(*) from df group by grp; |
df %>% count(grp1,grp2) |
df.groupby(‘grp1’)[‘grp2’].value_counts() |
Pivot |
|
df %>% count(grp1,grp2) %>% spread(grp1,value=n,fill=0) |
df.groupby(‘grp1’)[‘grp2’].value_counts().unstack().fillna(0) |
|
|
|
|
1. SELECT
select distinct year,month,origin,dest,arr_delay
from flights
order by arr_delay desc,dest asc
limit 5;
Table 1: 5 records
2013 |
1 |
JFK |
HNL |
1272 |
2013 |
6 |
JFK |
CMH |
1127 |
2013 |
1 |
EWR |
ORD |
1109 |
2013 |
9 |
JFK |
SFO |
1007 |
2013 |
7 |
JFK |
CVG |
989 |
flights %>%
select(year,month,origin,dest,arr_delay) %>%
arrange(desc(arr_delay),dest) %>%
unique() %>%
head(5)
## # A tibble: 5 x 5
## year month origin dest arr_delay
## <int> <int> <chr> <chr> <dbl>
## 1 2013 1 JFK HNL 1272
## 2 2013 6 JFK CMH 1127
## 3 2013 1 EWR ORD 1109
## 4 2013 9 JFK SFO 1007
## 5 2013 7 JFK CVG 989
cols = ['year','month','origin','dest','arr_delay']
df[cols].sort_values(['arr_delay','dest'],ascending=[False,True]).drop_duplicates().head(5)
## year month origin dest arr_delay
## 7072 2013 1 JFK HNL 1272.0
## 235778 2013 6 JFK CMH 1127.0
## 8239 2013 1 EWR ORD 1109.0
## 327043 2013 9 JFK SFO 1007.0
## 270376 2013 7 JFK CVG 989.0
2. WHERE
select distinct year,month,origin,dest,arr_delay
from flights
where dest='IAH' and arr_delay >=300
order by arr_delay desc
limit 5;
Table 2: 5 records
2013 |
4 |
JFK |
IAH |
783 |
2013 |
4 |
LGA |
IAH |
435 |
2013 |
6 |
LGA |
IAH |
390 |
2013 |
7 |
EWR |
IAH |
374 |
2013 |
9 |
EWR |
IAH |
373 |
flights %>%
select(year,month,origin,dest,arr_delay) %>%
filter(dest=='IAH' & arr_delay >=300) %>%
arrange(desc(arr_delay)) %>%
unique() %>%
head(5)
## # A tibble: 5 x 5
## year month origin dest arr_delay
## <int> <int> <chr> <chr> <dbl>
## 1 2013 4 JFK IAH 783
## 2 2013 4 LGA IAH 435
## 3 2013 6 LGA IAH 390
## 4 2013 7 EWR IAH 374
## 5 2013 9 EWR IAH 373
filt = (df['dest']=='IAH') & (df['arr_delay']>=300)
df.loc[filt,cols].sort_values("arr_delay",ascending=False).head(5)
## year month origin dest arr_delay
## 182284 2013 4 JFK IAH 783.0
## 182153 2013 4 LGA IAH 435.0
## 247626 2013 6 LGA IAH 390.0
## 274063 2013 7 EWR IAH 374.0
## 319938 2013 9 EWR IAH 373.0
3. GROUPBY
select origin,dest,max(arr_delay),avg(arr_delay),count(*),count(distinct flight) as n_flights
from flights
where dest='IAH'
group by origin,dest;
Table 3: 3 records
EWR |
IAH |
374 |
5.406577 |
3973 |
453 |
JFK |
IAH |
783 |
17.132841 |
274 |
3 |
LGA |
IAH |
435 |
1.450363 |
2951 |
381 |
flights %>%
select(origin,dest,arr_delay,flight) %>%
filter(dest=='IAH') %>%
group_by(origin,dest) %>%
summarise(max_delay=max(arr_delay,na.rm = TRUE)
,mean=mean(arr_delay,na.rm = TRUE)
,count=n()
,n_flight=n_distinct(flight))
## # A tibble: 3 x 6
## # Groups: origin [3]
## origin dest max_delay mean count n_flight
## <chr> <chr> <dbl> <dbl> <int> <int>
## 1 EWR IAH 374 5.41 3973 453
## 2 JFK IAH 783 17.1 274 3
## 3 LGA IAH 435 1.45 2951 381
cols = ['origin','dest','arr_delay','dep_delay','flight']
filt = (df['dest']=='IAH')
grp = ['origin','dest']
df_summ = df.loc[filt,cols].groupby(grp) \
.agg({'arr_delay':[np.max], \
'dep_delay':[np.max,np.mean], \
'flight':pd.Series.nunique})
# flatten the multi-index column names
df_summ.columns = ['_'.join(tup).rstrip('_') for tup in df_summ.columns.values]
df_summ
## arr_delay_amax dep_delay_amax dep_delay_mean flight_nunique
## origin dest
## EWR IAH 374.0 397.0 11.826551 453
## JFK IAH 783.0 761.0 15.617647 3
## LGA IAH 435.0 427.0 9.058986 381
4. JOIN
select distinct origin,dest,arr_delay,a.tailnum,b.seats,c.name
from flights a
left join planes b on a.tailnum=b.tailnum
left join airports c on a.origin=c.faa
where dest='IAH' and arr_delay >=300
order by arr_delay desc
limit 5;
Table 4: 5 records
JFK |
IAH |
783 |
N3DGAA |
NA |
John F Kennedy Intl |
LGA |
IAH |
435 |
N811UA |
179 |
La Guardia |
LGA |
IAH |
390 |
N829UA |
179 |
La Guardia |
EWR |
IAH |
374 |
N569UA |
178 |
Newark Liberty Intl |
EWR |
IAH |
373 |
N465UA |
200 |
Newark Liberty Intl |
flights %>%
select(origin,dest,arr_delay,tailnum) %>%
filter(dest=='IAH') %>%
left_join(planes,by = c('tailnum'='tailnum')) %>%
left_join(airports,by = c('origin'='faa')) %>%
select(origin,dest,arr_delay,tailnum,seats,name) %>%
arrange(desc(arr_delay)) %>%
head(5)
## # A tibble: 5 x 6
## origin dest arr_delay tailnum seats name
## <chr> <chr> <dbl> <chr> <int> <chr>
## 1 JFK IAH 783 N3DGAA NA John F Kennedy Intl
## 2 LGA IAH 435 N811UA 179 La Guardia
## 3 LGA IAH 390 N829UA 179 La Guardia
## 4 EWR IAH 374 N569UA 178 Newark Liberty Intl
## 5 EWR IAH 373 N465UA 200 Newark Liberty Intl
cols = ['origin','dest','arr_delay','tailnum']
filt = (df['dest']=='IAH')
df_base = df.loc[filt,cols].sort_values("arr_delay",ascending=False).head(5)
df_planes = r.planes
df_airports = r.airports
cols2=['origin','dest','arr_delay','tailnum','seats','name']
df_base.merge(df_planes,how='left', on='tailnum') \
.merge(df_airports,how='left',left_on='origin',right_on='faa')[cols2]
## origin dest arr_delay tailnum seats name
## 0 JFK IAH 783.0 N3DGAA NaN John F Kennedy Intl
## 1 LGA IAH 435.0 N811UA 179.0 La Guardia
## 2 LGA IAH 390.0 N829UA 179.0 La Guardia
## 3 EWR IAH 374.0 N569UA 178.0 Newark Liberty Intl
## 4 EWR IAH 373.0 N465UA 200.0 Newark Liberty Intl
5. UNION
select tailnum,model,seats
from planes
where tailnum in ('N10156','N110UW')
union
select tailnum,model,seats
from planes
where tailnum in ('N281AT','N381AA')
Table 5: 4 records
N10156 |
EMB-145XR |
55 |
N110UW |
A320-214 |
182 |
N281AT |
A340-313 |
375 |
N381AA |
DC-7BF |
102 |
df1 <- planes %>% filter(tailnum %in% c('N10156','N110UW')) %>%
select(tailnum,model,seats)
df2 <- planes %>% filter(tailnum %in% c('N281AT','N381AA')) %>%
select(tailnum,model,seats)
df1 %>% union(df2)
## # A tibble: 4 x 3
## tailnum model seats
## <chr> <chr> <int>
## 1 N10156 EMB-145XR 55
## 2 N110UW A320-214 182
## 3 N281AT A340-313 375
## 4 N381AA DC-7BF 102
cols=['tailnum','model','seats']
filt = df_planes['tailnum'].isin(['N10156','N110UW'])
filt2 = df_planes['tailnum'].isin(['N281AT','N381AA'])
pd.concat([df_planes.loc[filt,cols],df_planes.loc[filt2,cols]])
## tailnum model seats
## 0 N10156 EMB-145XR 55
## 9 N110UW A320-214 182
## 603 N281AT A340-313 375
## 1037 N381AA DC-7BF 102
6. Analytics and aggregate functions
6.1 counts and percentage
select origin,dest,count(*) as cnt
,100 * count(*)/sum(count(*)) over() as prop
from flights
where dest='IAH'
group by origin,dest;
Table 6: 3 records
EWR |
IAH |
3973 |
55 |
JFK |
IAH |
274 |
3 |
LGA |
IAH |
2951 |
40 |
flights %>%
select(origin,dest) %>%
filter(dest=='IAH') %>%
count(origin,dest) %>%
mutate(prop=100*round(prop.table(n),2))
## # A tibble: 3 x 4
## origin dest n prop
## <chr> <chr> <int> <dbl>
## 1 EWR IAH 3973 55.
## 2 JFK IAH 274 4
## 3 LGA IAH 2951 41
cols = ['origin','dest','flight']
filt = (df['dest']=='IAH')
grp = ['origin','dest']
df_p = df.loc[filt,cols].groupby(grp).agg({'flight':np.size}).reset_index().rename(columns={'flight':'cnt'})
df_p['prop'] = 100 * df_p['cnt']/df_p['cnt'].sum()
print(df_p)
## origin dest cnt prop
## 0 EWR IAH 3973 55.195888
## 1 JFK IAH 274 3.806613
## 2 LGA IAH 2951 40.997499
6.2 Top N rows per group
select *
from
(
select origin
,dest
,arr_delay
,ROW_NUMBER () OVER (PARTITION BY origin,dest ORDER BY arr_delay DESC) as rank_delay
from flights
) a
where a.rank_delay = 1
limit 10;
Table 7: Displaying records 1 - 10
EWR |
ALB |
328 |
1 |
EWR |
ANC |
39 |
1 |
EWR |
ATL |
796 |
1 |
EWR |
AUS |
349 |
1 |
EWR |
AVL |
228 |
1 |
EWR |
BDL |
266 |
1 |
EWR |
BNA |
364 |
1 |
EWR |
BOS |
422 |
1 |
EWR |
BQN |
208 |
1 |
EWR |
BTV |
306 |
1 |
flights %>%
group_by(origin,dest) %>%
mutate(rank_delay = dplyr::dense_rank(desc(arr_delay))) %>%
select(origin,dest,arr_delay,rank_delay) %>%
arrange(origin,dest,rank_delay) %>%
filter(rank_delay==1) %>%
head(10)
## # A tibble: 10 x 4
## # Groups: origin, dest [10]
## origin dest arr_delay rank_delay
## <chr> <chr> <dbl> <int>
## 1 EWR ALB 328 1
## 2 EWR ANC 39 1
## 3 EWR ATL 796 1
## 4 EWR AUS 349 1
## 5 EWR AVL 228 1
## 6 EWR BDL 266 1
## 7 EWR BNA 364 1
## 8 EWR BOS 422 1
## 9 EWR BQN 208 1
## 10 EWR BTV 306 1
cols = ['origin','dest','arr_delay','rank_delay']
df.assign(rank_delay = df.sort_values(['arr_delay'], ascending=False) \
.groupby(['origin','dest']) \
.cumcount()+1) \
.query('rank_delay == 1')[cols] \
.sort_values(['origin','dest']).head(10)
## origin dest arr_delay rank_delay
## 20941 EWR ALB 328.0 1
## 295953 EWR ANC 39.0 1
## 57582 EWR ATL 796.0 1
## 259427 EWR AUS 349.0 1
## 291697 EWR AVL 228.0 1
## 129590 EWR BDL 266.0 1
## 21727 EWR BNA 364.0 1
## 143327 EWR BOS 422.0 1
## 233374 EWR BQN 208.0 1
## 287238 EWR BTV 306.0 1
cols = ['origin','dest','arr_delay','rank_delay']
df.assign(rank_delay=df.groupby(['origin','dest'])['arr_delay'] \
.rank(method='first',ascending=False)) \
.query('rank_delay == 1')[cols] \
.sort_values(['origin','dest']).head(10)
## origin dest arr_delay rank_delay
## 20941 EWR ALB 328.0 1.0
## 295953 EWR ANC 39.0 1.0
## 57582 EWR ATL 796.0 1.0
## 259427 EWR AUS 349.0 1.0
## 291697 EWR AVL 228.0 1.0
## 129590 EWR BDL 266.0 1.0
## 21727 EWR BNA 364.0 1.0
## 143327 EWR BOS 422.0 1.0
## 233374 EWR BQN 208.0 1.0
## 287238 EWR BTV 306.0 1.0
6.3 Find duplicates
select origin,dest,count(*) as cnt
from flights
group by origin,dest
having count(*) >1
limit 5
;
Table 8: 5 records
EWR |
ALB |
439 |
EWR |
ANC |
8 |
EWR |
ATL |
5022 |
EWR |
AUS |
968 |
EWR |
AVL |
265 |
flights %>%
count(origin,dest) %>%
filter(n>1) %>%
head()
## # A tibble: 6 x 3
## origin dest n
## <chr> <chr> <int>
## 1 EWR ALB 439
## 2 EWR ANC 8
## 3 EWR ATL 5022
## 4 EWR AUS 968
## 5 EWR AVL 265
## 6 EWR BDL 443
cols = ['origin','dest']
df.groupby(cols).size().head()
## origin dest
## EWR ALB 439
## ANC 8
## ATL 5022
## AUS 968
## AVL 265
## dtype: int64