SQL R and Python Data manipulation

0.1 Setting up environment

library(RSQLite)
library(DBI)
library(tidyverse)
library(nycflights13)
library(reticulate)

reticulate::use_condaenv("r-reticulate",required = TRUE)
np = reticulate::import('numpy')
pd = reticulate::import('pandas')
plt = reticulate::import('matplotlib.pyplot')

con <- dbConnect(RSQLite::SQLite(), ":memory:")
dbWriteTable(con, "flights", flights)
dbWriteTable(con, "airports", airports)
dbWriteTable(con, "planes", planes)

knitr::opts_chunk$set(connection = con)

glimpse(flights)
## Observations: 336,776
## Variables: 19
## $ year           <int> 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, …
## $ month          <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ day            <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ dep_time       <int> 517, 533, 542, 544, 554, 554, 555, 557, 557, 558, 558,…
## $ sched_dep_time <int> 515, 529, 540, 545, 600, 558, 600, 600, 600, 600, 600,…
## $ dep_delay      <dbl> 2, 4, 2, -1, -6, -4, -5, -3, -3, -2, -2, -2, -2, -2, -…
## $ arr_time       <int> 830, 850, 923, 1004, 812, 740, 913, 709, 838, 753, 849…
## $ sched_arr_time <int> 819, 830, 850, 1022, 837, 728, 854, 723, 846, 745, 851…
## $ arr_delay      <dbl> 11, 20, 33, -18, -25, 12, 19, -14, -8, 8, -2, -3, 7, -…
## $ carrier        <chr> "UA", "UA", "AA", "B6", "DL", "UA", "B6", "EV", "B6", …
## $ flight         <int> 1545, 1714, 1141, 725, 461, 1696, 507, 5708, 79, 301, …
## $ tailnum        <chr> "N14228", "N24211", "N619AA", "N804JB", "N668DN", "N39…
## $ origin         <chr> "EWR", "LGA", "JFK", "JFK", "LGA", "EWR", "EWR", "LGA"…
## $ dest           <chr> "IAH", "IAH", "MIA", "BQN", "ATL", "ORD", "FLL", "IAD"…
## $ air_time       <dbl> 227, 227, 160, 183, 116, 150, 158, 53, 140, 138, 149, …
## $ distance       <dbl> 1400, 1416, 1089, 1576, 762, 719, 1065, 229, 944, 733,…
## $ hour           <dbl> 5, 5, 5, 5, 6, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 6, 6, …
## $ minute         <dbl> 15, 29, 40, 45, 0, 58, 0, 0, 0, 0, 0, 0, 0, 0, 0, 59, …
## $ time_hour      <dttm> 2013-01-01 05:00:00, 2013-01-01 05:00:00, 2013-01-01 …
import numpy as np
import pandas as pd

df = r.flights.copy()

0.2 Quick reference

SQL R Python
Select columns select v1, v2 from df; select(df,v1,v2) df[[‘v1’,‘v2’]]
Filter select v1,v2 from df where v1 = 1 and v2 = 2; filter(df,(v1 == 1 & v2 == 2)) df.query(‘v1 == 1 & v2 == 2’)
Distinct rows select distinct v1 from df; unique(df$v1) df[[‘v1’]].drop_duplicates()
Sampling select * from df sample 10; sample_n(df,10) df.sample(n=10)
Sort select * from df order by v1, v2 desc; arrange(df,v1, desc(v2)) df.sort_values([‘v1’,‘v2’],ascending = [True,False])
Columns transaformation select v1 as col1, a+b as col2 from df df %>% rename(col1=v1) %>% mutate(col2=a+b) df.rename(colmns={‘v1’:col1}).assign(col2=df[‘a’]+df[‘b’])
Quick summary NA summary(df) df.describe()
Group by select grp, avg(v1) as avg from df group by grp; df %>% group_by(grp) %>% summarise(avg=mean(v1,na.rm=TRUE)) df.groupby(‘grp’).agg({‘avg’:np.mean})
select grp1, grp2, count(*) from df group by grp; df %>% count(grp1,grp2) df.groupby(‘grp1’)[‘grp2’].value_counts()
Pivot df %>% count(grp1,grp2) %>% spread(grp1,value=n,fill=0) df.groupby(‘grp1’)[‘grp2’].value_counts().unstack().fillna(0)

1. SELECT

select distinct year,month,origin,dest,arr_delay
from flights 
order by arr_delay desc,dest asc
limit 5;
Table 1: 5 records
year month origin dest arr_delay
2013 1 JFK HNL 1272
2013 6 JFK CMH 1127
2013 1 EWR ORD 1109
2013 9 JFK SFO 1007
2013 7 JFK CVG 989
flights %>% 
  select(year,month,origin,dest,arr_delay) %>% 
  arrange(desc(arr_delay),dest) %>%
  unique() %>% 
  head(5)
## # A tibble: 5 x 5
##    year month origin dest  arr_delay
##   <int> <int> <chr>  <chr>     <dbl>
## 1  2013     1 JFK    HNL        1272
## 2  2013     6 JFK    CMH        1127
## 3  2013     1 EWR    ORD        1109
## 4  2013     9 JFK    SFO        1007
## 5  2013     7 JFK    CVG         989

cols = ['year','month','origin','dest','arr_delay']
df[cols].sort_values(['arr_delay','dest'],ascending=[False,True]).drop_duplicates().head(5)
##         year  month origin dest  arr_delay
## 7072    2013      1    JFK  HNL     1272.0
## 235778  2013      6    JFK  CMH     1127.0
## 8239    2013      1    EWR  ORD     1109.0
## 327043  2013      9    JFK  SFO     1007.0
## 270376  2013      7    JFK  CVG      989.0

2. WHERE

select distinct year,month,origin,dest,arr_delay
from flights 
where dest='IAH' and arr_delay >=300
order by arr_delay desc
limit 5;
Table 2: 5 records
year month origin dest arr_delay
2013 4 JFK IAH 783
2013 4 LGA IAH 435
2013 6 LGA IAH 390
2013 7 EWR IAH 374
2013 9 EWR IAH 373
flights %>% 
  select(year,month,origin,dest,arr_delay) %>% 
  filter(dest=='IAH' & arr_delay >=300) %>%
  arrange(desc(arr_delay)) %>%
  unique() %>% 
  head(5)
## # A tibble: 5 x 5
##    year month origin dest  arr_delay
##   <int> <int> <chr>  <chr>     <dbl>
## 1  2013     4 JFK    IAH         783
## 2  2013     4 LGA    IAH         435
## 3  2013     6 LGA    IAH         390
## 4  2013     7 EWR    IAH         374
## 5  2013     9 EWR    IAH         373

filt = (df['dest']=='IAH') & (df['arr_delay']>=300)
df.loc[filt,cols].sort_values("arr_delay",ascending=False).head(5)
##         year  month origin dest  arr_delay
## 182284  2013      4    JFK  IAH      783.0
## 182153  2013      4    LGA  IAH      435.0
## 247626  2013      6    LGA  IAH      390.0
## 274063  2013      7    EWR  IAH      374.0
## 319938  2013      9    EWR  IAH      373.0

3. GROUPBY


select origin,dest,max(arr_delay),avg(arr_delay),count(*),count(distinct flight) as n_flights
from flights 
where dest='IAH'
group by origin,dest;
Table 3: 3 records
origin dest max(arr_delay) avg(arr_delay) count(*) n_flights
EWR IAH 374 5.406577 3973 453
JFK IAH 783 17.132841 274 3
LGA IAH 435 1.450363 2951 381
flights %>% 
  select(origin,dest,arr_delay,flight) %>% 
  filter(dest=='IAH') %>%
  group_by(origin,dest) %>%
  summarise(max_delay=max(arr_delay,na.rm = TRUE)
            ,mean=mean(arr_delay,na.rm = TRUE)
            ,count=n()
            ,n_flight=n_distinct(flight))
## # A tibble: 3 x 6
## # Groups:   origin [3]
##   origin dest  max_delay  mean count n_flight
##   <chr>  <chr>     <dbl> <dbl> <int>    <int>
## 1 EWR    IAH         374  5.41  3973      453
## 2 JFK    IAH         783 17.1    274        3
## 3 LGA    IAH         435  1.45  2951      381

cols = ['origin','dest','arr_delay','dep_delay','flight']
filt = (df['dest']=='IAH')
grp = ['origin','dest']
df_summ = df.loc[filt,cols].groupby(grp) \
.agg({'arr_delay':[np.max], \
'dep_delay':[np.max,np.mean], \
'flight':pd.Series.nunique})

# flatten the multi-index column names

df_summ.columns = ['_'.join(tup).rstrip('_') for tup in df_summ.columns.values]

df_summ
##              arr_delay_amax  dep_delay_amax  dep_delay_mean  flight_nunique
## origin dest                                                                
## EWR    IAH            374.0           397.0       11.826551             453
## JFK    IAH            783.0           761.0       15.617647               3
## LGA    IAH            435.0           427.0        9.058986             381

4. JOIN

select distinct origin,dest,arr_delay,a.tailnum,b.seats,c.name
from flights a
left join planes b on a.tailnum=b.tailnum
left join airports c on a.origin=c.faa
where dest='IAH' and arr_delay >=300
order by arr_delay desc
limit 5;
Table 4: 5 records
origin dest arr_delay tailnum seats name
JFK IAH 783 N3DGAA NA John F Kennedy Intl
LGA IAH 435 N811UA 179 La Guardia
LGA IAH 390 N829UA 179 La Guardia
EWR IAH 374 N569UA 178 Newark Liberty Intl
EWR IAH 373 N465UA 200 Newark Liberty Intl
flights %>% 
  select(origin,dest,arr_delay,tailnum) %>% 
  filter(dest=='IAH') %>%
  left_join(planes,by = c('tailnum'='tailnum')) %>%
  left_join(airports,by = c('origin'='faa')) %>%
  select(origin,dest,arr_delay,tailnum,seats,name) %>%
  arrange(desc(arr_delay)) %>%
  head(5)
## # A tibble: 5 x 6
##   origin dest  arr_delay tailnum seats name               
##   <chr>  <chr>     <dbl> <chr>   <int> <chr>              
## 1 JFK    IAH         783 N3DGAA     NA John F Kennedy Intl
## 2 LGA    IAH         435 N811UA    179 La Guardia         
## 3 LGA    IAH         390 N829UA    179 La Guardia         
## 4 EWR    IAH         374 N569UA    178 Newark Liberty Intl
## 5 EWR    IAH         373 N465UA    200 Newark Liberty Intl

cols = ['origin','dest','arr_delay','tailnum']
filt = (df['dest']=='IAH')

df_base = df.loc[filt,cols].sort_values("arr_delay",ascending=False).head(5)

df_planes = r.planes
df_airports = r.airports

cols2=['origin','dest','arr_delay','tailnum','seats','name']

df_base.merge(df_planes,how='left', on='tailnum') \
       .merge(df_airports,how='left',left_on='origin',right_on='faa')[cols2]
##   origin dest  arr_delay tailnum  seats                 name
## 0    JFK  IAH      783.0  N3DGAA    NaN  John F Kennedy Intl
## 1    LGA  IAH      435.0  N811UA  179.0           La Guardia
## 2    LGA  IAH      390.0  N829UA  179.0           La Guardia
## 3    EWR  IAH      374.0  N569UA  178.0  Newark Liberty Intl
## 4    EWR  IAH      373.0  N465UA  200.0  Newark Liberty Intl

5. UNION


select tailnum,model,seats
from planes
where tailnum in ('N10156','N110UW') 

union

select tailnum,model,seats
from planes
where tailnum in ('N281AT','N381AA') 
Table 5: 4 records
tailnum model seats
N10156 EMB-145XR 55
N110UW A320-214 182
N281AT A340-313 375
N381AA DC-7BF 102
df1 <- planes %>% filter(tailnum %in% c('N10156','N110UW')) %>%
  select(tailnum,model,seats)

df2 <- planes %>% filter(tailnum %in% c('N281AT','N381AA')) %>%
  select(tailnum,model,seats)

df1 %>% union(df2)
## # A tibble: 4 x 3
##   tailnum model     seats
##   <chr>   <chr>     <int>
## 1 N10156  EMB-145XR    55
## 2 N110UW  A320-214    182
## 3 N281AT  A340-313    375
## 4 N381AA  DC-7BF      102
cols=['tailnum','model','seats']
filt = df_planes['tailnum'].isin(['N10156','N110UW'])
filt2 = df_planes['tailnum'].isin(['N281AT','N381AA'])

pd.concat([df_planes.loc[filt,cols],df_planes.loc[filt2,cols]])
##      tailnum      model  seats
## 0     N10156  EMB-145XR     55
## 9     N110UW   A320-214    182
## 603   N281AT   A340-313    375
## 1037  N381AA     DC-7BF    102

6. Analytics and aggregate functions

6.1 counts and percentage


select origin,dest,count(*) as cnt
,100 * count(*)/sum(count(*)) over() as prop
from flights 
where dest='IAH'
group by origin,dest;
Table 6: 3 records
origin dest cnt prop
EWR IAH 3973 55
JFK IAH 274 3
LGA IAH 2951 40
flights %>% 
  select(origin,dest) %>% 
  filter(dest=='IAH') %>%
  count(origin,dest) %>%
  mutate(prop=100*round(prop.table(n),2))
## # A tibble: 3 x 4
##   origin dest      n  prop
##   <chr>  <chr> <int> <dbl>
## 1 EWR    IAH    3973   55.
## 2 JFK    IAH     274    4 
## 3 LGA    IAH    2951   41

cols = ['origin','dest','flight']
filt = (df['dest']=='IAH')
grp = ['origin','dest']
df_p = df.loc[filt,cols].groupby(grp).agg({'flight':np.size}).reset_index().rename(columns={'flight':'cnt'})

df_p['prop'] = 100 * df_p['cnt']/df_p['cnt'].sum()

print(df_p)
##   origin dest   cnt       prop
## 0    EWR  IAH  3973  55.195888
## 1    JFK  IAH   274   3.806613
## 2    LGA  IAH  2951  40.997499

6.2 Top N rows per group

select * 
from
(
  select origin
  ,dest
  ,arr_delay
  ,ROW_NUMBER () OVER (PARTITION BY origin,dest ORDER BY arr_delay DESC) as rank_delay
  from flights
) a
 
where a.rank_delay = 1
limit 10;
Table 7: Displaying records 1 - 10
origin dest arr_delay rank_delay
EWR ALB 328 1
EWR ANC 39 1
EWR ATL 796 1
EWR AUS 349 1
EWR AVL 228 1
EWR BDL 266 1
EWR BNA 364 1
EWR BOS 422 1
EWR BQN 208 1
EWR BTV 306 1
flights %>% 
  group_by(origin,dest) %>%
  mutate(rank_delay = dplyr::dense_rank(desc(arr_delay))) %>%
  select(origin,dest,arr_delay,rank_delay) %>%
  arrange(origin,dest,rank_delay) %>%
  filter(rank_delay==1) %>%
  head(10)
## # A tibble: 10 x 4
## # Groups:   origin, dest [10]
##    origin dest  arr_delay rank_delay
##    <chr>  <chr>     <dbl>      <int>
##  1 EWR    ALB         328          1
##  2 EWR    ANC          39          1
##  3 EWR    ATL         796          1
##  4 EWR    AUS         349          1
##  5 EWR    AVL         228          1
##  6 EWR    BDL         266          1
##  7 EWR    BNA         364          1
##  8 EWR    BOS         422          1
##  9 EWR    BQN         208          1
## 10 EWR    BTV         306          1

cols = ['origin','dest','arr_delay','rank_delay']

df.assign(rank_delay = df.sort_values(['arr_delay'], ascending=False) \
.groupby(['origin','dest']) \
.cumcount()+1) \
.query('rank_delay == 1')[cols] \
.sort_values(['origin','dest']).head(10)
##        origin dest  arr_delay  rank_delay
## 20941     EWR  ALB      328.0           1
## 295953    EWR  ANC       39.0           1
## 57582     EWR  ATL      796.0           1
## 259427    EWR  AUS      349.0           1
## 291697    EWR  AVL      228.0           1
## 129590    EWR  BDL      266.0           1
## 21727     EWR  BNA      364.0           1
## 143327    EWR  BOS      422.0           1
## 233374    EWR  BQN      208.0           1
## 287238    EWR  BTV      306.0           1

cols = ['origin','dest','arr_delay','rank_delay']

df.assign(rank_delay=df.groupby(['origin','dest'])['arr_delay'] \
.rank(method='first',ascending=False)) \
.query('rank_delay == 1')[cols] \
.sort_values(['origin','dest']).head(10)
##        origin dest  arr_delay  rank_delay
## 20941     EWR  ALB      328.0         1.0
## 295953    EWR  ANC       39.0         1.0
## 57582     EWR  ATL      796.0         1.0
## 259427    EWR  AUS      349.0         1.0
## 291697    EWR  AVL      228.0         1.0
## 129590    EWR  BDL      266.0         1.0
## 21727     EWR  BNA      364.0         1.0
## 143327    EWR  BOS      422.0         1.0
## 233374    EWR  BQN      208.0         1.0
## 287238    EWR  BTV      306.0         1.0

6.3 Find duplicates

select origin,dest,count(*) as cnt
from flights
group by origin,dest
having count(*) >1
limit 5
;
Table 8: 5 records
origin dest cnt
EWR ALB 439
EWR ANC 8
EWR ATL 5022
EWR AUS 968
EWR AVL 265
flights %>% 
  count(origin,dest) %>%
  filter(n>1) %>%
  head()
## # A tibble: 6 x 3
##   origin dest      n
##   <chr>  <chr> <int>
## 1 EWR    ALB     439
## 2 EWR    ANC       8
## 3 EWR    ATL    5022
## 4 EWR    AUS     968
## 5 EWR    AVL     265
## 6 EWR    BDL     443

cols = ['origin','dest']

df.groupby(cols).size().head()
## origin  dest
## EWR     ALB      439
##         ANC        8
##         ATL     5022
##         AUS      968
##         AVL      265
## dtype: int64
Avatar
Ray Sun
Data Analytics Professional

My interests include AI/ML and data analytics.

Related