SQL R and Python Data manipulation

Ray Sun

May 28, 2020 10 min read R, python, pandas, dplyr, sql

0.1 Setting up environment

library(RSQLite)
library(DBI)
library(tidyverse)
library(nycflights13)
library(reticulate)

reticulate::use_condaenv("r-reticulate",required = TRUE)
np = reticulate::import('numpy')
pd = reticulate::import('pandas')
plt = reticulate::import('matplotlib.pyplot')

con <- dbConnect(RSQLite::SQLite(), ":memory:")
dbWriteTable(con, "flights", flights)
dbWriteTable(con, "airports", airports)
dbWriteTable(con, "planes", planes)

knitr::opts_chunk$set(connection = con)

glimpse(flights)

## Observations: 336,776
## Variables: 19
## $ year           <int> 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, …
## $ month          <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ day            <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ dep_time       <int> 517, 533, 542, 544, 554, 554, 555, 557, 557, 558, 558,…
## $ sched_dep_time <int> 515, 529, 540, 545, 600, 558, 600, 600, 600, 600, 600,…
## $ dep_delay      <dbl> 2, 4, 2, -1, -6, -4, -5, -3, -3, -2, -2, -2, -2, -2, -…
## $ arr_time       <int> 830, 850, 923, 1004, 812, 740, 913, 709, 838, 753, 849…
## $ sched_arr_time <int> 819, 830, 850, 1022, 837, 728, 854, 723, 846, 745, 851…
## $ arr_delay      <dbl> 11, 20, 33, -18, -25, 12, 19, -14, -8, 8, -2, -3, 7, -…
## $ carrier        <chr> "UA", "UA", "AA", "B6", "DL", "UA", "B6", "EV", "B6", …
## $ flight         <int> 1545, 1714, 1141, 725, 461, 1696, 507, 5708, 79, 301, …
## $ tailnum        <chr> "N14228", "N24211", "N619AA", "N804JB", "N668DN", "N39…
## $ origin         <chr> "EWR", "LGA", "JFK", "JFK", "LGA", "EWR", "EWR", "LGA"…
## $ dest           <chr> "IAH", "IAH", "MIA", "BQN", "ATL", "ORD", "FLL", "IAD"…
## $ air_time       <dbl> 227, 227, 160, 183, 116, 150, 158, 53, 140, 138, 149, …
## $ distance       <dbl> 1400, 1416, 1089, 1576, 762, 719, 1065, 229, 944, 733,…
## $ hour           <dbl> 5, 5, 5, 5, 6, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 6, 6, …
## $ minute         <dbl> 15, 29, 40, 45, 0, 58, 0, 0, 0, 0, 0, 0, 0, 0, 0, 59, …
## $ time_hour      <dttm> 2013-01-01 05:00:00, 2013-01-01 05:00:00, 2013-01-01 …

import numpy as np
import pandas as pd

df = r.flights.copy()

0.2 Quick reference

	SQL	R	Python
Select columns	select v1, v2 from df;	select(df,v1,v2)	df[[‘v1’,‘v2’]]
Filter	select v1,v2 from df where v1 = 1 and v2 = 2;	filter(df,(v1 == 1 & v2 == 2))	df.query(‘v1 == 1 & v2 == 2’)
Distinct rows	select distinct v1 from df;	unique(df$v1)	df[[‘v1’]].drop_duplicates()
Sampling	select * from df sample 10;	sample_n(df,10)	df.sample(n=10)
Sort	select * from df order by v1, v2 desc;	arrange(df,v1, desc(v2))	df.sort_values([‘v1’,‘v2’],ascending = [True,False])
Columns transaformation	select v1 as col1, a+b as col2 from df	df %>% rename(col1=v1) %>% mutate(col2=a+b)	df.rename(colmns={‘v1’:col1}).assign(col2=df[‘a’]+df[‘b’])
Quick summary	NA	summary(df)	df.describe()
Group by	select grp, avg(v1) as avg from df group by grp;	df %>% group_by(grp) %>% summarise(avg=mean(v1,na.rm=TRUE))	df.groupby(‘grp’).agg({‘avg’:np.mean})
	select grp1, grp2, count(*) from df group by grp;	df %>% count(grp1,grp2)	df.groupby(‘grp1’)[‘grp2’].value_counts()
Pivot		df %>% count(grp1,grp2) %>% spread(grp1,value=n,fill=0)	df.groupby(‘grp1’)[‘grp2’].value_counts().unstack().fillna(0)

1. SELECT

select distinct year,month,origin,dest,arr_delay
from flights 
order by arr_delay desc,dest asc
limit 5;

Table 1: 5 records
year	month	origin	dest	arr_delay
2013	1	JFK	HNL	1272
2013	6	JFK	CMH	1127
2013	1	EWR	ORD	1109
2013	9	JFK	SFO	1007
2013	7	JFK	CVG	989

flights %>% 
  select(year,month,origin,dest,arr_delay) %>% 
  arrange(desc(arr_delay),dest) %>%
  unique() %>% 
  head(5)

## # A tibble: 5 x 5
##    year month origin dest  arr_delay
##   <int> <int> <chr>  <chr>     <dbl>
## 1  2013     1 JFK    HNL        1272
## 2  2013     6 JFK    CMH        1127
## 3  2013     1 EWR    ORD        1109
## 4  2013     9 JFK    SFO        1007
## 5  2013     7 JFK    CVG         989


cols = ['year','month','origin','dest','arr_delay']
df[cols].sort_values(['arr_delay','dest'],ascending=[False,True]).drop_duplicates().head(5)

##         year  month origin dest  arr_delay
## 7072    2013      1    JFK  HNL     1272.0
## 235778  2013      6    JFK  CMH     1127.0
## 8239    2013      1    EWR  ORD     1109.0
## 327043  2013      9    JFK  SFO     1007.0
## 270376  2013      7    JFK  CVG      989.0

2. WHERE

select distinct year,month,origin,dest,arr_delay
from flights 
where dest='IAH' and arr_delay >=300
order by arr_delay desc
limit 5;

Table 2: 5 records
year	month	origin	dest	arr_delay
2013	4	JFK	IAH	783
2013	4	LGA	IAH	435
2013	6	LGA	IAH	390
2013	7	EWR	IAH	374
2013	9	EWR	IAH	373

flights %>% 
  select(year,month,origin,dest,arr_delay) %>% 
  filter(dest=='IAH' & arr_delay >=300) %>%
  arrange(desc(arr_delay)) %>%
  unique() %>% 
  head(5)

## # A tibble: 5 x 5
##    year month origin dest  arr_delay
##   <int> <int> <chr>  <chr>     <dbl>
## 1  2013     4 JFK    IAH         783
## 2  2013     4 LGA    IAH         435
## 3  2013     6 LGA    IAH         390
## 4  2013     7 EWR    IAH         374
## 5  2013     9 EWR    IAH         373


filt = (df['dest']=='IAH') & (df['arr_delay']>=300)
df.loc[filt,cols].sort_values("arr_delay",ascending=False).head(5)

##         year  month origin dest  arr_delay
## 182284  2013      4    JFK  IAH      783.0
## 182153  2013      4    LGA  IAH      435.0
## 247626  2013      6    LGA  IAH      390.0
## 274063  2013      7    EWR  IAH      374.0
## 319938  2013      9    EWR  IAH      373.0

3. GROUPBY


select origin,dest,max(arr_delay),avg(arr_delay),count(*),count(distinct flight) as n_flights
from flights 
where dest='IAH'
group by origin,dest;

Table 3: 3 records
origin	dest	max(arr_delay)	avg(arr_delay)	count(*)	n_flights
EWR	IAH	374	5.406577	3973	453
JFK	IAH	783	17.132841	274	3
LGA	IAH	435	1.450363	2951	381

flights %>% 
  select(origin,dest,arr_delay,flight) %>% 
  filter(dest=='IAH') %>%
  group_by(origin,dest) %>%
  summarise(max_delay=max(arr_delay,na.rm = TRUE)
            ,mean=mean(arr_delay,na.rm = TRUE)
            ,count=n()
            ,n_flight=n_distinct(flight))

## # A tibble: 3 x 6
## # Groups:   origin [3]
##   origin dest  max_delay  mean count n_flight
##   <chr>  <chr>     <dbl> <dbl> <int>    <int>
## 1 EWR    IAH         374  5.41  3973      453
## 2 JFK    IAH         783 17.1    274        3
## 3 LGA    IAH         435  1.45  2951      381


cols = ['origin','dest','arr_delay','dep_delay','flight']
filt = (df['dest']=='IAH')
grp = ['origin','dest']
df_summ = df.loc[filt,cols].groupby(grp) \
.agg({'arr_delay':[np.max], \
'dep_delay':[np.max,np.mean], \
'flight':pd.Series.nunique})

# flatten the multi-index column names

df_summ.columns = ['_'.join(tup).rstrip('_') for tup in df_summ.columns.values]

df_summ

##              arr_delay_amax  dep_delay_amax  dep_delay_mean  flight_nunique
## origin dest                                                                
## EWR    IAH            374.0           397.0       11.826551             453
## JFK    IAH            783.0           761.0       15.617647               3
## LGA    IAH            435.0           427.0        9.058986             381

4. JOIN

select distinct origin,dest,arr_delay,a.tailnum,b.seats,c.name
from flights a
left join planes b on a.tailnum=b.tailnum
left join airports c on a.origin=c.faa
where dest='IAH' and arr_delay >=300
order by arr_delay desc
limit 5;

Table 4: 5 records
origin	dest	arr_delay	tailnum	seats	name
JFK	IAH	783	N3DGAA	NA	John F Kennedy Intl
LGA	IAH	435	N811UA	179	La Guardia
LGA	IAH	390	N829UA	179	La Guardia
EWR	IAH	374	N569UA	178	Newark Liberty Intl
EWR	IAH	373	N465UA	200	Newark Liberty Intl

flights %>% 
  select(origin,dest,arr_delay,tailnum) %>% 
  filter(dest=='IAH') %>%
  left_join(planes,by = c('tailnum'='tailnum')) %>%
  left_join(airports,by = c('origin'='faa')) %>%
  select(origin,dest,arr_delay,tailnum,seats,name) %>%
  arrange(desc(arr_delay)) %>%
  head(5)

## # A tibble: 5 x 6
##   origin dest  arr_delay tailnum seats name               
##   <chr>  <chr>     <dbl> <chr>   <int> <chr>              
## 1 JFK    IAH         783 N3DGAA     NA John F Kennedy Intl
## 2 LGA    IAH         435 N811UA    179 La Guardia         
## 3 LGA    IAH         390 N829UA    179 La Guardia         
## 4 EWR    IAH         374 N569UA    178 Newark Liberty Intl
## 5 EWR    IAH         373 N465UA    200 Newark Liberty Intl


cols = ['origin','dest','arr_delay','tailnum']
filt = (df['dest']=='IAH')

df_base = df.loc[filt,cols].sort_values("arr_delay",ascending=False).head(5)

df_planes = r.planes
df_airports = r.airports

cols2=['origin','dest','arr_delay','tailnum','seats','name']

df_base.merge(df_planes,how='left', on='tailnum') \
       .merge(df_airports,how='left',left_on='origin',right_on='faa')[cols2]

##   origin dest  arr_delay tailnum  seats                 name
## 0    JFK  IAH      783.0  N3DGAA    NaN  John F Kennedy Intl
## 1    LGA  IAH      435.0  N811UA  179.0           La Guardia
## 2    LGA  IAH      390.0  N829UA  179.0           La Guardia
## 3    EWR  IAH      374.0  N569UA  178.0  Newark Liberty Intl
## 4    EWR  IAH      373.0  N465UA  200.0  Newark Liberty Intl

5. UNION


select tailnum,model,seats
from planes
where tailnum in ('N10156','N110UW') 

union

select tailnum,model,seats
from planes
where tailnum in ('N281AT','N381AA')

Table 5: 4 records
tailnum	model	seats
N10156	EMB-145XR	55
N110UW	A320-214	182
N281AT	A340-313	375
N381AA	DC-7BF	102

df1 <- planes %>% filter(tailnum %in% c('N10156','N110UW')) %>%
  select(tailnum,model,seats)

df2 <- planes %>% filter(tailnum %in% c('N281AT','N381AA')) %>%
  select(tailnum,model,seats)

df1 %>% union(df2)

## # A tibble: 4 x 3
##   tailnum model     seats
##   <chr>   <chr>     <int>
## 1 N10156  EMB-145XR    55
## 2 N110UW  A320-214    182
## 3 N281AT  A340-313    375
## 4 N381AA  DC-7BF      102

cols=['tailnum','model','seats']
filt = df_planes['tailnum'].isin(['N10156','N110UW'])
filt2 = df_planes['tailnum'].isin(['N281AT','N381AA'])

pd.concat([df_planes.loc[filt,cols],df_planes.loc[filt2,cols]])

##      tailnum      model  seats
## 0     N10156  EMB-145XR     55
## 9     N110UW   A320-214    182
## 603   N281AT   A340-313    375
## 1037  N381AA     DC-7BF    102

6. Analytics and aggregate functions

6.1 counts and percentage


select origin,dest,count(*) as cnt
,100 * count(*)/sum(count(*)) over() as prop
from flights 
where dest='IAH'
group by origin,dest;

Table 6: 3 records
origin	dest	cnt	prop
EWR	IAH	3973	55
JFK	IAH	274	3
LGA	IAH	2951	40

flights %>% 
  select(origin,dest) %>% 
  filter(dest=='IAH') %>%
  count(origin,dest) %>%
  mutate(prop=100*round(prop.table(n),2))

## # A tibble: 3 x 4
##   origin dest      n  prop
##   <chr>  <chr> <int> <dbl>
## 1 EWR    IAH    3973   55.
## 2 JFK    IAH     274    4 
## 3 LGA    IAH    2951   41


cols = ['origin','dest','flight']
filt = (df['dest']=='IAH')
grp = ['origin','dest']
df_p = df.loc[filt,cols].groupby(grp).agg({'flight':np.size}).reset_index().rename(columns={'flight':'cnt'})

df_p['prop'] = 100 * df_p['cnt']/df_p['cnt'].sum()

print(df_p)

##   origin dest   cnt       prop
## 0    EWR  IAH  3973  55.195888
## 1    JFK  IAH   274   3.806613
## 2    LGA  IAH  2951  40.997499

6.2 Top N rows per group

select * 
from
(
  select origin
  ,dest
  ,arr_delay
  ,ROW_NUMBER () OVER (PARTITION BY origin,dest ORDER BY arr_delay DESC) as rank_delay
  from flights
) a
 
where a.rank_delay = 1
limit 10;

Table 7: Displaying records 1 - 10
origin	dest	arr_delay	rank_delay
EWR	ALB	328	1
EWR	ANC	39	1
EWR	ATL	796	1
EWR	AUS	349	1
EWR	AVL	228	1
EWR	BDL	266	1
EWR	BNA	364	1
EWR	BOS	422	1
EWR	BQN	208	1
EWR	BTV	306	1

flights %>% 
  group_by(origin,dest) %>%
  mutate(rank_delay = dplyr::dense_rank(desc(arr_delay))) %>%
  select(origin,dest,arr_delay,rank_delay) %>%
  arrange(origin,dest,rank_delay) %>%
  filter(rank_delay==1) %>%
  head(10)

## # A tibble: 10 x 4
## # Groups:   origin, dest [10]
##    origin dest  arr_delay rank_delay
##    <chr>  <chr>     <dbl>      <int>
##  1 EWR    ALB         328          1
##  2 EWR    ANC          39          1
##  3 EWR    ATL         796          1
##  4 EWR    AUS         349          1
##  5 EWR    AVL         228          1
##  6 EWR    BDL         266          1
##  7 EWR    BNA         364          1
##  8 EWR    BOS         422          1
##  9 EWR    BQN         208          1
## 10 EWR    BTV         306          1


cols = ['origin','dest','arr_delay','rank_delay']

df.assign(rank_delay = df.sort_values(['arr_delay'], ascending=False) \
.groupby(['origin','dest']) \
.cumcount()+1) \
.query('rank_delay == 1')[cols] \
.sort_values(['origin','dest']).head(10)

##        origin dest  arr_delay  rank_delay
## 20941     EWR  ALB      328.0           1
## 295953    EWR  ANC       39.0           1
## 57582     EWR  ATL      796.0           1
## 259427    EWR  AUS      349.0           1
## 291697    EWR  AVL      228.0           1
## 129590    EWR  BDL      266.0           1
## 21727     EWR  BNA      364.0           1
## 143327    EWR  BOS      422.0           1
## 233374    EWR  BQN      208.0           1
## 287238    EWR  BTV      306.0           1


cols = ['origin','dest','arr_delay','rank_delay']

df.assign(rank_delay=df.groupby(['origin','dest'])['arr_delay'] \
.rank(method='first',ascending=False)) \
.query('rank_delay == 1')[cols] \
.sort_values(['origin','dest']).head(10)

##        origin dest  arr_delay  rank_delay
## 20941     EWR  ALB      328.0         1.0
## 295953    EWR  ANC       39.0         1.0
## 57582     EWR  ATL      796.0         1.0
## 259427    EWR  AUS      349.0         1.0
## 291697    EWR  AVL      228.0         1.0
## 129590    EWR  BDL      266.0         1.0
## 21727     EWR  BNA      364.0         1.0
## 143327    EWR  BOS      422.0         1.0
## 233374    EWR  BQN      208.0         1.0
## 287238    EWR  BTV      306.0         1.0

6.3 Find duplicates

select origin,dest,count(*) as cnt
from flights
group by origin,dest
having count(*) >1
limit 5
;

Table 8: 5 records
origin	dest	cnt
EWR	ALB	439
EWR	ANC	8
EWR	ATL	5022
EWR	AUS	968
EWR	AVL	265

flights %>% 
  count(origin,dest) %>%
  filter(n>1) %>%
  head()

## # A tibble: 6 x 3
##   origin dest      n
##   <chr>  <chr> <int>
## 1 EWR    ALB     439
## 2 EWR    ANC       8
## 3 EWR    ATL    5022
## 4 EWR    AUS     968
## 5 EWR    AVL     265
## 6 EWR    BDL     443


cols = ['origin','dest']

df.groupby(cols).size().head()

## origin  dest
## EWR     ALB      439
##         ANC        8
##         ATL     5022
##         AUS      968
##         AVL      265
## dtype: int64

R Markdown

Ray Sun

Data Analytics Professional

My interests include AI/ML and data analytics.

origin	dest	arr_delay	rank_delay
EWR	ALB	328	1
EWR	ANC	39	1
EWR	ATL	796	1
EWR	AUS	349	1
EWR	AVL	228	1
EWR	BDL	266	1
EWR	BNA	364	1
EWR	BOS	422	1
EWR	BQN	208	1
EWR	BTV	306	1

origin	dest	arr_delay	rank_delay
EWR	ALB	328	1
EWR	ANC	39	1
EWR	ATL	796	1
EWR	AUS	349	1
EWR	AVL	228	1
EWR	BDL	266	1
EWR	BNA	364	1
EWR	BOS	422	1
EWR	BQN	208	1
EWR	BTV	306	1

SQL R and Python Data manipulation

0.1 Setting up environment

0.2 Quick reference

1. SELECT

2. WHERE

3. GROUPBY

4. JOIN

5. UNION

6. Analytics and aggregate functions

6.1 counts and percentage

6.2 Top N rows per group

6.3 Find duplicates

Ray Sun

Data Analytics Professional

Related

origin	dest	arr_delay	rank_delay
EWR	ALB	328	1
EWR	ANC	39	1
EWR	ATL	796	1
EWR	AUS	349	1
EWR	AVL	228	1
EWR	BDL	266	1
EWR	BNA	364	1
EWR	BOS	422	1
EWR	BQN	208	1
EWR	BTV	306	1