Split-apply-combine in R

Typical example of split-apply-combine with dplyr

  • group_by & summarise, then pivot
# Mean horsepower for every (cyl, gear) combination, then reshaped wide so
# that each gear value becomes its own column.
mtcars %>%
  group_by(cyl, gear) %>%
  summarise(hp_avg = mean(hp)) %>%
  spread(key = gear, value = hp_avg)
## # A tibble: 3 x 4
## # Groups:   cyl [3]
##     cyl   `3`   `4`   `5`
##   <dbl> <dbl> <dbl> <dbl>
## 1     4   97    76   102 
## 2     6  108.  116.  175 
## 3     8  194.   NA   300.
# Same summary reshaped with pivot_wider(); names_prefix turns the new column
# names into gear3/gear4/gear5 instead of bare numbers.
mtcars %>%
  group_by(cyl, gear) %>%
  summarise(hp_avg = mean(hp)) %>%
  tidyr::pivot_wider(
    names_from = gear,
    values_from = hp_avg,
    names_prefix = "gear"
  )
## # A tibble: 3 x 4
## # Groups:   cyl [3]
##     cyl gear3 gear4 gear5
##   <dbl> <dbl> <dbl> <dbl>
## 1     4   97    76   102 
## 2     6  108.  116.  175 
## 3     8  194.   NA   300.

Split-apply-combine using purrr

# Fit mpg ~ wt + hp separately for each cylinder count and collect every
# model's R-squared into a named numeric vector (names are the cyl values).
mtcars %>%
  split(.$cyl) %>% # split() is base R: one data frame per cyl value
  map(~ lm(mpg ~ wt + hp, data = .x)) %>%
  map_dbl(~ summary(.x)$r.squared)
##         4         6         8 
## 0.6807065 0.5889239 0.4970692
# One regression of mpg on hp and am per cylinder count; the coefficients are
# stacked long (one row per term) and then spread into one column per term.
mtcars %>%
  split(.$cyl) %>%
  map(~ lm(mpg ~ hp + am, data = .x)) %>%
  map(function(fit) {
    est <- coef(fit)
    data.frame(
      level = names(est),
      estimate = est,
      stringsAsFactors = FALSE
    )
  }) %>%
  bind_rows(.id = "cyl") %>% # list names (the cyl values) become column "cyl"
  spread(key = level, value = estimate)
##   cyl (Intercept)       am          hp
## 1   4    31.87423 4.879098 -0.10599487
## 2   6    21.65792 1.802465 -0.02197757
## 3   8    22.21049 4.234489 -0.03687806
# Per-cylinder coefficient table built from summary(lm): keep only the columns
# of interest, fit one model per cyl value, and reshape the estimates wide.
mtcars %>%
  select(cyl, mpg, am, hp) %>%
  split(.$cyl) %>% # from base R
  # `.` expands to every remaining column, so cyl is removed explicitly: it is
  # constant inside each split and would only add an inestimable (NA) term.
  map(~ lm(mpg ~ . - cyl, data = .x)) %>%
  map(summary) %>%
  map(function(fit_summary) {
    # Column 1 of the coefficient matrix holds the estimates; drop = FALSE
    # keeps it a one-column matrix so the "Estimate" column name survives.
    out <- as.data.frame(fit_summary$coefficients[, 1, drop = FALSE])
    out$level <- row.names(out)
    out[, 2:1] # reorder to level, Estimate
  }) %>%
  bind_rows(.id = "cyl") %>%
  spread(level, Estimate)
##   cyl (Intercept)       am          hp
## 1   4    31.87423 4.879098 -0.10599487
## 2   6    21.65792 1.802465 -0.02197757
## 3   8    22.21049 4.234489 -0.03687806
# Per-cylinder coefficient table using tidy() to turn each fit into a
# data frame of terms and estimates.
mtcars %>%
  select(cyl, mpg, am, hp) %>%
  split(.$cyl) %>%
  # Exclude cyl from the `.` expansion: it is constant within each split, so
  # its coefficient is not estimable, and a "cyl" term reported by tidy()
  # would clash with the "cyl" id column once the terms are spread wide.
  map(~ tidy(lm(mpg ~ . - cyl, data = .x))) %>%
  bind_rows(.id = "cyl") %>%
  select(cyl, term, estimate) %>%
  spread(term, estimate)
## # A tibble: 3 x 4
##   cyl   `(Intercept)`    am      hp
##   <chr>         <dbl> <dbl>   <dbl>
## 1 4              31.9  4.88 -0.106 
## 2 6              21.7  1.80 -0.0220
## 3 8              22.2  4.23 -0.0369

Map functions

# Mean of every mtcars column: map() returns a named list of scalars,
# bind_rows() turns it into a one-row tibble, and gather() stacks that row
# into key/value pairs.
mtcars %>%
  map(~ mean(.x)) %>%
  bind_rows() %>%
  gather(key = "key", value = "value")
## # A tibble: 11 x 2
##    key     value
##    <chr>   <dbl>
##  1 mpg    20.1  
##  2 cyl     6.19 
##  3 disp  231.   
##  4 hp    147.   
##  5 drat    3.60 
##  6 wt      3.22 
##  7 qsec   17.8  
##  8 vs      0.438
##  9 am      0.406
## 10 gear    3.69 
## 11 carb    2.81
Avatar
Ray Sun
Data Analytics Professional

My interests include AI/ML and data analytics.

Related