Data visualisation with R ggplot2

1. Overview

  • The template used to create a ggplot2 chart.
ggplot(data = <DATA>) + 
  <GEOM_FUNCTION>(
     mapping = aes(<MAPPINGS>),
     stat = <STAT>, 
     position = <POSITION>
  ) +
  <COORDINATE_FUNCTION> +
  <FACET_FUNCTION>

2. Comparison

2.1 Density (Numeric Distribution)

A density plot shows the distribution of a numeric variable.

# Make the histogram
 p1 <- diamonds %>%
  #filter( price<300 ) %>%
  ggplot() +
    geom_density(aes(x=price),
                 fill="#69b3a2", 
                 color="#e9ecef", 
                 alpha=0.8) +
    scale_x_continuous(limits = quantile(diamonds$price,c(0.01,0.99))) +
    geom_vline(aes(xintercept = mean(price)), 
             linetype = "dashed", size = 0.6,
             color = "#FC4E07") +
    ggtitle("Diamonds price distribution")



 # Dummy data
data <- data.frame(
  var1 = rnorm(1000),
  var2 = rnorm(1000, mean=2)
)

# Chart
p2 <- ggplot(data, aes(x=x) ) +
  # Top
  geom_density( aes(x = var1, y = ..density..), fill="#69b3a2" ) +
  geom_label( aes(x=4.5, y=0.25, label="variable1"), color="#69b3a2") +
  # Bottom
  geom_density( aes(x = var2, y = -..density..), fill= "#404080") +
  geom_label( aes(x=4.5, y=-0.25, label="variable2"), color="#404080") +
  xlab("value of x")

# Chart
p3 <- ggplot(data, aes(x=x) ) +
  geom_histogram( aes(x = var1, y = ..density..), fill="#69b3a2" ) +
  geom_label( aes(x=4.5, y=0.25, label="variable1"), color="#69b3a2") +
  geom_histogram( aes(x = var2, y = -..density..), fill= "#404080") +
  geom_label( aes(x=4.5, y=-0.25, label="variable2"), color="#404080") +
  xlab("value of x")

grid.arrange(p1,p2,p3,nrow=3)

Compare desity by groups

p1 <- ggplot(data=diamonds, aes(x=price, group=cut, fill=cut)) +
    geom_density(adjust=1.5, alpha=.4) 

p2 <- ggplot(data=diamonds, aes(x=price, group=cut, fill=cut)) +
    geom_density(adjust=1.5) +
    facet_wrap(~cut) +
    theme(
      legend.position="none",
      panel.spacing = unit(0.1, "lines"),
      axis.ticks.x=element_blank()
    )

# basic example
library(ggridges)
p3 <- ggplot(diamonds, aes(x = price, y = cut, fill = cut)) +
  geom_density_ridges() +
  theme_ridges() + 
  theme(legend.position = "none")

grid.arrange(p1, # First row with one plot spaning over 2 columns
             arrangeGrob(p2, p3, ncol = 2), # Second row with 2 plots in 2 different columns
             nrow = 2)     

2.2 Boxplot (Numeric Distribution)

ggplot(mtcars, aes(x=as.factor(cyl), y=mpg)) + 
    geom_boxplot(fill="slateblue", alpha=0.2,outlier.shape = NA) + 
    scale_y_continuous(limits = quantile(mtcars$mpg,c(0.01,0.99))) +
    xlab("cyl")

# create a data frame
variety=rep(LETTERS[1:7], each=40)
treatment=rep(c("high","low"),each=20)
note=seq(1:280)+sample(1:150, 280, replace=T)
data=data.frame(variety, treatment ,  note)
 
# grouped boxplot
p1 <- ggplot(data, aes(x=variety, y=note, fill=treatment)) + 
    geom_boxplot()
ggplotly(p1) %>%layout(boxmode = "group")
p2 <- ggplot(data, aes(x=variety, y=note, fill=treatment)) + 
    geom_violin()


gridExtra::grid.arrange(p1,p2,nrow=2,top="2 boxplot charts")

2.3 Histogram

# plot
bin <- 20
p <- ggplot(data=diamonds) +
     geom_histogram( aes(x=price),
                    #binwidth=1000, #function(x) 2 * IQR(x) / (length(x)^(1/3)), 
                    bins = bin,
                    fill="#69b3a2"
                    , color="#e9ecef"
                    , alpha=0.9) +
    ggtitle(paste0("Bin size = ",bin) ) 

 plotly::ggplotly(p)

2.4 Barchart

  • Barchart with Error Bars
# create dummy data
data <- data.frame(
  name=letters[1:5],
  value=sample(seq(4,15),5),
  sd=c(1,0.2,3,2,4)
)
 
# Most basic error bar
ggplot(data) +
    geom_bar( aes(x=name, y=value), stat="identity", fill="skyblue", alpha=0.7) +
    geom_errorbar( aes(x=name, ymin=value-sd, ymax=value+sd), 
                   width=0.4, colour="orange", alpha=0.9, size=1.3)

  • 2 groups
#Let's build a dataset : height of 10 sorgho and poacee sample in 3 environmental conditions (A, B, C)
data <- data.frame(
  specie=c(rep("sorgho" , 10) , rep("poacee" , 10) ),
  cond_A=rnorm(20,10,4),
  cond_B=rnorm(20,8,3),
  cond_C=rnorm(20,5,4)
)

#Let's calculate the average value for each condition and each specie with the *aggregate* function
bilan <- aggregate(cbind(cond_A,cond_B,cond_C)~specie , data=data , mean)
rownames(bilan) <- bilan[,1]
bilan <- as.matrix(bilan[,-1])
 
#Plot boundaries
lim <- 1.2*max(bilan)

#A function to add arrows on the chart
error.bar <- function(x, y, upper, lower=upper, length=0.1,...){
  arrows(x,y+upper, x, y-lower, angle=90, code=3, length=length, ...)
}
 
#Then I calculate the standard deviation for each specie and condition :
stdev <- aggregate(cbind(cond_A,cond_B,cond_C)~specie , data=data , sd)
rownames(stdev) <- stdev[,1]
stdev <- as.matrix(stdev[,-1]) * 1.96 / 10
 
#I am ready to add the error bar on the plot using my "error bar" function !
ze_barplot <- barplot(bilan , beside=T , legend.text=T,col=c("blue" , "skyblue") 
                      , ylim=c(0,lim) , ylab="height")
error.bar(ze_barplot,bilan, stdev)

diamonds %>% filter(cut %in% c('Fair','Ideal')) %>%
  mutate(price_grp=cut(price,breaks = c(-Inf,1000,2000,3000,4000,5000,Inf))) %>%
  ggplot(aes(x=price_grp,fill=cut)) +
    geom_bar(color="#e9ecef", alpha=0.6, position = 'identity') +
    scale_fill_manual(values=c("#69b3a2", "#404080")) +
    ggtitle("Diamond price distribution")

2.5 Stack Columns

  • Group barchart and Percent stacked barchar
# create a dataset
specie <- c(rep("sorgho" , 3) , rep("poacee" , 3) , rep("banana" , 3) , rep("triticum" , 3) )
condition <- rep(c("normal" , "stress" , "Nitrogen") , 4)
value <- abs(rnorm(12 , 0 , 15))
data <- data.frame(specie,condition,value)
 
# Grouped
p1 <- ggplot(data, aes(fill=condition, y=value, x=specie)) + 
    geom_bar(position="dodge", stat="identity") +
    ggtitle("Studying 4 species..")

 
# Stacked + percent
p2 <- ggplot(data, aes(fill=condition, y=value, x=specie)) + 
    geom_bar(position="fill", stat="identity") +
    ggtitle("Studying 4 species..")
 
# Graph
p3 <- ggplot(data, aes(fill=condition, y=value, x=condition)) + 
    geom_bar(position="dodge", stat="identity") +
    ggtitle("Studying 4 species..") +
    facet_wrap(~specie) +
    theme(legend.position="none") +
    xlab("")


grid.arrange(p1, # First row with one plot spaning over 2 columns
             arrangeGrob(p2, p3, ncol = 2), # Second row with 2 plots in 2 different columns
             nrow = 2)     

2.6 Rank

Barchart rank

p1 <- diamonds %>% dplyr::group_by(cut)  %>% tally() %>%
    ggplot( aes(x=cut, y=n)) +
    geom_bar(stat="identity", fill="#f68060", alpha=.6, width=.4) +
    #coord_flip() +
    theme_bw() +
    xlab("") 

p2 <- diamonds %>% dplyr::group_by(cut)  %>% tally() %>% mutate(cut2=fct_reorder(cut,desc(n))) %>%
    ggplot( aes(x=cut2, y=n)) +
    geom_bar(stat="identity", fill="#f68060", alpha=.6, width=.4) +
    coord_flip() +
    theme_bw() +
    xlab("") 

p3 <- diamonds %>% dplyr::group_by(cut)  %>% tally() %>%
  ggplot( aes(x=cut, y=n)) +
    geom_segment( aes(xend=cut, yend=0)) +
    geom_point( size=4, color="orange") +
    coord_flip() +
    theme_bw() +
    xlab("")
gridExtra::grid.arrange(p1,p2,p3,nrow=3)

grid.arrange(p1, # First row with one plot spaning over 2 columns
             arrangeGrob(p2, p3, ncol = 2), # Second row with 2 plots in 2 different columns
             nrow = 2)  

2.7 Heatmap

# Dummy data
x <- LETTERS[1:20]
y <- paste0("var", seq(1,20))
data <- expand.grid(X=x, Y=y)
data$Z <- runif(400, 0, 5)
 
# Heatmap 
ggplot(data, aes(X, Y, fill= Z)) + 
  geom_tile()

2.8 Bullet

fig <- plot_ly() 
fig <- fig %>%
  add_trace(
    type = "indicator",
    mode = "number+gauge+delta",
    value = 180,
    delta = list(reference = 200),
    domain = list(x = c(0.25, 1), y = c(0.08, 0.25)),
    title =list(text = "Revenue"),
    gauge = list(
      shape = "bullet",
      axis = list(range = c(NULL, 300)),
      threshold = list(
        line= list(color = "black", width = 2),
        thickness = 0.75,
        value = 170),
      steps = list(
        list(range = c(0, 150), color = "gray"),
        list(range = c(150, 250), color = "lightgray")),
      bar = list(color = "black"))) 
fig <- fig %>%
  add_trace(
    type = "indicator",
    mode = "number+gauge+delta",
    value = 35,
    delta = list(reference = 200),
    domain = list(x = c(0.25, 1), y = c(0.4, 0.6)),
    title = list(text = "Profit"),
    gauge = list(
      shape = "bullet",
      axis = list(range = list(NULL, 100)),
      threshold = list(
        line = list(color = "black", width= 2),
        thickness = 0.75,
        value = 50),
      steps = list(
        list(range = c(0, 25), color = "gray"),
        list(range = c(25, 75), color = "lightgray")),
      bar = list(color = "black"))) 
fig <- fig %>%
  add_trace(
    type =  "indicator",
    mode = "number+gauge+delta",
    value = 220,
    delta = list(reference = 300 ),
    domain = list(x = c(0.25, 1), y = c(0.7, 0.9)),
    title = list(text = "Satisfaction"),
    gauge = list(
      shape = "bullet",
      axis = list(range = list(NULL, 300)),
      threshold = list(
        line = list(color = "black", width = 2),
        thickness = 0.75,
        value = 210),
      steps = list(
        list(range = c(0, 100), color = "gray"),
        list(range = c(100, 250), color = "lightgray")),
      bar = list(color = "black")))

fig

2.9 WaterFall (Part of Whole)

df_wf <- diamonds %>% count(cut) %>%
  mutate(prop=round(prop.table(n),digits = 2)*100) %>%
  rbind(cbind(cut='Total',as.data.frame.list(colSums(.[,-1]))))

df_wf$cut <- as.factor(df_wf$cut)

df_wf$cut <- fct_relevel(df_wf$cut,c('Total'
                        ,'Fair'
                        ,'Good'
                        ,'Ideal'
                        ,'Premium'
                        ,'Very Good'
                        ))

df_wf_plt <- df_wf %>% 
  arrange(prop) %>%
  mutate(csum=cumsum(prop),
         cut = fct_reorder(cut,prop,.desc=TRUE),
         id = as.integer(cut),
         labl = paste0(scales::comma(n),' (',prop,'%)'),
         desc = case_when(id == 2 ~ "Volume Rank 2",
                          id == 3  ~ "Volume Rank 3",
                          id == 4  ~ "Volume Rank 4",
                          id == 5  ~ "Volume Rank 5",
                          TRUE ~ "cut")) %>%
  arrange(id) %>%
  mutate(end = csum - prop,
         strt = lag(end,default = 0))
  


 df_wf_plt %>%
  ggplot(aes(x=cut
             , xmin = id - 0.45
             , xmax = id + 0.45
             , ymin = end
             , ymax = strt
             )) +
  geom_rect(colour = "black"
            ,fill = "#FFFF66"
            ,alpha = 0.6
            , show.legend = FALSE) +
   geom_text(aes(id,end, 
                   label = labl), 
               vjust = 1.5, 
               size = 3) +
   geom_text(aes(id,strt, 
                   label = desc), 
               vjust = -0.5, 
               size = 3) +
   labs(title = "Waterfall Chart",
        subtitle = "By Diamonds Cut",
        caption = "(Based on data from ...)") +
   xlab("Cut Type") + 
   ylab("Percentage") +
   theme_minimal()

# create company income statement
category <- c("Sales", "Services", "Fixed Costs", 
              "Variable Costs", "Taxes")
amount <- c(101000, 52000, -23000, -15000, -10000)
income <- data.frame(category, amount)

waterfalls::waterfall(income, 
          calc_total=TRUE, 
          total_axis_text = "Net",
          total_rect_text_color="black",
          total_rect_color="goldenrod1") +
  scale_y_continuous(label=scales::dollar) +
  labs(title = "West Coast Profit and Loss", 
       subtitle = "Year 2017",
       y="", 
       x="") +
  theme_minimal() 

balance <- data.frame(desc = c("Starting Cash",
     "Sales", "Refunds", "Payouts", "Court Losses",
     "Court Wins", "Contracts", "End Cash"), 
     amount = c(2000,
     3400, -1100, -100, -6600, 3800, 1400, 2800))
balance$desc <- factor(balance$desc, levels = balance$desc)
balance$id <- seq_along(balance$amount)
balance$type <- ifelse(balance$amount > 0, "in","out")
balance[balance$desc %in% c("Starting Cash", "End Cash"),"type"] <- "net"

balance$end <- cumsum(balance$amount)
balance$end <- c(head(balance$end, -1), 0)
balance$start <- c(0, head(balance$end, -1))
balance <- balance[, c(3, 1, 4, 6, 5, 2)]

# id          desc type start   end amount
# 1  1 Starting Cash  net     0  2000   2000
# 2  2         Sales   in  2000  5400   3400
# 3  3       Refunds  out  5400  4300  -1100
# 4  4       Payouts  out  4300  4200   -100
# 5  5  Court Losses  out  4200 -2400  -6600
# 6  6    Court Wins   in -2400  1400   3800
# 7  7     Contracts   in  1400  2800   1400
# 8  8      End Cash  net  2800     0   2800


# ggplot(balance, aes(desc, fill = type)) + 
#   geom_rect(aes(x = desc,xmin = id - 0.45, xmax = id + 0.45, ymin = end,
#      ymax = start))
# balance$type <- factor(balance$type, levels = c("out","in", "net"))

strwr <- function(str) gsub(" ", "\n", str)

p1 <- ggplot(balance, aes(fill = type)) + 
  geom_rect(aes(x = desc,
                xmin = id - 0.45, 
                xmax = id + 0.45, 
                ymin = end,
                ymax = start)) + 
  scale_y_continuous("", labels = scales::comma) +
     scale_x_discrete("", breaks = levels(balance$desc),
         labels = strwr(levels(balance$desc))) +
     theme(legend.position = "none")

p1 + geom_text(data = balance[balance$type == "in",], 
               aes(id,end, 
                   label = scales::comma(amount)), 
               vjust = 1, 
               size = 3) +
     geom_text(data = balance[balance$type == "out",], aes(id,
         end, label = scales::comma(amount)), vjust = -0.3,
         size = 3) + 
  geom_text(data = subset(balance,
     type == "net" & id == min(id)), aes(id, end,
     colour = type, label = scales::comma(end), vjust = ifelse(end <
         start, 1, -0.3)), size = 3.5) + 
  geom_text(data = subset(balance,
     type == "net" & id == max(id)), aes(id, start,
     colour = type, label = scales::comma(start), vjust = ifelse(end <
         start, -0.3, 1)), size = 3.5)

3. Relationship

3.1 Scatter

# A basic scatterplot with color depending on Species
p1 <- ggplot(iris, aes(x=Sepal.Length, y=Sepal.Width, color=Species)) + 
    geom_point(size=3)

p2 <- ggplot(iris, aes(x=Sepal.Length, y=Sepal.Width)) +
  geom_point() +
  geom_smooth(method=lm , color="red", fill="#69b3a2", se=TRUE) 

gridExtra::grid.arrange(p1,p2,nrow=2)

3.2 Correlation

# Quick display of two cabapilities of GGally, to assess the distribution and correlation of variables 
library(GGally)
 
# Create data 
data(flea)
ggpairs(flea, columns = 2:4, ggplot2::aes(colour=species)) 

3.3 Combined Bar and line

# Build dummy data
data <- data.frame(
  day = as.Date("2019-01-01") + 0:99,
  temperature = runif(100) + seq(1,100)^2.5 / 10000,
  price = runif(100) + seq(100,1)^1.5 / 10
)

# Value used to transform the data
coeff <- 10

# A few constants
temperatureColor <- "#69b3a2"
priceColor <- rgb(0.2, 0.6, 0.9, 1)

ggplot(head(data, 80), aes(x=day)) +
  
  geom_bar( aes(y=temperature), stat="identity", size=.1, fill=temperatureColor, color="black", alpha=.4) + 
  geom_line( aes(y=price / coeff), size=2, color=priceColor) +
  
  scale_y_continuous(
    
    # Features of the first axis
    name = "Temperature (Celsius °)",
    
    # Add a second axis and specify its features
    sec.axis = sec_axis(~.*coeff, name="Price ($)")
  ) + 
  
  theme(
    axis.title.y = element_text(color = temperatureColor, size=13),
    axis.title.y.right = element_text(color = priceColor, size=13)
  ) +

  ggtitle("Temperature down, price up")

3.6 Evolution (Change over time)

# Load dataset from github
data <- read.table("https://raw.githubusercontent.com/holtzy/data_to_viz/master/Example_dataset/3_TwoNumOrdered.csv", header=T)
data$date <- as.Date(data$date)

# plot
data %>% 
  ggplot( aes(x=date, y=value)) +
    geom_line(color="#69b3a2") +
    ylim(0,22000) +
    annotate(geom="text", x=as.Date("2017-01-01"), y=20089, 
             label="Bitcoin price reached 20k $\nat the end of 2017") +
    annotate(geom="point", x=as.Date("2017-12-17"), y=20089, 
             size=10, shape=21, fill="transparent") +
    geom_hline(yintercept=5000, color="orange", size=.5)

library(dygraphs)
library(xts)          # To make the convertion data-frame / xts format
 
# Create data 
data <- data.frame(
  time=seq(from=Sys.Date()-40, to=Sys.Date(), by=1 ), 
  value=runif(41)
)

# Double check time is at the date format
str(data$time)
##  Date[1:41], format: "2020-04-18" "2020-04-19" "2020-04-20" "2020-04-21" "2020-04-22" ...
# Switch to XTS format
data <- xts(x = data$value, order.by = data$time)
 
# Default = line plot --> See chart #316
 
# Add points
p1 <- dygraph(data) %>%
  dyOptions( drawPoints = TRUE, pointSize = 4 )
p1
p2 <- dygraph(data) %>%
  dyOptions( fillGraph=TRUE )
trend <- sin(seq(1,41))+runif(41)
data <- data.frame(
  time=seq(from=Sys.Date()-40, to=Sys.Date(), by=1 ), 
  trend=trend, 
  max=trend+abs(rnorm(41)), 
  min=trend-abs(rnorm(41, sd=1))
)

# switch to xts format
data <- xts(x = data[,-1], order.by = data$time)

# Plot
p3 <- dygraph(data) %>%
  dySeries(c("min", "trend", "max"))


p3
df <- economics %>%
  select(date, psavert, uempmed) %>%
  gather(key = "variable", value = "value", -date)

# Multiple line plot
p1 <- ggplot(df, aes(x = date, y = value)) + 
  geom_line(aes(color = variable), size = 1) +
  scale_color_manual(values = c("#00AFBB", "#E7B800")) +
  theme_minimal() 
# Area plot
p2 <- ggplot(df, aes(x = date, y = value)) + 
  geom_area(aes(color = variable, fill = variable), 
            alpha = 0.5, position = position_dodge(0.8)) +
  scale_color_manual(values = c("#00AFBB", "#E7B800")) +
  scale_fill_manual(values = c("#00AFBB", "#E7B800")) +
  theme_minimal()

gridExtra::grid.arrange(p1,p2,nrow=2)

3.7 Calendar-Heatmap

library(lubridate) # for easy date manipulation

amznStock = as.data.frame(tidyquant::tq_get(c("AMZN"),get="stock.prices")) # get data using tidyquant
amznStock = amznStock[year(amznStock$date) > 2017, ] # Using data only after 2012 

amznStock$weekday = as.POSIXlt(amznStock$date)$wday #finding the day no. of the week
amznStock$weekdayf<-factor(amznStock$weekday,levels=rev(1:7),labels=rev(c("Mon","Tue","Wed","Thu","Fri","Sat","Sun")),ordered=TRUE) # converting the day no. to factor

amznStock$monthf<-factor(month(amznStock$date),levels=as.character(1:12),labels=c("Jan","Feb","Mar","Apr","May","Jun","Jul","Aug","Sep","Oct","Nov","Dec"),ordered=TRUE) # finding the month

amznStock$week <- as.numeric(format(amznStock$date,"%W")) # finding the week of the year for each date

amznStock$day <- lubridate::day(amznStock$date)

p <- ggplot(amznStock, aes(monthf, day, fill = amznStock$adjusted)) + 
    geom_tile(colour = "white") + facet_grid(year(amznStock$date)~ .) + scale_fill_gradient(low="red", high="green") +  xlab("Month") + ylab("") + ggtitle("Time-Series Calendar Heatmap: AMZN Stock Prices") + labs(fill = "Price")

p

stock.data <- transform(amznStock,
  week = as.POSIXlt(amznStock$date)$yday %/% 7 + 1,
  wday = as.POSIXlt(amznStock$date)$wday,
  year = as.POSIXlt(amznStock$date)$year + 1900)

library(ggplot2)

ggplot(stock.data, aes(week, wday, fill = adjusted)) + 

  geom_tile(colour = "white") + 

  scale_fill_gradientn(colours = c("#D61818","#FFAE63","#FFFFBD","#B5E384")) + 

  facet_wrap(~ year, ncol = 1)

4 Spatial

4.1 Map

# Load the library

# Note: if you do not already installed it, install it with:
# install.packages("leaflet")

# Background 1: NASA
# m <- leaflet() %>% 
#    addTiles() %>% 
#    setView( lng = 2.34, lat = 48.85, zoom = 5 ) %>% 
#    addProviderTiles("NASAGIBS.ViirsEarthAtNight2012")
# m
 
# Background 2: World Imagery
m <- leaflet() %>% 
   addTiles() %>% 
   setView( lng = 2.34, lat = 48.85, zoom = 3 ) %>% 
   addProviderTiles("Esri.WorldImagery")
m
data("world.cities")

df <- world.cities %>% filter(country.etc=="Australia")
## define a palette for hte colour
pal <- colorNumeric(palette = "YlOrRd",
                    domain = df$pop)

leaflet(data = df) %>%
    addTiles() %>%
    addCircleMarkers(lat = ~lat, lng = ~long, popup = ~name, 
                     color = ~pal(pop), stroke = FALSE, fillOpacity = 0.6) %>%
    addLegend(position = "bottomleft", pal = pal, values = ~pop)
leaflet(data = df) %>%
    addTiles() %>%
    addMarkers(lat = ~lat, lng = ~long, popup = ~name, 
                     label=~ as.character(pop),clusterOptions = 
                 markerOptions()) %>%
    addLegend(position = "bottomleft", pal = pal, values = ~pop)
  • To be continued

5 Others

5.1 qqplot

Avatar
Ray Sun
Data Analytics Professional

My interests include AI/ML and data analytics.

Related