Introduction

This exercise is inspired by the article “Introduction to Network Analysis with R” by Jesse Sadler. <a https://www.jessesadler.com/post/network-analysis-with-r/

The exercise uses a modified dataset originally from U.S. Department of Transportation’s (DOT) Bureau of Transportation Statistics.

The modified dataset contains flights related data for both departing and arriving flights for two NY airports(“JFK”,“LGA”), for year 2015.

The purpose of this exercise is to create network data from the given dataset and generate a visual representation for the connecting airports.

Learning outcomes

The following are the learning outcome from this exercise:

Import library

library(tidyverse)
library(visNetwork)
library(networkD3)

Import data and check data structure

flights <- read.csv("prepdata_NY_20Jan.csv", header=TRUE, sep=",")
str(flights)
## 'data.frame':    392979 obs. of  33 variables:
##  $ X                  : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ DATE               : chr  "2015-01-01" "2015-01-02" "2015-01-03" "2015-01-04" ...
##  $ AIRLINE            : chr  "AA" "AA" "AA" "AA" ...
##  $ FLIGHT_NUMBER      : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ TAIL_NUMBER        : chr  "N787AA" "N795AA" "N788AA" "N791AA" ...
##  $ ORIGIN_AIRPORT     : chr  "JFK" "JFK" "JFK" "JFK" ...
##  $ DESTINATION_AIRPORT: chr  "LAX" "LAX" "LAX" "LAX" ...
##  $ SCHEDULED_DEPARTURE: int  900 900 900 900 900 900 900 900 900 900 ...
##  $ DEPARTURE_TIME     : int  855 850 853 853 853 856 859 856 901 903 ...
##  $ DEPARTURE_DELAY    : int  -5 -10 -7 -7 -7 -4 -1 -4 1 3 ...
##  $ TAXI_OUT           : int  17 15 15 14 27 85 29 26 43 37 ...
##  $ WHEELS_OFF         : int  912 905 908 907 920 1021 928 922 944 940 ...
##  $ SCHEDULED_TIME     : int  390 390 390 390 390 395 395 395 395 395 ...
##  $ ELAPSED_TIME       : int  402 381 358 385 389 424 382 362 400 392 ...
##  $ AIR_TIME           : int  378 357 330 352 338 335 341 333 353 345 ...
##  $ DISTANCE           : int  2475 2475 2475 2475 2475 2475 2475 2475 2475 2475 ...
##  $ WHEELS_ON          : int  1230 1202 1138 1159 1158 1256 1209 1155 1237 1225 ...
##  $ TAXI_IN            : int  7 9 13 19 24 4 12 3 4 10 ...
##  $ SCHEDULED_ARRIVAL  : int  1230 1230 1230 1230 1230 1235 1235 1235 1235 1235 ...
##  $ ARRIVAL_TIME       : int  1237 1211 1151 1218 1222 1300 1221 1158 1241 1235 ...
##  $ ARRIVAL_DELAY      : int  7 -19 -39 -12 -8 25 -14 -37 6 0 ...
##  $ DIVERTED           : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ CANCELLED          : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ CANCELLATION_REASON: chr  "" "" "" "" ...
##  $ AIR_SYSTEM_DELAY   : int  NA NA NA NA NA 25 NA NA NA NA ...
##  $ SECURITY_DELAY     : int  NA NA NA NA NA 0 NA NA NA NA ...
##  $ AIRLINE_DELAY      : int  NA NA NA NA NA 0 NA NA NA NA ...
##  $ LATE_AIRCRAFT_DELAY: int  NA NA NA NA NA 0 NA NA NA NA ...
##  $ WEATHER_DELAY      : int  NA NA NA NA NA 0 NA NA NA NA ...
##  $ RISK_INDX          : int  1 42 75 45 92 45 33 73 69 81 ...
##  $ ASSESS_CAT         : chr  "AV" "WH" "TT" "OT" ...
##  $ SEC_STATE          : logi  NA NA NA NA NA NA ...
##  $ SEC_LvL            : int  3 3 3 3 3 3 3 3 3 3 ...

Prepapre Nodes data

sources <- flights %>%
  distinct(ORIGIN_AIRPORT) %>%
  rename(label = ORIGIN_AIRPORT)

destinations <- flights %>%
  distinct(DESTINATION_AIRPORT) %>%
  rename(label = DESTINATION_AIRPORT)

nodes <- full_join(sources, destinations, by = "label")
nodes <- nodes %>% rowid_to_column("id")
head(nodes)
##   id label
## 1  1   JFK
## 2  2   LAX
## 3  3   SFO
## 4  4   LGA
## 5  5   MCO
## 6  6   LAS

Prepapre Edges data

per_route <- flights %>%  
  group_by(ORIGIN_AIRPORT, DESTINATION_AIRPORT) %>%
  summarise(weight = n()) %>% 
  ungroup()
per_route
## # A tibble: 268 x 3
##    ORIGIN_AIRPORT DESTINATION_AIRPORT weight
##    <chr>          <chr>                <int>
##  1 ABQ            JFK                    282
##  2 ACK            JFK                    343
##  3 AGS            LGA                      1
##  4 ATL            JFK                   1945
##  5 ATL            LGA                   8375
##  6 AUS            JFK                   1604
##  7 BGR            LGA                     56
##  8 BHM            LGA                    498
##  9 BNA            JFK                    337
## 10 BNA            LGA                   2513
## # ... with 258 more rows
edges <- per_route %>% 
  left_join(nodes, by = c("ORIGIN_AIRPORT" = "label")) %>% 
  rename(from = id)

edges <- edges %>% 
  left_join(nodes, by = c("DESTINATION_AIRPORT" = "label")) %>% 
  rename(to = id)

edges <- select(edges, from, to, weight)
edges
## # A tibble: 268 x 3
##     from    to weight
##    <int> <int>  <int>
##  1    29     1    282
##  2    51     1    343
##  3    82     4      1
##  4    55     1   1945
##  5    55     4   8375
##  6    11     1   1604
##  7    67     4     56
##  8    76     4    498
##  9    74     1    337
## 10    74     4   2513
## # ... with 258 more rows

Create D3 SankeyNetwork plot

nodes_d3 <- mutate(nodes, id = id - 1)
edges_d3 <- mutate(edges, from = from - 1, to = to - 1)

sankeyNetwork(Links = edges_d3, Nodes = nodes_d3, Source = "from", Target = "to", 
              NodeID = "label", Value = "weight", fontSize = 16, unit = "flight(s)")

Also see the link for the bigger plot.

@end