This exercise is inspired by the article “Introduction to Network Analysis with R” by Jesse Sadler. <a https://www.jessesadler.com/post/network-analysis-with-r/
The exercise uses a modified dataset originally from U.S. Department of Transportation’s (DOT) Bureau of Transportation Statistics.
The modified dataset contains flights related data for both departing and arriving flights for two NY airports(“JFK”,“LGA”), for year 2015.
The purpose of this exercise is to create network data from the given dataset and generate a visual representation for the connecting airports.
The following are the learning outcome from this exercise:
how to convert data to network data structure (edges and nodes)
how to create D3 SankeyNetwork plot
library(tidyverse)
library(visNetwork)
library(networkD3)
flights <- read.csv("prepdata_NY_20Jan.csv", header=TRUE, sep=",")
str(flights)
## 'data.frame': 392979 obs. of 33 variables:
## $ X : int 1 2 3 4 5 6 7 8 9 10 ...
## $ DATE : chr "2015-01-01" "2015-01-02" "2015-01-03" "2015-01-04" ...
## $ AIRLINE : chr "AA" "AA" "AA" "AA" ...
## $ FLIGHT_NUMBER : int 1 1 1 1 1 1 1 1 1 1 ...
## $ TAIL_NUMBER : chr "N787AA" "N795AA" "N788AA" "N791AA" ...
## $ ORIGIN_AIRPORT : chr "JFK" "JFK" "JFK" "JFK" ...
## $ DESTINATION_AIRPORT: chr "LAX" "LAX" "LAX" "LAX" ...
## $ SCHEDULED_DEPARTURE: int 900 900 900 900 900 900 900 900 900 900 ...
## $ DEPARTURE_TIME : int 855 850 853 853 853 856 859 856 901 903 ...
## $ DEPARTURE_DELAY : int -5 -10 -7 -7 -7 -4 -1 -4 1 3 ...
## $ TAXI_OUT : int 17 15 15 14 27 85 29 26 43 37 ...
## $ WHEELS_OFF : int 912 905 908 907 920 1021 928 922 944 940 ...
## $ SCHEDULED_TIME : int 390 390 390 390 390 395 395 395 395 395 ...
## $ ELAPSED_TIME : int 402 381 358 385 389 424 382 362 400 392 ...
## $ AIR_TIME : int 378 357 330 352 338 335 341 333 353 345 ...
## $ DISTANCE : int 2475 2475 2475 2475 2475 2475 2475 2475 2475 2475 ...
## $ WHEELS_ON : int 1230 1202 1138 1159 1158 1256 1209 1155 1237 1225 ...
## $ TAXI_IN : int 7 9 13 19 24 4 12 3 4 10 ...
## $ SCHEDULED_ARRIVAL : int 1230 1230 1230 1230 1230 1235 1235 1235 1235 1235 ...
## $ ARRIVAL_TIME : int 1237 1211 1151 1218 1222 1300 1221 1158 1241 1235 ...
## $ ARRIVAL_DELAY : int 7 -19 -39 -12 -8 25 -14 -37 6 0 ...
## $ DIVERTED : int 0 0 0 0 0 0 0 0 0 0 ...
## $ CANCELLED : int 0 0 0 0 0 0 0 0 0 0 ...
## $ CANCELLATION_REASON: chr "" "" "" "" ...
## $ AIR_SYSTEM_DELAY : int NA NA NA NA NA 25 NA NA NA NA ...
## $ SECURITY_DELAY : int NA NA NA NA NA 0 NA NA NA NA ...
## $ AIRLINE_DELAY : int NA NA NA NA NA 0 NA NA NA NA ...
## $ LATE_AIRCRAFT_DELAY: int NA NA NA NA NA 0 NA NA NA NA ...
## $ WEATHER_DELAY : int NA NA NA NA NA 0 NA NA NA NA ...
## $ RISK_INDX : int 1 42 75 45 92 45 33 73 69 81 ...
## $ ASSESS_CAT : chr "AV" "WH" "TT" "OT" ...
## $ SEC_STATE : logi NA NA NA NA NA NA ...
## $ SEC_LvL : int 3 3 3 3 3 3 3 3 3 3 ...
sources <- flights %>%
distinct(ORIGIN_AIRPORT) %>%
rename(label = ORIGIN_AIRPORT)
destinations <- flights %>%
distinct(DESTINATION_AIRPORT) %>%
rename(label = DESTINATION_AIRPORT)
nodes <- full_join(sources, destinations, by = "label")
nodes <- nodes %>% rowid_to_column("id")
head(nodes)
## id label
## 1 1 JFK
## 2 2 LAX
## 3 3 SFO
## 4 4 LGA
## 5 5 MCO
## 6 6 LAS
per_route <- flights %>%
group_by(ORIGIN_AIRPORT, DESTINATION_AIRPORT) %>%
summarise(weight = n()) %>%
ungroup()
per_route
## # A tibble: 268 x 3
## ORIGIN_AIRPORT DESTINATION_AIRPORT weight
## <chr> <chr> <int>
## 1 ABQ JFK 282
## 2 ACK JFK 343
## 3 AGS LGA 1
## 4 ATL JFK 1945
## 5 ATL LGA 8375
## 6 AUS JFK 1604
## 7 BGR LGA 56
## 8 BHM LGA 498
## 9 BNA JFK 337
## 10 BNA LGA 2513
## # ... with 258 more rows
edges <- per_route %>%
left_join(nodes, by = c("ORIGIN_AIRPORT" = "label")) %>%
rename(from = id)
edges <- edges %>%
left_join(nodes, by = c("DESTINATION_AIRPORT" = "label")) %>%
rename(to = id)
edges <- select(edges, from, to, weight)
edges
## # A tibble: 268 x 3
## from to weight
## <int> <int> <int>
## 1 29 1 282
## 2 51 1 343
## 3 82 4 1
## 4 55 1 1945
## 5 55 4 8375
## 6 11 1 1604
## 7 67 4 56
## 8 76 4 498
## 9 74 1 337
## 10 74 4 2513
## # ... with 258 more rows
nodes_d3 <- mutate(nodes, id = id - 1)
edges_d3 <- mutate(edges, from = from - 1, to = to - 1)
sankeyNetwork(Links = edges_d3, Nodes = nodes_d3, Source = "from", Target = "to",
NodeID = "label", Value = "weight", fontSize = 16, unit = "flight(s)")
Also see the link for the bigger plot.
@end