# Case study: data visualization on flight delays using tidyverse tools

``library(tidyverse)  # data wrangling``

``````library(tidyverse)
library(nycflights13)

data("flights")``````

See here

# 4 Solutions

## 4.1 Plot the distribution of the delays. Describe your insights.

``````# Histogram of dep_delay across all flights
ggplot(flights, aes(x = dep_delay)) +
  geom_histogram()``````

Alternatively:

``````# Same variable, drawn as a density curve instead of binned counts
ggplot(data = flights, mapping = aes(x = dep_delay)) +
  geom_density()``````

The distribution is skewed to the right. Some flights are extremely delayed compared to the majority.

## 4.2 Plot the distribution of the delays per origin airport.

``````# One density panel of dep_delay per origin airport
ggplot(flights, aes(x = dep_delay)) +
  geom_density() +
  facet_wrap(vars(origin))``````

## 4.3 Visualize the association of delay and time of the day. Find a way to reduce overplotting.

Hint: Try out `geom_bin2d()` or `geom_density2d()` instead of using a scatter plot.

``````# 2D density contours of dep_delay against dep_time
ggplot(flights, aes(x = dep_time, y = dep_delay)) +
  geom_density_2d()``````

``````# Same contours, with a smoothing line layered on top
ggplot(flights, aes(x = dep_time, y = dep_delay)) +
  geom_density_2d() +
  geom_smooth()``````

Alternatively:

``````# Rectangular 2D bin counts, with a linear ("lm") smoothing line on top
ggplot(flights, aes(x = dep_time, y = dep_delay)) +
  geom_bin_2d() +
  geom_smooth(method = "lm")``````

## 4.4 Visualize the association of delay and distance to destination. Separate by origin and month.

``````# Contours of dep_delay against distance; rows = origin, columns = month
ggplot(flights, aes(x = distance, y = dep_delay)) +
  geom_density_2d() +
  facet_grid(rows = vars(origin), cols = vars(month))``````

## 4.5 Visualize the association of delay and time of the day. Only include the three airlines where the delay is highest.

Reduce overplotting.

``````# Mean dep_delay per carrier, sorted descending; keep the first three rows
flights %>%
  group_by(carrier) %>%
  summarise(dep_delay_carrier = mean(dep_delay, na.rm = TRUE)) %>%
  arrange(desc(dep_delay_carrier)) %>%
  head(n = 3)
#> # A tibble: 3 x 2
#>   carrier dep_delay_carrier
#>   <chr>               <dbl>
#> 1 F9                   20.2
#> 2 EV                   20.0
#> 3 YV                   19.0``````
``````# Keep only the three carriers listed in the table above, then draw contours
flights %>%
  filter(carrier %in% c("F9", "EV", "YV")) %>%
  ggplot(aes(x = dep_time, y = dep_delay)) +
  geom_density_2d()``````

## 4.6 Visualize the proportion of delayed flights per origin.

``````# Share of delayed flights (dep_delay > 0) per origin airport, as a bar chart
flights %>%
  mutate(is_delayed = dep_delay > 0) %>%
  group_by(origin) %>%
  summarise(
    # Number of delayed flights; rows with a missing dep_delay are skipped
    delay_n = sum(is_delayed, na.rm = TRUE),
    # Proportion among flights with a known delay only. Dividing by n()
    # would also count rows where dep_delay is NA and thereby treat them
    # as "not delayed", pulling the proportion down.
    delay_prop = mean(is_delayed, na.rm = TRUE)
  ) %>%
  ggplot() +
  aes(x = origin, y = delay_prop) +
  geom_col()
``````

Alternatively:

``````# Flight counts per origin, bars split (stacked) by delay status
flights %>%
  mutate(is_delayed = dep_delay > 0) %>%
  group_by(origin) %>%
  ggplot(mapping = aes(x = origin, fill = is_delayed)) +
  geom_bar()``````

Or even this way:

``````# Delayed vs. not-delayed share per origin; each bar is scaled to 1
flights %>%
  mutate(is_delayed = dep_delay > 0) %>%
  group_by(origin) %>%
  # Drop rows without a delay status: otherwise NA shows up as a third fill
  # category and takes a slice out of every bar, distorting the shares
  drop_na(is_delayed) %>%
  ggplot() +
  aes(x = origin, fill = is_delayed) +
  geom_bar(position = "fill")``````

## 4.7 Visualize the proportion of delayed flights per time of the day

``````# Delayed vs. not-delayed share per origin, one panel per value of `hour`;
# rows with a missing delay status or origin are removed first
flights %>%
  mutate(is_delayed = dep_delay > 0) %>%
  group_by(origin) %>%
  drop_na(is_delayed, origin) %>%
  ggplot(mapping = aes(x = origin, fill = is_delayed)) +
  geom_bar(position = "fill") +
  facet_wrap(vars(hour)) +
  scale_fill_viridis_d()  # the "_d" suffix selects the discrete scale``````

## 4.8 Visualize the proportion of delayed flights per week day

There’s a package that does the heavy lifting for us when it comes to working with times and dates:

``library(lubridate)``
``````# Delayed vs. not-delayed share per origin, one panel per day of the week
flights %>%
  mutate(is_delayed = dep_delay > 0) %>%
  # label = TRUE returns weekday names (an ordered factor) instead of the
  # numeric codes 1-7, so the facet strips can be read without knowing
  # which day the week starts on
  mutate(day_of_week = wday(time_hour, label = TRUE)) %>%
  group_by(origin) %>%
  drop_na(is_delayed, origin) %>%
  ggplot() +
  aes(x = origin, fill = is_delayed) +
  geom_bar(position = "fill") +
  facet_wrap(~ day_of_week) +
  scale_fill_viridis_d()  # d as in "discrete"``````

# 5 Reproducibility

``````#> ─ Session info ───────────────────────────────────────────────────────────────────────────────────────────────────────
#>  setting  value
#>  version  R version 4.0.2 (2020-06-22)
#>  os       macOS  10.16
#>  system   x86_64, darwin17.0
#>  ui       X11
#>  language (EN)
#>  collate  en_US.UTF-8
#>  ctype    en_US.UTF-8
#>  tz       Europe/Berlin
#>  date     2021-02-24
#>
#> ─ Packages ───────────────────────────────────────────────────────────────────────────────────────────────────────────
#>  package     * version     date       lib source
#>  assertthat    0.2.1       2019-03-21 [1] CRAN (R 4.0.0)
#>  backports     1.2.1       2020-12-09 [1] CRAN (R 4.0.2)
#>  blogdown      1.1         2021-01-19 [1] CRAN (R 4.0.2)
#>  bookdown      0.21.6      2021-02-02 [1] Github (rstudio/bookdown@6c7346a)
#>  broom         0.7.5       2021-02-19 [1] CRAN (R 4.0.2)
#>  bslib         0.2.4.9000  2021-02-02 [1] Github (rstudio/bslib@b3cd7a9)
#>  cachem        1.0.4       2021-02-13 [1] CRAN (R 4.0.2)
#>  callr         3.5.1       2020-10-13 [1] CRAN (R 4.0.2)
#>  cellranger    1.1.0       2016-07-27 [1] CRAN (R 4.0.0)
#>  cli           2.3.1       2021-02-23 [1] CRAN (R 4.0.2)
#>  codetools     0.2-16      2018-12-24 [2] CRAN (R 4.0.2)
#>  colorspace    2.0-0       2020-11-11 [1] CRAN (R 4.0.2)
#>  crayon        1.4.1       2021-02-08 [1] CRAN (R 4.0.2)
#>  DBI           1.1.1       2021-01-15 [1] CRAN (R 4.0.2)
#>  dbplyr        2.1.0       2021-02-03 [1] CRAN (R 4.0.2)
#>  debugme       1.1.0       2017-10-22 [1] CRAN (R 4.0.0)
#>  desc          1.2.0       2018-05-01 [1] CRAN (R 4.0.0)
#>  devtools      2.3.2       2020-09-18 [1] CRAN (R 4.0.2)
#>  digest        0.6.27      2020-10-24 [1] CRAN (R 4.0.2)
#>  dplyr       * 1.0.4       2021-02-02 [1] CRAN (R 4.0.2)
#>  ellipsis      0.3.1       2020-05-15 [1] CRAN (R 4.0.0)
#>  evaluate      0.14        2019-05-28 [1] CRAN (R 4.0.0)
#>  fansi         0.4.2       2021-01-15 [1] CRAN (R 4.0.2)
#>  fastmap       1.1.0       2021-01-25 [1] CRAN (R 4.0.2)
#>  forcats     * 0.5.1       2021-01-27 [1] CRAN (R 4.0.2)
#>  fs            1.5.0       2020-07-31 [1] CRAN (R 4.0.2)
#>  generics      0.1.0       2020-10-31 [1] CRAN (R 4.0.2)
#>  ggplot2     * 3.3.3       2020-12-30 [1] CRAN (R 4.0.2)
#>  glue          1.4.2       2020-08-27 [1] CRAN (R 4.0.2)
#>  gtable        0.3.0       2019-03-25 [1] CRAN (R 4.0.0)
#>  haven         2.3.1       2020-06-01 [1] CRAN (R 4.0.0)
#>  hms           1.0.0       2021-01-13 [1] CRAN (R 4.0.2)
#>  htmltools     0.5.1.1     2021-01-22 [1] CRAN (R 4.0.2)
#>  httr          1.4.2       2020-07-20 [1] CRAN (R 4.0.2)
#>  jquerylib     0.1.3       2020-12-17 [1] CRAN (R 4.0.2)
#>  jsonlite      1.7.2       2020-12-09 [1] CRAN (R 4.0.2)
#>  knitr         1.31        2021-01-27 [1] CRAN (R 4.0.2)
#>  lifecycle     1.0.0       2021-02-15 [1] CRAN (R 4.0.2)
#>  lubridate     1.7.9.2     2020-11-13 [1] CRAN (R 4.0.2)
#>  magrittr      2.0.1       2020-11-17 [1] CRAN (R 4.0.2)
#>  memoise       2.0.0       2021-01-26 [1] CRAN (R 4.0.2)
#>  modelr        0.1.8       2020-05-19 [1] CRAN (R 4.0.0)
#>  munsell       0.5.0       2018-06-12 [1] CRAN (R 4.0.0)
#>  pillar        1.5.0       2021-02-22 [1] CRAN (R 4.0.2)
#>  pkgbuild      1.2.0       2020-12-15 [1] CRAN (R 4.0.2)
#>  pkgconfig     2.0.3       2019-09-22 [1] CRAN (R 4.0.0)
#>  pkgload       1.2.0       2021-02-23 [1] CRAN (R 4.0.2)
#>  prettyunits   1.1.1       2020-01-24 [1] CRAN (R 4.0.0)
#>  processx      3.4.5       2020-11-30 [1] CRAN (R 4.0.2)
#>  ps            1.5.0       2020-12-05 [1] CRAN (R 4.0.2)
#>  purrr       * 0.3.4       2020-04-17 [1] CRAN (R 4.0.0)
#>  R6            2.5.0       2020-10-28 [1] CRAN (R 4.0.2)
#>  Rcpp          1.0.6       2021-01-15 [1] CRAN (R 4.0.2)
#>  readr       * 1.4.0       2020-10-05 [1] CRAN (R 4.0.2)
#>  readxl        1.3.1       2019-03-13 [1] CRAN (R 4.0.0)
#>  remotes       2.2.0       2020-07-21 [1] CRAN (R 4.0.2)
#>  reprex        1.0.0       2021-01-27 [1] CRAN (R 4.0.2)
#>  rlang         0.4.10      2020-12-30 [1] CRAN (R 4.0.2)
#>  rmarkdown     2.7         2021-02-19 [1] CRAN (R 4.0.2)
#>  rprojroot     2.0.2       2020-11-15 [1] CRAN (R 4.0.2)
#>  rstudioapi    0.13.0-9000 2021-02-11 [1] Github (rstudio/rstudioapi@9d21f50)
#>  rvest         0.3.6       2020-07-25 [1] CRAN (R 4.0.2)
#>  sass          0.3.1       2021-01-24 [1] CRAN (R 4.0.2)
#>  scales        1.1.1       2020-05-11 [1] CRAN (R 4.0.0)
#>  sessioninfo   1.1.1       2018-11-05 [1] CRAN (R 4.0.0)
#>  stringi       1.5.3       2020-09-09 [1] CRAN (R 4.0.2)
#>  stringr     * 1.4.0       2019-02-10 [1] CRAN (R 4.0.0)
#>  testthat      3.0.2       2021-02-14 [1] CRAN (R 4.0.2)
#>  tibble      * 3.0.6       2021-01-29 [1] CRAN (R 4.0.2)
#>  tidyr       * 1.1.2       2020-08-27 [1] CRAN (R 4.0.2)
#>  tidyselect    1.1.0       2020-05-11 [1] CRAN (R 4.0.0)
#>  tidyverse   * 1.3.0       2019-11-21 [1] CRAN (R 4.0.0)
#>  usethis       2.0.1       2021-02-10 [1] CRAN (R 4.0.2)
#>  utf8          1.1.4       2018-05-24 [1] CRAN (R 4.0.0)
#>  vctrs         0.3.6       2020-12-17 [1] CRAN (R 4.0.2)
#>  withr         2.4.1       2021-01-26 [1] CRAN (R 4.0.2)
#>  xfun          0.21        2021-02-10 [1] CRAN (R 4.0.2)
#>  xml2          1.3.2       2020-04-23 [1] CRAN (R 4.0.0)
#>  yaml          2.2.1       2020-02-01 [1] CRAN (R 4.0.0)
#>
#> [1] /Users/sebastiansaueruser/Rlibs
#> [2] /Library/Frameworks/R.framework/Versions/4.0/Resources/library``````