0.0.1 How fast is fast?
Let’s see how quickly a predictive model runs, in order to estimate the time needed for larger machine learning pipelines. In addition, let’s see how much time is saved when using multiple cores, i.e., when parallel processing is enabled.
Let’s use a Ranger (Random Forest) as learner.
0.0.2 Tidymodels pipeline
Let’s copy this template in order to not have to type all the verbose Tidymodels code.
0.0.3 Setup
# Setup ----
library(tidymodels)
library(tidyverse)
library(tictoc) # timing of code sections via tic()/toc()

# Data ----
# Penguins data with every incomplete row dropped.
d <- na.omit(palmerpenguins::penguins)

# Seeded train/test split; no proportion is given, so initial_split() uses
# its default.
set.seed(42)
d_split <- initial_split(d)
d_train <- training(d_split)
d_test <- testing(d_split)
0.0.4 Simple Fit
# Model specification: random forest for a numeric outcome.
mod1 <- rand_forest(mode = "regression")

# Resampling scheme: seeded v-fold cross-validation on the training data.
set.seed(42)
rsmpl <- vfold_cv(d_train)

# Preprocessing: model body mass from all other columns, dummy-code the
# nominal predictors, then normalize every predictor.
rec1 <- recipe(body_mass_g ~ ., data = d_train) |>
  step_dummy(all_nominal_predictors()) |>
  step_normalize(all_predictors())

# Workflow: bundle the model specification and the recipe.
wf1 <- workflow() |>
  add_model(mod1) |>
  add_recipe(rec1)

# Time a single fit on the full training set.
tic()
wf1_fit <- fit(wf1, data = d_train)
toc()
#> 0.224 sec elapsed
0.0.5 Resampling
10-fold CV
# Resampling fit: time how long it takes to fit wf1 on every fold in rsmpl.
# fit_resamples() is documented to take a control_resamples() object, so that
# constructor is used here rather than control_grid(), which belongs to
# tune_grid().
# NOTE: this assignment replaces the fitted workflow that the previous section
# stored in wf1_fit with the resampling results.
tic()
wf1_fit <-
  wf1 |>
  fit_resamples(
    resamples = rsmpl,
    control = control_resamples(verbose = TRUE)
  )
toc()
#> 2.372 sec elapsed
0.0.6 Tuning
10 candidate values for the tuning parameter (`grid = 10`), 10-fold CV
# Model specification: random forest with mtry marked for tuning.
mod_tune <- rand_forest(mode = "regression", mtry = tune())

# Resampling scheme: seeded v-fold cross-validation on the training data.
set.seed(42)
rsmpl <- vfold_cv(d_train)

# Preprocessing: model body mass from all other columns, dummy-code the
# nominal predictors, then normalize every predictor.
rec1 <- recipe(body_mass_g ~ ., data = d_train) |>
  step_dummy(all_nominal_predictors()) |>
  step_normalize(all_predictors())

# Workflow: bundle the tunable model specification and the recipe.
wf_tune <- workflow() |>
  add_model(mod_tune) |>
  add_recipe(rec1)

# Time a grid search with a grid size of 10 over the resamples.
tic()
wf_tune_fit <- tune_grid(
  wf_tune,
  resamples = rsmpl,
  grid = 10,
  control = control_grid(verbose = FALSE)
)
toc()
#> 12.419 sec elapsed
0.0.7 More tuning params
# Model specification: random forest with mtry marked for tuning.
mod_tune <- rand_forest(mode = "regression", mtry = tune())

# Resampling scheme: seeded v-fold cross-validation on the training data.
set.seed(42)
rsmpl <- vfold_cv(d_train)

# Preprocessing: model body mass from all other columns, dummy-code the
# nominal predictors, then normalize every predictor.
rec1 <- recipe(body_mass_g ~ ., data = d_train) |>
  step_dummy(all_nominal_predictors()) |>
  step_normalize(all_predictors())

# Workflow: bundle the tunable model specification and the recipe.
wf_tune <- workflow() |>
  add_model(mod_tune) |>
  add_recipe(rec1)

# Time a grid search with a requested grid size of 100 over the resamples.
# NOTE(review): mtry is the only tuned parameter, so the number of distinct
# candidates actually evaluated may be far below 100 -- confirm with
# collect_metrics(wf_tune_fit).
tic()
wf_tune_fit <- tune_grid(
  wf_tune,
  resamples = rsmpl,
  grid = 1e2,
  control = control_grid(verbose = FALSE)
)
toc()
#> 16.143 sec elapsed
0.0.8 Parallel processing
# Time the same grid search with allow_par = TRUE set explicitly.
# NOTE(review): no parallel backend is registered before this point in the
# script, so this presumably still runs sequentially -- confirm.
tic()
wf_parallel_fit <- tune_grid(
  wf_tune,
  resamples = rsmpl,
  grid = 1e2,
  control = control_grid(verbose = FALSE, allow_par = TRUE)
)
toc()
#> 16.46 sec elapsed
0.0.9 Parallel processing - explicitly
library(doParallel)

# Start a cluster of 3 worker processes and register it as the parallel
# backend; adjust the number of workers to the machine at hand.
cl <- makeCluster(3)
registerDoParallel(cl)

# Time the same grid search, now with a registered backend.
tic()
wf_parallel_fit <- tune_grid(
  wf_tune,
  resamples = rsmpl,
  grid = 1e2,
  control = control_grid(verbose = FALSE, allow_par = TRUE)
)
toc()
#> 20.697 sec elapsed
Again, no drop in computation time; with the explicit cluster the run even took longer (20.7 sec vs. 16.5 sec). Interesting.
0.0.10 ANOVA race
Can we get a speed-up using an ANOVA race?
library(finetune)

# Time tune_race_anova() on the same workflow, resamples and grid size as the
# grid searches above.
tic()
wf_race_fit <-
  wf_tune |>
  tune_race_anova(
    resamples = rsmpl,
    grid = 1e2,
    control = control_race(
      verbose = FALSE,
      allow_par = TRUE
    )
  )
toc()
#> 9.303 sec elapsed

# Clean-up: this is the last tuning run in the script, so shut down the worker
# processes of the cluster `cl` started for the explicit parallel run and
# switch foreach back to its sequential backend. Without this the workers stay
# alive after the script finishes. The guard keeps this section runnable on
# its own when no cluster object exists.
if (exists("cl") && inherits(cl, "cluster")) {
  parallel::stopCluster(cl)
  foreach::registerDoSEQ()
}
In this run, yes: about 9.3 sec, compared with roughly 16–21 sec for the grid searches above.
The authors also report a benchmark with a juicy speed-up.
0.0.11 Acknowledgements
0.0.12 Reproducibility
#> ─ Session info ───────────────────────────────────────────────────────────────────────────────────────────────────────
#> setting value
#> version R version 4.2.1 (2022-06-23)
#> os macOS Big Sur ... 10.16
#> system x86_64, darwin17.0
#> ui X11
#> language (EN)
#> collate en_US.UTF-8
#> ctype en_US.UTF-8
#> tz Europe/Berlin
#> date 2023-11-15
#> pandoc 3.1.1 @ /Applications/RStudio.app/Contents/Resources/app/quarto/bin/tools/ (via rmarkdown)
#>
#> ─ Packages ───────────────────────────────────────────────────────────────────────────────────────────────────────────
#> package * version date (UTC) lib source
#> backports 1.4.1 2021-12-13 [1] CRAN (R 4.2.0)
#> blogdown 1.18 2023-06-19 [1] CRAN (R 4.2.0)
#> bookdown 0.36 2023-10-16 [1] CRAN (R 4.2.0)
#> boot 1.3-28.1 2022-11-22 [1] CRAN (R 4.2.0)
#> broom * 1.0.5 2023-06-09 [1] CRAN (R 4.2.0)
#> bslib 0.5.1 2023-08-11 [1] CRAN (R 4.2.0)
#> cachem 1.0.8 2023-05-01 [1] CRAN (R 4.2.0)
#> callr 3.7.3 2022-11-02 [1] CRAN (R 4.2.0)
#> class 7.3-22 2023-05-03 [1] CRAN (R 4.2.0)
#> cli 3.6.1 2023-03-23 [1] CRAN (R 4.2.0)
#> codetools 0.2-19 2023-02-01 [1] CRAN (R 4.2.0)
#> colorout * 1.3-0 2023-11-08 [1] Github (jalvesaq/colorout@8384882)
#> colorspace 2.1-0 2023-01-23 [1] CRAN (R 4.2.0)
#> crayon 1.5.2 2022-09-29 [1] CRAN (R 4.2.1)
#> data.table 1.14.8 2023-02-17 [1] CRAN (R 4.2.0)
#> devtools 2.4.5 2022-10-11 [1] CRAN (R 4.2.1)
#> dials * 1.2.0 2023-04-03 [1] CRAN (R 4.2.0)
#> DiceDesign 1.9 2021-02-13 [1] CRAN (R 4.2.0)
#> digest 0.6.33 2023-07-07 [1] CRAN (R 4.2.0)
#> doParallel * 1.0.17 2022-02-07 [1] CRAN (R 4.2.0)
#> dplyr * 1.1.3 2023-09-03 [1] CRAN (R 4.2.0)
#> ellipsis 0.3.2 2021-04-29 [1] CRAN (R 4.2.0)
#> evaluate 0.21 2023-05-05 [1] CRAN (R 4.2.0)
#> fansi 1.0.5 2023-10-08 [1] CRAN (R 4.2.0)
#> fastmap 1.1.1 2023-02-24 [1] CRAN (R 4.2.0)
#> finetune * 1.1.0 2023-04-19 [1] CRAN (R 4.2.0)
#> forcats * 1.0.0 2023-01-29 [1] CRAN (R 4.2.0)
#> foreach * 1.5.2 2022-02-02 [1] CRAN (R 4.2.0)
#> fs 1.6.3 2023-07-20 [1] CRAN (R 4.2.0)
#> furrr 0.3.1 2022-08-15 [1] CRAN (R 4.2.0)
#> future 1.33.0 2023-07-01 [1] CRAN (R 4.2.0)
#> future.apply 1.11.0 2023-05-21 [1] CRAN (R 4.2.0)
#> generics 0.1.3 2022-07-05 [1] CRAN (R 4.2.0)
#> ggplot2 * 3.4.4 2023-10-12 [1] CRAN (R 4.2.0)
#> globals 0.16.2 2022-11-21 [1] CRAN (R 4.2.0)
#> glue 1.6.2 2022-02-24 [1] CRAN (R 4.2.0)
#> gower 1.0.1 2022-12-22 [1] CRAN (R 4.2.0)
#> GPfit 1.0-8 2019-02-08 [1] CRAN (R 4.2.0)
#> gtable 0.3.4 2023-08-21 [1] CRAN (R 4.2.0)
#> hardhat 1.3.0 2023-03-30 [1] CRAN (R 4.2.0)
#> hms 1.1.3 2023-03-21 [1] CRAN (R 4.2.0)
#> htmltools 0.5.6.1 2023-10-06 [1] CRAN (R 4.2.0)
#> htmlwidgets 1.6.2 2023-03-17 [1] CRAN (R 4.2.0)
#> httpuv 1.6.11 2023-05-11 [1] CRAN (R 4.2.0)
#> infer * 1.0.5 2023-09-06 [1] CRAN (R 4.2.0)
#> ipred 0.9-14 2023-03-09 [1] CRAN (R 4.2.0)
#> iterators * 1.0.14 2022-02-05 [1] CRAN (R 4.2.0)
#> jquerylib 0.1.4 2021-04-26 [1] CRAN (R 4.2.0)
#> jsonlite 1.8.7 2023-06-29 [1] CRAN (R 4.2.0)
#> knitr 1.45 2023-10-30 [1] CRAN (R 4.2.1)
#> later 1.3.1 2023-05-02 [1] CRAN (R 4.2.0)
#> lattice 0.21-8 2023-04-05 [1] CRAN (R 4.2.0)
#> lava 1.7.2.1 2023-02-27 [1] CRAN (R 4.2.0)
#> lhs 1.1.6 2022-12-17 [1] CRAN (R 4.2.0)
#> lifecycle 1.0.3 2022-10-07 [1] CRAN (R 4.2.0)
#> listenv 0.9.0 2022-12-16 [1] CRAN (R 4.2.0)
#> lme4 1.1-34 2023-07-04 [1] CRAN (R 4.2.0)
#> lubridate * 1.9.3 2023-09-27 [1] CRAN (R 4.2.0)
#> magrittr 2.0.3 2022-03-30 [1] CRAN (R 4.2.0)
#> MASS 7.3-60 2023-05-04 [1] CRAN (R 4.2.0)
#> Matrix 1.5-4.1 2023-05-18 [1] CRAN (R 4.2.0)
#> memoise 2.0.1 2021-11-26 [1] CRAN (R 4.2.0)
#> mime 0.12 2021-09-28 [1] CRAN (R 4.2.0)
#> miniUI 0.1.1.1 2018-05-18 [1] CRAN (R 4.2.0)
#> minqa 1.2.5 2022-10-19 [1] CRAN (R 4.2.1)
#> modeldata * 1.2.0 2023-08-09 [1] CRAN (R 4.2.0)
#> munsell 0.5.0 2018-06-12 [1] CRAN (R 4.2.0)
#> nlme 3.1-162 2023-01-31 [1] CRAN (R 4.2.0)
#> nloptr 2.0.3 2022-05-26 [1] CRAN (R 4.2.0)
#> nnet 7.3-19 2023-05-03 [1] CRAN (R 4.2.0)
#> palmerpenguins 0.1.1 2022-08-15 [1] CRAN (R 4.2.0)
#> parallelly 1.36.0 2023-05-26 [1] CRAN (R 4.2.0)
#> parsnip * 1.1.1 2023-08-17 [1] CRAN (R 4.2.0)
#> pillar 1.9.0 2023-03-22 [1] CRAN (R 4.2.0)
#> pkgbuild 1.4.0 2022-11-27 [1] CRAN (R 4.2.0)
#> pkgconfig 2.0.3 2019-09-22 [1] CRAN (R 4.2.0)
#> pkgload 1.3.2.1 2023-07-08 [1] CRAN (R 4.2.0)
#> prettyunits 1.1.1 2020-01-24 [1] CRAN (R 4.2.0)
#> processx 3.8.2 2023-06-30 [1] CRAN (R 4.2.0)
#> prodlim 2023.03.31 2023-04-02 [1] CRAN (R 4.2.0)
#> profvis 0.3.8 2023-05-02 [1] CRAN (R 4.2.0)
#> promises 1.2.1 2023-08-10 [1] CRAN (R 4.2.0)
#> ps 1.7.5 2023-04-18 [1] CRAN (R 4.2.0)
#> purrr * 1.0.2 2023-08-10 [1] CRAN (R 4.2.0)
#> R6 2.5.1 2021-08-19 [1] CRAN (R 4.2.0)
#> ranger * 0.15.1 2023-04-03 [1] CRAN (R 4.2.0)
#> Rcpp 1.0.11 2023-07-06 [1] CRAN (R 4.2.0)
#> readr * 2.1.4 2023-02-10 [1] CRAN (R 4.2.0)
#> recipes * 1.0.8 2023-08-25 [1] CRAN (R 4.2.0)
#> remotes 2.4.2.1 2023-07-18 [1] CRAN (R 4.2.0)
#> rlang 1.1.1 2023-04-28 [1] CRAN (R 4.2.0)
#> rmarkdown 2.25 2023-09-18 [1] CRAN (R 4.2.0)
#> rpart 4.1.19 2022-10-21 [1] CRAN (R 4.2.0)
#> rsample * 1.2.0 2023-08-23 [1] CRAN (R 4.2.0)
#> rstudioapi 0.15.0 2023-07-07 [1] CRAN (R 4.2.0)
#> sass 0.4.7 2023-07-15 [1] CRAN (R 4.2.0)
#> scales * 1.2.1 2022-08-20 [1] CRAN (R 4.2.0)
#> sessioninfo 1.2.2 2021-12-06 [1] CRAN (R 4.2.0)
#> shiny 1.7.5 2023-08-12 [1] CRAN (R 4.2.0)
#> stringi 1.7.12 2023-01-11 [1] CRAN (R 4.2.0)
#> stringr * 1.5.0 2022-12-02 [1] CRAN (R 4.2.0)
#> survival 3.5-5 2023-03-12 [1] CRAN (R 4.2.0)
#> tibble * 3.2.1 2023-03-20 [1] CRAN (R 4.2.0)
#> tictoc * 1.2 2023-04-23 [1] CRAN (R 4.2.0)
#> tidymodels * 1.1.1 2023-08-24 [1] CRAN (R 4.2.0)
#> tidyr * 1.3.0 2023-01-24 [1] CRAN (R 4.2.0)
#> tidyselect 1.2.0 2022-10-10 [1] CRAN (R 4.2.0)
#> tidyverse * 2.0.0 2023-02-22 [1] CRAN (R 4.2.0)
#> timechange 0.2.0 2023-01-11 [1] CRAN (R 4.2.0)
#> timeDate 4022.108 2023-01-07 [1] CRAN (R 4.2.0)
#> tune * 1.1.2 2023-08-23 [1] CRAN (R 4.2.0)
#> tzdb 0.4.0 2023-05-12 [1] CRAN (R 4.2.0)
#> urlchecker 1.0.1 2021-11-30 [1] CRAN (R 4.2.0)
#> usethis 2.2.2 2023-07-06 [1] CRAN (R 4.2.0)
#> utf8 1.2.3 2023-01-31 [1] CRAN (R 4.2.0)
#> vctrs 0.6.4 2023-10-12 [1] CRAN (R 4.2.0)
#> withr 2.5.2 2023-10-30 [1] CRAN (R 4.2.1)
#> workflows * 1.1.3 2023-02-22 [1] CRAN (R 4.2.0)
#> workflowsets * 1.0.1 2023-04-06 [1] CRAN (R 4.2.0)
#> xfun 0.40 2023-08-09 [1] CRAN (R 4.2.0)
#> xtable 1.8-4 2019-04-21 [1] CRAN (R 4.2.0)
#> yaml 2.3.7 2023-01-23 [1] CRAN (R 4.2.0)
#> yardstick * 1.2.0 2023-04-21 [1] CRAN (R 4.2.0)
#>
#> [1] /Users/sebastiansaueruser/Rlibs
#> [2] /Library/Frameworks/R.framework/Versions/4.2/Resources/library
#>
#> ──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────