31 min read

Performance measures for `caret` and `lm()`

Recently, I ran into a performance issue when fitting a linear model together with a resampling scheme and a tuning grid (via caret). The dataset was reasonably large - some 200k rows and approx. 20 columns (nycflights13 train). Still, I was surprised that my machine got stuck during the computation. Now I wonder whether I ran into memory constraints (16 GB on my machine), or whether some other stuff went wrong.

Load packages

library(tidyverse)
library(caret)
library(stringr)

Load data

# Load the `flights` table that ships with nycflights13 and preview its columns.
data("flights", package = "nycflights13")
glimpse(flights)
#> Observations: 336,776
#> Variables: 19
#> $ year           <int> 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2…
#> $ month          <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
#> $ day            <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
#> $ dep_time       <int> 517, 533, 542, 544, 554, 554, 555, 557, 557, 558,…
#> $ sched_dep_time <int> 515, 529, 540, 545, 600, 558, 600, 600, 600, 600,…
#> $ dep_delay      <dbl> 2, 4, 2, -1, -6, -4, -5, -3, -3, -2, -2, -2, -2, …
#> $ arr_time       <int> 830, 850, 923, 1004, 812, 740, 913, 709, 838, 753…
#> $ sched_arr_time <int> 819, 830, 850, 1022, 837, 728, 854, 723, 846, 745…
#> $ arr_delay      <dbl> 11, 20, 33, -18, -25, 12, 19, -14, -8, 8, -2, -3,…
#> $ carrier        <chr> "UA", "UA", "AA", "B6", "DL", "UA", "B6", "EV", "…
#> $ flight         <int> 1545, 1714, 1141, 725, 461, 1696, 507, 5708, 79, …
#> $ tailnum        <chr> "N14228", "N24211", "N619AA", "N804JB", "N668DN",…
#> $ origin         <chr> "EWR", "LGA", "JFK", "JFK", "LGA", "EWR", "EWR", …
#> $ dest           <chr> "IAH", "IAH", "MIA", "BQN", "ATL", "ORD", "FLL", …
#> $ air_time       <dbl> 227, 227, 160, 183, 116, 150, 158, 53, 140, 138, …
#> $ distance       <dbl> 1400, 1416, 1089, 1576, 762, 719, 1065, 229, 944,…
#> $ hour           <dbl> 5, 5, 5, 5, 6, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 6…
#> $ minute         <dbl> 15, 29, 40, 45, 0, 58, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
#> $ time_hour      <dttm> 2013-01-01 05:00:00, 2013-01-01 05:00:00, 2013-0…

# Remove the `year` column, then discard every row that still contains an NA.
flights2 <- drop_na(select(flights, -year))

Any NAs?

# Sanity check: TRUE would mean `flights2` still contains missing values.
anyNA(flights2)
#> [1] FALSE

Define computation load

Let’s run with several different sample sizes to gauge the performance.

Define the grid for the computation load, i.e., sample size and number of predictors:

# Sample sizes (number of rows drawn per fit) to benchmark.
sample_size <- c(1e2, 5e2, 1e3, 5e3, 1e4, 2e4, 1e5)

# Numbers of predictors to benchmark.
k <- c(1, 2, 5, 10)

Discard outcome vars

We do not want to draw the outcome variables as predictors, hence we exclude them:

# Columns that serve as outcomes and must never be offered as predictors.
outcome_vars <- c("dep_delay", "arr_delay")

# Every remaining column name of `flights2` is a candidate predictor.
pred_names <- setdiff(names(flights2), outcome_vars)
pred_names
#>  [1] "month"          "day"            "dep_time"       "sched_dep_time"
#>  [5] "arr_time"       "sched_arr_time" "carrier"        "flight"        
#>  [9] "tailnum"        "origin"         "dest"           "air_time"      
#> [13] "distance"       "hour"           "minute"         "time_hour"

str(pred_names)
#>  chr [1:16] "month" "day" "dep_time" "sched_dep_time" "arr_time" ...
# Indexing an unnamed character vector with strings looks up element *names*,
# not values, which is why each requested name comes back as NA.
pred_names[outcome_vars]
#> [1] NA NA
names(flights2)[outcome_vars]
#> [1] NA NA

Discarding using purrr:


# Keep every column name of `flights2` that does not contain "_delay".
dummy <- discard(names(flights2), str_detect(names(flights2), "_delay"))
dummy
#>  [1] "month"          "day"            "dep_time"       "sched_dep_time"
#>  [5] "arr_time"       "sched_arr_time" "carrier"        "flight"        
#>  [9] "tailnum"        "origin"         "dest"           "air_time"      
#> [13] "distance"       "hour"           "minute"         "time_hour"

# Inspect the purrr-based result: a plain character vector of column names.
str(dummy)
#>  chr [1:16] "month" "day" "dep_time" "sched_dep_time" "arr_time" ...
# Character indexing on the unnamed names vector matches by name, so it yields NA.
names(flights2)[outcome_vars]
#> [1] NA NA

Define index numbers of predictors:

# Fix the RNG state so the same three predictors are drawn on every run.
set.seed(42)
# seq_along() yields an empty index for an empty vector, whereas
# 1:length(x) would yield c(1, 0) and sample from non-existent positions.
pred_positions <- sample(x = seq_along(pred_names), size = 3)
pred_positions
#> [1]  1  5 16

# Narrow `pred_names` down to the drawn columns.
pred_names <- pred_names[pred_positions]
pred_names
#> [1] "month"     "arr_time"  "time_hour"

And now we draw the desired number of predictors:

# Select from `flights2` (not the raw `flights`): `pred_names` was derived from
# `flights2`, and only `flights2` has the rows with missing values removed.
flights_k <- select(flights2, one_of(pred_names))

Put that into a function, and wait: some categorical columns are a pain. Consider `dest` with its many levels. Let’s put in a switch to include only numerical columns.

# Draw `k` randomly chosen predictor columns from a flights-like data frame.
#
# Args:
#   df:           data frame to draw from (defaults to the global `flights2`).
#   k:            number of predictor columns to draw, without replacement.
#   numeric_only: if TRUE, only numeric columns are eligible.
#
# Returns `df` reduced to the `k` drawn columns, in the order they were drawn.
# Columns whose name contains "_delay" (the outcome variables) are never drawn.
# Stops with an error if `k` exceeds the number of eligible columns.
draw_preds_flights <- function(df = flights2, k, numeric_only = TRUE) {
  stopifnot(is.numeric(k), length(k) == 1, !is.na(k), k >= 0)

  if (numeric_only) {
    # Base subsetting keeps the data frame class and needs no dplyr helper.
    df <- df[vapply(df, is.numeric, logical(1))]
  }

  # Outcome columns must not leak into the predictor set.
  pred_names <- names(df)[!grepl("_delay", names(df), fixed = TRUE)]

  # Fail early with a readable message instead of sample()'s generic error.
  if (k > length(pred_names)) {
    stop(
      "Cannot draw ", k, " predictors: only ", length(pred_names),
      " eligible columns available.",
      call. = FALSE
    )
  }

  # sample.int() avoids the 1:length(x) trap (c(1, 0) for an empty vector)
  # and consumes the random number stream exactly like sample(1:n, k).
  pred_positions <- sample.int(length(pred_names), size = k)

  df[pred_names[pred_positions]]
}

Test the function:

# Smoke test: draw seven predictors and inspect the structure of the result.
str(draw_preds_flights(k = 7))
#> Classes 'tbl_df', 'tbl' and 'data.frame':    327346 obs. of  7 variables:
#>  $ distance      : num  1400 1416 1089 1576 762 ...
#>  $ hour          : num  5 5 5 5 6 5 6 6 6 6 ...
#>  $ sched_dep_time: int  515 529 540 545 600 558 600 600 600 600 ...
#>  $ day           : int  1 1 1 1 1 1 1 1 1 1 ...
#>  $ air_time      : num  227 227 160 183 116 150 158 53 140 138 ...
#>  $ month         : int  1 1 1 1 1 1 1 1 1 1 ...
#>  $ minute        : num  15 29 40 45 0 58 0 0 0 0 ...

Appears to work.

Here’s a cross-validation scheme:

# Resampling scheme: 5-fold cross-validation, with parallel execution allowed.
my_crossval <- trainControl(method = "cv", number = 5, allowParallel = TRUE)

Here’s a scheme to turn it off:

# Control object with method = "none", i.e. the scheme meant to switch resampling off.
no_crossval <- trainControl(method = "none")

Now craft a function that performs an lm fit using caret, given the computational load.


# Fit a linear model of `arr_delay` on randomly drawn predictors via caret.
#
# Args:
#   df:       data frame holding `arr_delay` and the candidate predictors
#             (defaults to the global `flights2`).
#   n:        number of rows to sample before fitting.
#   n_preds:  number of predictor columns to draw with draw_preds_flights().
#   crossval: control object created by trainControl(); defines the
#             resampling scheme used by train().
#
# Returns the fitted object produced by train().
lm_flights <- function(df = flights2, n, n_preds, crossval = my_crossval) {
  # Take the outcome from the same data frame the predictors come from, so the
  # rows stay aligned even when `df` is not the global `flights2`.
  outcome <- select(df, arr_delay)

  predictors <- draw_preds_flights(df = df, k = n_preds)

  # Attach the outcome first, then subsample, so each sampled row keeps its
  # own arr_delay value.
  model_data <- sample_n(bind_cols(predictors, outcome), size = n)

  # Pass the caller's `crossval` through; hard-coding `my_crossval` here would
  # silently ignore a request such as `crossval = no_crossval`.
  train(arr_delay ~ .,
        data = model_data,
        method = "lm",
        trControl = crossval)
}

Test the function.

# Time a single fit on 200 rows with 3 predictors.
start <- Sys.time()
dummy <- lm_flights(n = 200, n_preds = 3)
end <- Sys.time()

time_taken <- end - start
# cat() prints only the numeric value of the time difference, without its unit.
cat("Time taken: ", time_taken)
#> Time taken:  0.5061591
# Same measurement with 1,000 rows.
start <- Sys.time()
dummy <- lm_flights(n = 1000, n_preds = 3)
end <- Sys.time()

time_taken <- end - start
cat("Time taken: ", time_taken)
#> Time taken:  0.8717802
# Same measurement with 100,000 rows.
start <- Sys.time()
dummy <- lm_flights(n = 1e5, n_preds = 3)
end <- Sys.time()

time_taken <- end - start
cat("Time taken: ", time_taken)
#> Time taken:  2.815394

Register cores:

# Register a doMC parallel backend with two cores for the fits that follow.
doMC::registerDoMC(cores = 2)

Now with more cores:

# Repeat the 200-row, 3-predictor fit now that two cores are registered.
start <- Sys.time()
dummy <- lm_flights(n = 200, n_preds = 3)
end <- Sys.time()

time_taken <- end - start
cat("Time taken: ", time_taken)
#> Time taken:  0.565805

No speed-up for such a small sample: about the same time as before registering the cores (0.57 s vs. 0.51 s).

# Time the same 200-row fit while requesting the `no_crossval` control object.
start <- Sys.time()
dummy <- lm_flights(n = 200, n_preds = 3, crossval = no_crossval)
end <- Sys.time()

time_taken <- end - start
cat("Time taken: ", time_taken)
#> Time taken:  0.5536761

# Inspect the fitted model returned by the last call.
summary(dummy)
#> 
#> Call:
#> lm(formula = .outcome ~ ., data = dat)
#> 
#> Residuals:
#>    Min     1Q Median     3Q    Max 
#> -61.39 -22.68 -10.33  11.97 233.35 
#> 
#> Coefficients:
#>              Estimate Std. Error t value Pr(>|t|)   
#> (Intercept) -11.98724   10.39743  -1.153  0.25035   
#> minute       -0.03708    0.16008  -0.232  0.81706   
#> air_time     -0.02181    0.03611  -0.604  0.54662   
#> hour          1.79362    0.64672   2.773  0.00608 **
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#> 
#> Residual standard error: 42.72 on 196 degrees of freedom
#> Multiple R-squared:  0.0383, Adjusted R-squared:  0.02358 
#> F-statistic: 2.602 on 3 and 196 DF,  p-value: 0.05326

Define function that just gives back the time taken:

# Measure the wall-clock time of one lm_flights() call.
#
# Args:
#   df:       data frame handed on to lm_flights().
#   n:        number of rows to sample for the fit.
#   k:        number of predictors to draw (lm_flights()'s `n_preds`).
#   crossval: control object handed on to lm_flights().
#
# Returns a `difftime` that is always expressed in seconds. The fitted model
# itself is discarded.
lm_flights_duration <- function(df = flights2, n, k, crossval = my_crossval) {
  start <- Sys.time()
  lm_flights(df = df, n = n, n_preds = k, crossval = crossval)
  end <- Sys.time()

  # `end - start` would pick the unit automatically (secs, mins, ...), so a run
  # longer than a minute would be reported in minutes; pin the unit instead.
  difftime(end, start, units = "secs")
}

Test it:

# Try the timing helper on three sample sizes, each with 3 predictors.
lm_flights_duration(n = 200, k = 3)
#> Time difference of 0.537606 secs
lm_flights_duration(n = 1000, k = 3)
#> Time difference of 0.5362749 secs
lm_flights_duration(n = 1e5, k = 3)
#> Time difference of 2.059408 secs

Now put that into a loop:

# Benchmark: time `max_trial` fits for every (sample size, k) combination and
# collect one result row per fit in `time_df`.
max_trial <- 10

# Preallocate one slot per fit instead of growing a tibble with add_row() on
# every iteration. This also avoids seeding the result with an all-NA row.
# If the loop is interrupted, the finished runs are still held in `timings`.
n_runs <- length(sample_size) * length(k) * max_trial
timings <- vector("list", n_runs)

# Running counter over all fits; doubles as the index into `timings`.
run <- 1

for (i in seq_along(sample_size)) {  # loop for sample size
  cat("n = ", sample_size[i], "\n")

  for (j in seq_along(k)) {  # loop for number of predictors (k)
    cat("  k = ", k[j], "\n")

    for (trial in seq_len(max_trial)) {
      cat("    trial = ", trial, "\n")
      cat("    run = ", run, "\n")

      lm_duration <- lm_flights_duration(n = sample_size[i], k = k[j])
      cat("Time taken for present model computation: ", lm_duration, " s \n")

      # Store the duration as plain seconds so the column has one fixed unit.
      timings[[run]] <- tibble(
        n = sample_size[i],
        k = k[j],
        time_taken = as.numeric(lm_duration, units = "secs"),
        trial = trial,
        run = run
      )

      run <- run + 1
    }
  }
}

time_df <- bind_rows(timings)
#> n =  100 
#>   k =  1 
#>     trial =  1 
#>     run =  1 
#> Time taken for present model computation:  1.081863  s 
#>     trial =  2 
#>     run =  2 
#> Time taken for present model computation:  0.4767709  s 
#>     trial =  3 
#>     run =  3 
#> Time taken for present model computation:  0.4721088  s 
#>     trial =  4 
#>     run =  4 
#> Time taken for present model computation:  0.438689  s 
#>     trial =  5 
#>     run =  5 
#> Time taken for present model computation:  0.4379151  s 
#>     trial =  6 
#>     run =  6 
#> Time taken for present model computation:  0.4470088  s 
#>     trial =  7 
#>     run =  7 
#> Time taken for present model computation:  0.437305  s 
#>     trial =  8 
#>     run =  8 
#> Time taken for present model computation:  0.437597  s 
#>     trial =  9 
#>     run =  9 
#> Time taken for present model computation:  0.437248  s 
#>     trial =  10 
#>     run =  10 
#> Time taken for present model computation:  0.435257  s 
#>   k =  2 
#>     trial =  1 
#>     run =  11 
#> Time taken for present model computation:  0.4330289  s 
#>     trial =  2 
#>     run =  12 
#> Time taken for present model computation:  0.443146  s 
#>     trial =  3 
#>     run =  13 
#> Time taken for present model computation:  0.4344649  s 
#>     trial =  4 
#>     run =  14 
#> Time taken for present model computation:  0.437479  s 
#>     trial =  5 
#>     run =  15 
#> Time taken for present model computation:  0.440253  s 
#>     trial =  6 
#>     run =  16 
#> Time taken for present model computation:  0.4336469  s 
#>     trial =  7 
#>     run =  17 
#> Time taken for present model computation:  0.444977  s 
#>     trial =  8 
#>     run =  18 
#> Time taken for present model computation:  0.4379861  s 
#>     trial =  9 
#>     run =  19 
#> Time taken for present model computation:  0.4363792  s 
#>     trial =  10 
#>     run =  20 
#> Time taken for present model computation:  0.5396779  s 
#>   k =  5 
#>     trial =  1 
#>     run =  21 
#> Time taken for present model computation:  0.4446821  s 
#>     trial =  2 
#>     run =  22 
#> Time taken for present model computation:  0.441684  s 
#>     trial =  3 
#>     run =  23 
#> Time taken for present model computation:  0.4409502  s 
#>     trial =  4 
#>     run =  24 
#> Time taken for present model computation:  0.5154259  s 
#>     trial =  5 
#>     run =  25 
#> Time taken for present model computation:  0.439065  s 
#>     trial =  6 
#>     run =  26 
#> Time taken for present model computation:  0.4437191  s 
#>     trial =  7 
#>     run =  27 
#> Time taken for present model computation:  0.4360859  s 
#>     trial =  8 
#>     run =  28 
#> Time taken for present model computation:  0.445807  s 
#>     trial =  9 
#>     run =  29 
#> Time taken for present model computation:  0.4463658  s 
#>     trial =  10 
#>     run =  30 
#> Time taken for present model computation:  0.4561081  s 
#>   k =  10 
#>     trial =  1 
#>     run =  31 
#> Time taken for present model computation:  0.4590909  s 
#>     trial =  2 
#>     run =  32 
#> Time taken for present model computation:  0.445292  s 
#>     trial =  3 
#>     run =  33 
#> Time taken for present model computation:  0.439853  s 
#>     trial =  4 
#>     run =  34 
#> Time taken for present model computation:  0.4462152  s 
#>     trial =  5 
#>     run =  35 
#> Time taken for present model computation:  0.444721  s 
#>     trial =  6 
#>     run =  36 
#> Time taken for present model computation:  0.4642301  s 
#>     trial =  7 
#>     run =  37 
#> Time taken for present model computation:  0.44784  s 
#>     trial =  8 
#>     run =  38 
#> Time taken for present model computation:  0.443491  s 
#>     trial =  9 
#>     run =  39 
#> Time taken for present model computation:  0.448123  s 
#>     trial =  10 
#>     run =  40 
#> Time taken for present model computation:  0.4442511  s 
#> n =  500 
#>   k =  1 
#>     trial =  1 
#>     run =  41 
#> Time taken for present model computation:  0.4442301  s 
#>     trial =  2 
#>     run =  42 
#> Time taken for present model computation:  0.5318801  s 
#>     trial =  3 
#>     run =  43 
#> Time taken for present model computation:  0.5844562  s 
#>     trial =  4 
#>     run =  44 
#> Time taken for present model computation:  0.4407461  s 
#>     trial =  5 
#>     run =  45 
#> Time taken for present model computation:  0.4416189  s 
#>     trial =  6 
#>     run =  46 
#> Time taken for present model computation:  0.4451249  s 
#>     trial =  7 
#>     run =  47 
#> Time taken for present model computation:  0.4442358  s 
#>     trial =  8 
#>     run =  48 
#> Time taken for present model computation:  0.4367981  s 
#>     trial =  9 
#>     run =  49 
#> Time taken for present model computation:  0.4361279  s 
#>     trial =  10 
#>     run =  50 
#> Time taken for present model computation:  0.4720929  s 
#>   k =  2 
#>     trial =  1 
#>     run =  51 
#> Time taken for present model computation:  0.5250502  s 
#>     trial =  2 
#>     run =  52 
#> Time taken for present model computation:  0.441947  s 
#>     trial =  3 
#>     run =  53 
#> Time taken for present model computation:  0.4436438  s 
#>     trial =  4 
#>     run =  54 
#> Time taken for present model computation:  0.4400389  s 
#>     trial =  5 
#>     run =  55 
#> Time taken for present model computation:  0.6254661  s 
#>     trial =  6 
#>     run =  56 
#> Time taken for present model computation:  0.4434049  s 
#>     trial =  7 
#>     run =  57 
#> Time taken for present model computation:  0.442023  s 
#>     trial =  8 
#>     run =  58 
#> Time taken for present model computation:  0.444391  s 
#>     trial =  9 
#>     run =  59 
#> Time taken for present model computation:  0.447026  s 
#>     trial =  10 
#>     run =  60 
#> Time taken for present model computation:  0.446955  s 
#>   k =  5 
#>     trial =  1 
#>     run =  61 
#> Time taken for present model computation:  0.4426479  s 
#>     trial =  2 
#>     run =  62 
#> Time taken for present model computation:  0.6556299  s 
#>     trial =  3 
#>     run =  63 
#> Time taken for present model computation:  0.631006  s 
#>     trial =  4 
#>     run =  64 
#> Time taken for present model computation:  0.6135962  s 
#>     trial =  5 
#>     run =  65 
#> Time taken for present model computation:  0.6482961  s 
#>     trial =  6 
#>     run =  66 
#> Time taken for present model computation:  0.443377  s 
#>     trial =  7 
#>     run =  67 
#> Time taken for present model computation:  0.4554012  s 
#>     trial =  8 
#>     run =  68 
#> Time taken for present model computation:  0.4599311  s 
#>     trial =  9 
#>     run =  69 
#> Time taken for present model computation:  0.455996  s 
#>     trial =  10 
#>     run =  70 
#> Time taken for present model computation:  0.4931841  s 
#>   k =  10 
#>     trial =  1 
#>     run =  71 
#> Time taken for present model computation:  0.4936209  s 
#>     trial =  2 
#>     run =  72 
#> Time taken for present model computation:  0.497539  s 
#>     trial =  3 
#>     run =  73 
#> Time taken for present model computation:  0.8787749  s 
#>     trial =  4 
#>     run =  74 
#> Time taken for present model computation:  0.809119  s 
#>     trial =  5 
#>     run =  75 
#> Time taken for present model computation:  0.5426331  s 
#>     trial =  6 
#>     run =  76 
#> Time taken for present model computation:  0.464215  s 
#>     trial =  7 
#>     run =  77 
#> Time taken for present model computation:  0.5386751  s 
#>     trial =  8 
#>     run =  78 
#> Time taken for present model computation:  0.453577  s 
#>     trial =  9 
#>     run =  79 
#> Time taken for present model computation:  0.4623952  s 
#>     trial =  10 
#>     run =  80 
#> Time taken for present model computation:  0.4489491  s 
#> n =  1000 
#>   k =  1 
#>     trial =  1 
#>     run =  81 
#> Time taken for present model computation:  0.4592569  s 
#>     trial =  2 
#>     run =  82 
#> Time taken for present model computation:  0.4496381  s 
#>     trial =  3 
#>     run =  83 
#> Time taken for present model computation:  0.451736  s 
#>     trial =  4 
#>     run =  84 
#> Time taken for present model computation:  0.4546001  s 
#>     trial =  5 
#>     run =  85 
#> Time taken for present model computation:  0.4720671  s 
#>     trial =  6 
#>     run =  86 
#> Time taken for present model computation:  0.446197  s 
#>     trial =  7 
#>     run =  87 
#> Time taken for present model computation:  0.463027  s 
#>     trial =  8 
#>     run =  88 
#> Time taken for present model computation:  0.4518969  s 
#>     trial =  9 
#>     run =  89 
#> Time taken for present model computation:  0.449012  s 
#>     trial =  10 
#>     run =  90 
#> Time taken for present model computation:  0.4623961  s 
#>   k =  2 
#>     trial =  1 
#>     run =  91 
#> Time taken for present model computation:  0.4603031  s 
#>     trial =  2 
#>     run =  92 
#> Time taken for present model computation:  0.457056  s 
#>     trial =  3 
#>     run =  93 
#> Time taken for present model computation:  0.4493222  s 
#>     trial =  4 
#>     run =  94 
#> Time taken for present model computation:  0.4531391  s 
#>     trial =  5 
#>     run =  95 
#> Time taken for present model computation:  0.450547  s 
#>     trial =  6 
#>     run =  96 
#> Time taken for present model computation:  0.472997  s 
#>     trial =  7 
#>     run =  97 
#> Time taken for present model computation:  0.5127561  s 
#>     trial =  8 
#>     run =  98 
#> Time taken for present model computation:  0.4521949  s 
#>     trial =  9 
#>     run =  99 
#> Time taken for present model computation:  0.593178  s 
#>     trial =  10 
#>     run =  100 
#> Time taken for present model computation:  0.5076699  s 
#>   k =  5 
#>     trial =  1 
#>     run =  101 
#> Time taken for present model computation:  0.4637768  s 
#>     trial =  2 
#>     run =  102 
#> Time taken for present model computation:  0.471168  s 
#>     trial =  3 
#>     run =  103 
#> Time taken for present model computation:  0.487745  s 
#>     trial =  4 
#>     run =  104 
#> Time taken for present model computation:  0.592675  s 
#>     trial =  5 
#>     run =  105 
#> Time taken for present model computation:  0.5066102  s 
#>     trial =  6 
#>     run =  106 
#> Time taken for present model computation:  0.585495  s 
#>     trial =  7 
#>     run =  107 
#> Time taken for present model computation:  0.4994502  s 
#>     trial =  8 
#>     run =  108 
#> Time taken for present model computation:  0.539782  s 
#>     trial =  9 
#>     run =  109 
#> Time taken for present model computation:  0.49896  s 
#>     trial =  10 
#>     run =  110 
#> Time taken for present model computation:  0.4953618  s 
#>   k =  10 
#>     trial =  1 
#>     run =  111 
#> Time taken for present model computation:  0.5023279  s 
#>     trial =  2 
#>     run =  112 
#> Time taken for present model computation:  0.5025802  s 
#>     trial =  3 
#>     run =  113 
#> Time taken for present model computation:  0.509032  s 
#>     trial =  4 
#>     run =  114 
#> Time taken for present model computation:  0.4841821  s 
#>     trial =  5 
#>     run =  115 
#> Time taken for present model computation:  0.498765  s 
#>     trial =  6 
#>     run =  116 
#> Time taken for present model computation:  0.4866431  s 
#>     trial =  7 
#>     run =  117 
#> Time taken for present model computation:  0.499033  s 
#>     trial =  8 
#>     run =  118 
#> Time taken for present model computation:  0.5038011  s 
#>     trial =  9 
#>     run =  119 
#> Time taken for present model computation:  0.5065711  s 
#>     trial =  10 
#>     run =  120 
#> Time taken for present model computation:  0.4977269  s 
#> n =  5000 
#>   k =  1 
#>     trial =  1 
#>     run =  121 
#> Time taken for present model computation:  0.5383601  s 
#>     trial =  2 
#>     run =  122 
#> Time taken for present model computation:  0.5369561  s 
#>     trial =  3 
#>     run =  123 
#> Time taken for present model computation:  0.5346379  s 
#>     trial =  4 
#>     run =  124 
#> Time taken for present model computation:  0.5241148  s 
#>     trial =  5 
#>     run =  125 
#> Time taken for present model computation:  0.5374188  s 
#>     trial =  6 
#>     run =  126 
#> Time taken for present model computation:  0.5398779  s 
#>     trial =  7 
#>     run =  127 
#> Time taken for present model computation:  0.573046  s 
#>     trial =  8 
#>     run =  128 
#> Time taken for present model computation:  0.5561271  s 
#>     trial =  9 
#>     run =  129 
#> Time taken for present model computation:  0.7098169  s 
#>     trial =  10 
#>     run =  130 
#> Time taken for present model computation:  0.6438732  s 
#>   k =  2 
#>     trial =  1 
#>     run =  131 
#> Time taken for present model computation:  0.5494349  s 
#>     trial =  2 
#>     run =  132 
#> Time taken for present model computation:  0.5612428  s 
#>     trial =  3 
#>     run =  133 
#> Time taken for present model computation:  0.536979  s 
#>     trial =  4 
#>     run =  134 
#> Time taken for present model computation:  0.5327849  s 
#>     trial =  5 
#>     run =  135 
#> Time taken for present model computation:  0.5426998  s 
#>     trial =  6 
#>     run =  136 
#> Time taken for present model computation:  0.5875919  s 
#>     trial =  7 
#>     run =  137 
#> Time taken for present model computation:  0.5822392  s 
#>     trial =  8 
#>     run =  138 
#> Time taken for present model computation:  0.7569931  s 
#>     trial =  9 
#>     run =  139 
#> Time taken for present model computation:  0.5909822  s 
#>     trial =  10 
#>     run =  140 
#> Time taken for present model computation:  0.6130528  s 
#>   k =  5 
#>     trial =  1 
#>     run =  141 
#> Time taken for present model computation:  0.5605171  s 
#>     trial =  2 
#>     run =  142 
#> Time taken for present model computation:  0.66048  s 
#>     trial =  3 
#>     run =  143 
#> Time taken for present model computation:  0.8047049  s 
#>     trial =  4 
#>     run =  144 
#> Time taken for present model computation:  0.6085701  s 
#>     trial =  5 
#>     run =  145 
#> Time taken for present model computation:  0.7481561  s 
#>     trial =  6 
#>     run =  146 
#> Time taken for present model computation:  0.8513651  s 
#>     trial =  7 
#>     run =  147 
#> Time taken for present model computation:  1.098596  s 
#>     trial =  8 
#>     run =  148 
#> Time taken for present model computation:  0.7258921  s 
#>     trial =  9 
#>     run =  149 
#> Time taken for present model computation:  0.823082  s 
#>     trial =  10 
#>     run =  150 
#> Time taken for present model computation:  0.5915329  s 
#>   k =  10 
#>     trial =  1 
#>     run =  151 
#> Time taken for present model computation:  0.6477251  s 
#>     trial =  2 
#>     run =  152 
#> Time taken for present model computation:  0.628072  s 
#>     trial =  3 
#>     run =  153 
#> Time taken for present model computation:  0.6369081  s 
#>     trial =  4 
#>     run =  154 
#> Time taken for present model computation:  0.5999858  s 
#>     trial =  5 
#>     run =  155 
#> Time taken for present model computation:  0.6012709  s 
#>     trial =  6 
#>     run =  156 
#> Time taken for present model computation:  0.579746  s 
#>     trial =  7 
#>     run =  157 
#> Time taken for present model computation:  0.610817  s 
#>     trial =  8 
#>     run =  158 
#> Time taken for present model computation:  0.9687049  s 
#>     trial =  9 
#>     run =  159 
#> Time taken for present model computation:  1.114943  s 
#>     trial =  10 
#>     run =  160 
#> Time taken for present model computation:  0.820246  s 
#> n =  10000 
#>   k =  1 
#>     trial =  1 
#>     run =  161 
#> Time taken for present model computation:  0.9626641  s 
#>     trial =  2 
#>     run =  162 
#> Time taken for present model computation:  0.9527571  s 
#>     trial =  3 
#>     run =  163 
#> Time taken for present model computation:  0.8696439  s 
#>     trial =  4 
#>     run =  164 
#> Time taken for present model computation:  0.7900901  s 
#>     trial =  5 
#>     run =  165 
#> Time taken for present model computation:  0.6508131  s 
#>     trial =  6 
#>     run =  166 
#> Time taken for present model computation:  0.9293659  s 
#>     trial =  7 
#>     run =  167 
#> Time taken for present model computation:  0.8383338  s 
#>     trial =  8 
#>     run =  168 
#> Time taken for present model computation:  0.6500361  s 
#>     trial =  9 
#>     run =  169 
#> Time taken for present model computation:  0.7106299  s 
#>     trial =  10 
#>     run =  170 
#> Time taken for present model computation:  1.070245  s 
#>   k =  2 
#>     trial =  1 
#>     run =  171 
#> Time taken for present model computation:  0.7441781  s 
#>     trial =  2 
#>     run =  172 
#> Time taken for present model computation:  0.6520219  s 
#>     trial =  3 
#>     run =  173 
#> Time taken for present model computation:  0.664592  s 
#>     trial =  4 
#>     run =  174 
#> Time taken for present model computation:  0.7777629  s 
#>     trial =  5 
#>     run =  175 
#> Time taken for present model computation:  0.6470389  s 
#>     trial =  6 
#>     run =  176 
#> Time taken for present model computation:  0.6708791  s 
#>     trial =  7 
#>     run =  177 
#> Time taken for present model computation:  0.774122  s 
#>     trial =  8 
#>     run =  178 
#> Time taken for present model computation:  1.122771  s 
#>     trial =  9 
#>     run =  179 
#> Time taken for present model computation:  0.8425469  s 
#>     trial =  10 
#>     run =  180 
#> Time taken for present model computation:  0.723881  s 
#>   k =  5 
#>     trial =  1 
#>     run =  181 
#> Time taken for present model computation:  0.8225582  s 
#>     trial =  2 
#>     run =  182 
#> Time taken for present model computation:  1.084098  s 
#>     trial =  3 
#>     run =  183 
#> Time taken for present model computation:  0.7142301  s 
#>     trial =  4 
#>     run =  184 
#> Time taken for present model computation:  0.987601  s 
#>     trial =  5 
#>     run =  185 
#> Time taken for present model computation:  1.043293  s 
#>     trial =  6 
#>     run =  186 
#> Time taken for present model computation:  1.091965  s 
#>     trial =  7 
#>     run =  187 
#> Time taken for present model computation:  0.963506  s 
#>     trial =  8 
#>     run =  188 
#> Time taken for present model computation:  1.372548  s 
#>     trial =  9 
#>     run =  189 
#> Time taken for present model computation:  1.107177  s 
#>     trial =  10 
#>     run =  190 
#> Time taken for present model computation:  0.9992149  s 
#>   k =  10 
#>     trial =  1 
#>     run =  191 
#> Time taken for present model computation:  1.086515  s 
#>     trial =  2 
#>     run =  192 
#> Time taken for present model computation:  0.8890159  s 
#>     trial =  3 
#>     run =  193 
#> Time taken for present model computation:  1.135766  s 
#>     trial =  4 
#>     run =  194 
#> Time taken for present model computation:  0.770365  s 
#>     trial =  5 
#>     run =  195 
#> Time taken for present model computation:  1.076935  s 
#>     trial =  6 
#>     run =  196 
#> Time taken for present model computation:  0.784188  s 
#>     trial =  7 
#>     run =  197 
#> Time taken for present model computation:  1.018505  s 
#>     trial =  8 
#>     run =  198 
#> Time taken for present model computation:  1.161256  s 
#>     trial =  9 
#>     run =  199 
#> Time taken for present model computation:  1.088395  s 
#>     trial =  10 
#>     run =  200 
#> Time taken for present model computation:  1.568357  s 
#> n =  20000 
#>   k =  1 
#>     trial =  1 
#>     run =  201 
#> Time taken for present model computation:  1.249474  s 
#>     trial =  2 
#>     run =  202 
#> Time taken for present model computation:  1.129212  s 
#>     trial =  3 
#>     run =  203 
#> Time taken for present model computation:  1.116724  s 
#>     trial =  4 
#>     run =  204 
#> Time taken for present model computation:  1.263537  s 
#>     trial =  5 
#>     run =  205 
#> Time taken for present model computation:  1.090505  s 
#>     trial =  6 
#>     run =  206 
#> Time taken for present model computation:  1.047199  s 
#>     trial =  7 
#>     run =  207 
#> Time taken for present model computation:  1.096708  s 
#>     trial =  8 
#>     run =  208 
#> Time taken for present model computation:  1.192962  s 
#>     trial =  9 
#>     run =  209 
#> Time taken for present model computation:  1.170878  s 
#>     trial =  10 
#>     run =  210 
#> Time taken for present model computation:  1.300999  s 
#>   k =  2 
#>     trial =  1 
#>     run =  211 
#> Time taken for present model computation:  1.056511  s 
#>     trial =  2 
#>     run =  212 
#> Time taken for present model computation:  1.19051  s 
#>     trial =  3 
#>     run =  213 
#> Time taken for present model computation:  1.245578  s 
#>     trial =  4 
#>     run =  214 
#> Time taken for present model computation:  1.007171  s 
#>     trial =  5 
#>     run =  215 
#> Time taken for present model computation:  0.9098899  s 
#>     trial =  6 
#>     run =  216 
#> Time taken for present model computation:  1.03809  s 
#>     trial =  7 
#>     run =  217 
#> Time taken for present model computation:  1.060952  s 
#>     trial =  8 
#>     run =  218 
#> Time taken for present model computation:  1.403779  s 
#>     trial =  9 
#>     run =  219 
#> Time taken for present model computation:  1.09942  s 
#>     trial =  10 
#>     run =  220 
#> Time taken for present model computation:  1.27247  s 
#>   k =  5 
#>     trial =  1 
#>     run =  221 
#> Time taken for present model computation:  1.019589  s 
#>     trial =  2 
#>     run =  222 
#> Time taken for present model computation:  1.178619  s 
#>     trial =  3 
#>     run =  223 
#> Time taken for present model computation:  1.08717  s 
#>     trial =  4 
#>     run =  224 
#> Time taken for present model computation:  0.9382942  s 
#>     trial =  5 
#>     run =  225 
#> Time taken for present model computation:  0.8877261  s 
#>     trial =  6 
#>     run =  226 
#> Time taken for present model computation:  0.8789959  s 
#>     trial =  7 
#>     run =  227 
#> Time taken for present model computation:  0.881546  s 
#>     trial =  8 
#>     run =  228 
#> Time taken for present model computation:  0.916564  s 
#>     trial =  9 
#>     run =  229 
#> Time taken for present model computation:  0.8839359  s 
#>     trial =  10 
#>     run =  230 
#> Time taken for present model computation:  0.9359338  s 
#>   k =  10 
#>     trial =  1 
#>     run =  231 
#> Time taken for present model computation:  1.303608  s 
#>     trial =  2 
#>     run =  232 
#> Time taken for present model computation:  1.121285  s 
#>     trial =  3 
#>     run =  233 
#> Time taken for present model computation:  1.179266  s 
#>     trial =  4 
#>     run =  234 
#> Time taken for present model computation:  1.120013  s 
#>     trial =  5 
#>     run =  235 
#> Time taken for present model computation:  1.624908  s 
#>     trial =  6 
#>     run =  236 
#> Time taken for present model computation:  1.045926  s 
#>     trial =  7 
#>     run =  237 
#> Time taken for present model computation:  1.011557  s 
#>     trial =  8 
#>     run =  238 
#> Time taken for present model computation:  0.988035  s 
#>     trial =  9 
#>     run =  239 
#> Time taken for present model computation:  1.648639  s 
#>     trial =  10 
#>     run =  240 
#> Time taken for present model computation:  1.673923  s 
#> n =  1e+05 
#>   k =  1 
#>     trial =  1 
#>     run =  241 
#> Time taken for present model computation:  4.281005  s 
#>     trial =  2 
#>     run =  242 
#> Time taken for present model computation:  4.204582  s 
#>     trial =  3 
#>     run =  243 
#> Time taken for present model computation:  3.944786  s 
#>     trial =  4 
#>     run =  244 
#> Time taken for present model computation:  2.798044  s 
#>     trial =  5 
#>     run =  245 
#> Time taken for present model computation:  2.558156  s 
#>     trial =  6 
#>     run =  246 
#> Time taken for present model computation:  2.953777  s 
#>     trial =  7 
#>     run =  247 
#> Time taken for present model computation:  3.665101  s 
#>     trial =  8 
#>     run =  248 
#> Time taken for present model computation:  3.418542  s 
#>     trial =  9 
#>     run =  249 
#> Time taken for present model computation:  3.354636  s 
#>     trial =  10 
#>     run =  250 
#> Time taken for present model computation:  2.95774  s 
#>   k =  2 
#>     trial =  1 
#>     run =  251 
#> Time taken for present model computation:  3.145113  s 
#>     trial =  2 
#>     run =  252 
#> Time taken for present model computation:  2.544685  s 
#>     trial =  3 
#>     run =  253 
#> Time taken for present model computation:  2.571834  s 
#>     trial =  4 
#>     run =  254 
#> Time taken for present model computation:  2.700399  s 
#>     trial =  5 
#>     run =  255 
#> Time taken for present model computation:  2.548327  s 
#>     trial =  6 
#>     run =  256 
#> Time taken for present model computation:  2.915473  s 
#>     trial =  7 
#>     run =  257 
#> Time taken for present model computation:  2.859279  s 
#>     trial =  8 
#>     run =  258 
#> Time taken for present model computation:  2.73993  s 
#>     trial =  9 
#>     run =  259 
#> Time taken for present model computation:  2.883709  s 
#>     trial =  10 
#>     run =  260 
#> Time taken for present model computation:  2.709883  s 
#>   k =  5 
#>     trial =  1 
#>     run =  261 
#> Time taken for present model computation:  3.273748  s 
#>     trial =  2 
#>     run =  262 
#> Time taken for present model computation:  2.935905  s 
#>     trial =  3 
#>     run =  263 
#> Time taken for present model computation:  2.833582  s 
#>     trial =  4 
#>     run =  264 
#> Time taken for present model computation:  3.734762  s 
#>     trial =  5 
#>     run =  265 
#> Time taken for present model computation:  3.014805  s 
#>     trial =  6 
#>     run =  266 
#> Time taken for present model computation:  3.585858  s 
#>     trial =  7 
#>     run =  267 
#> Time taken for present model computation:  3.106992  s 
#>     trial =  8 
#>     run =  268 
#> Time taken for present model computation:  2.803939  s 
#>     trial =  9 
#>     run =  269 
#> Time taken for present model computation:  2.900499  s 
#>     trial =  10 
#>     run =  270 
#> Time taken for present model computation:  3.422959  s 
#>   k =  10 
#>     trial =  1 
#>     run =  271 
#> Time taken for present model computation:  3.97062  s 
#>     trial =  2 
#>     run =  272 
#> Time taken for present model computation:  4.130229  s 
#>     trial =  3 
#>     run =  273 
#> Time taken for present model computation:  3.900531  s 
#>     trial =  4 
#>     run =  274 
#> Time taken for present model computation:  5.105087  s 
#>     trial =  5 
#>     run =  275 
#> Time taken for present model computation:  5.50513  s 
#>     trial =  6 
#>     run =  276 
#> Time taken for present model computation:  4.338999  s 
#>     trial =  7 
#>     run =  277 
#> Time taken for present model computation:  4.343665  s 
#>     trial =  8 
#>     run =  278 
#> Time taken for present model computation:  4.533405  s 
#>     trial =  9 
#>     run =  279 
#> Time taken for present model computation:  5.312447  s 
#>     trial =  10 
#>     run =  280 
#> Time taken for present model computation:  4.613441  s

Show the results

Now let’s put that together.

# Add a running row id and the average fitting time of each (k, n) cell.
# `row_number()` is used instead of `1:nrow(time_df)`: the latter evaluates
# to c(1, 0) for an empty data frame and would make `mutate()` fail.
time_df <- time_df %>%
  mutate(id = row_number()) %>%
  group_by(k, n) %>%
  # missing timings are ignored when averaging
  mutate(time_taken_avg = mean(time_taken, na.rm = TRUE)) %>%
  ungroup()

# Discard every row that contains a missing value in any column.
time_df <- drop_na(time_df)
# Computation time against sample size (log-scaled x axis): one point per
# run plus a line through the per-(k, n) average, coloured by k.
ggplot(time_df, aes(x = n, y = time_taken, color = factor(k))) +
  geom_point() +
  geom_line(aes(y = time_taken_avg, group = k)) +
  scale_x_log10(breaks = sample_size) +
  scale_color_viridis_d() +
  labs(
    color = "k",
    y = "computation time [sec]"
  ) +
  theme_light()

For small samples, there is a large variance in computation time; as the sample size increases, the computation times become more stable.

# Same plot as before, split into one panel per k.
# `time_df` was already cleaned of NA rows above, so it is plotted as is.
time_df %>%
  ggplot(aes(x = n, y = time_taken, color = factor(k))) +
  geom_point() +
  # fixed, sparse breaks keep the axis labels readable in the narrow panels
  scale_x_log10(breaks = c(1e1, 1e03, 1e04, 1e05)) +
  geom_line(aes(y = time_taken_avg, group = k)) +
  # `label_both` prints "k: 1", "k: 2", ... on the strips; `label_context`
  # shows only the bare value when there is a single facetting variable
  facet_wrap(~ k, labeller = label_both) +
  labs(color = "k",
       y = "computation time [sec]") +
  scale_color_viridis_d() +
  theme_light()

Debrief

On average, the time for each model was acceptable. However, two things should be noted: First, non-numeric variables can consume much more time (if many levels are present). Second, parameter tuning and cross-validation will soak up a lot of time. Assume there are 3 tuning parameters, and we test 5 values each (yielding 15 models if each parameter is varied separately, or 5^3 = 125 models for a full tuning grid). Assume further that we use 10-fold cross-validation repeated 10 times, giving 100 resamples per model. In sum, at least 1,500 model fits (12,500 for the full grid) will be needed. That will take ages, and possibly a lot of memory will be exhausted.