Estimating simulation variance in Stan models

1 Load packages

library(tidyverse)  # data wrangling
library(rstanarm)
library(gt)

2 Motivation

stan_glm() allows for setting a seed value thereby eliminating the variance induced by random numbers. However, in case a seed is not used, how much variance is to be expected? This is the research question of this analysis.

Let’s choose \(n=100\) repetitions in our simulation.

3 Model

Let’s compute a simple linear model of this type:

stan_glm(out_var ~ pred1 + pred2, data = d, refresh = 0).

4 Workhorse function

get_simu_var_b1 <- function(dataset, n = 100) {
  
  all_dfs <- teachertools::teaching_df()
  
  d <- teachertools::teaching_df(dataset)
  #d <- teachertools::teaching_df("gtcars")
  
  d_name <- attr(d, "df_name")
  
  d_path <- all_dfs |> 
    dplyr::filter(datasets_names == d_name) |> 
    dplyr::pull(source)
  
  preds_chosen <- attr(d, "preds_chosen")
  focus_pred <- attr(d, "focus_pred")
  output_var <- attr(d, "output_var")
  
  results <- vector(mode = "numeric", length = n)
  
  for (i in 1:n){
  
    mod <- stan_glm(av ~ uv1 + uv2, data = d, refresh = 0)
    results[i] <- coef(mod)["uv1"] |> as.numeric()
    cat(paste0("i = ", i, " of ", n, "\n"))
  }
  return(results)
}

5 Function for summarizing the simulation results

summ_simu <- function(simu_data){
  out <- 
     list(
       dataset = deparse(substitute(simu_data)),
       std_dev = sd(simu_data),
       variance = var(simu_data),
       arithm_mean = mean(simu_data),
       med = median(simu_data),
       iqr = IQR(simu_data),
       max_value = max(simu_data),
       min_value = min(simu_data),
       max_minus_min = max(simu_data) - min(simu_data),
       max_to_min = max(simu_data) / min(simu_data),
       var_coeff = sd(simu_data) / mean(simu_data),
       rel_var = var(simu_data) /  mean(simu_data)
       )
}

6 Dataset mtcars

mtcars_out <- get_simu_var_b1(dataset = "mtcars")
hist(mtcars_out)

mtcars_results1 <-
  summ_simu(mtcars_out)

mtcars_results1
#> $dataset
#> [1] "mtcars_out"
#> 
#> $std_dev
#> [1] 0.0113812
#> 
#> $variance
#> [1] 0.0001295316
#> 
#> $arithm_mean
#> [1] -4.758728
#> 
#> $med
#> [1] -4.760092
#> 
#> $iqr
#> [1] 0.01441278
#> 
#> $max_value
#> [1] -4.728438
#> 
#> $min_value
#> [1] -4.786539
#> 
#> $max_minus_min
#> [1] 0.05810086
#> 
#> $max_to_min
#> [1] 0.9878616
#> 
#> $var_coeff
#> [1] -0.002391647
#> 
#> $rel_var
#> [1] -2.72198e-05

7 Dataset msleep

msleep_out <- get_simu_var_b1(dataset = "msleep")
hist(msleep_out)

msleep_results1 <- summ_simu(msleep_out)

msleep_results1
#> $dataset
#> [1] "msleep_out"
#> 
#> $std_dev
#> [1] 4.674048e-05
#> 
#> $variance
#> [1] 2.184672e-09
#> 
#> $arithm_mean
#> [1] 0.007708606
#> 
#> $med
#> [1] 0.007704791
#> 
#> $iqr
#> [1] 6.562096e-05
#> 
#> $max_value
#> [1] 0.007827808
#> 
#> $min_value
#> [1] 0.007570025
#> 
#> $max_minus_min
#> [1] 0.0002577828
#> 
#> $max_to_min
#> [1] 1.034053
#> 
#> $var_coeff
#> [1] 0.006063415
#> 
#> $rel_var
#> [1] 2.834069e-07

8 Dataset penguins

penguins_out <- get_simu_var_b1(dataset = "penguins")
hist(penguins_out)

penguins_results1 <- summ_simu(penguins_out)

penguins_results1
#> $dataset
#> [1] "penguins_out"
#> 
#> $std_dev
#> [1] 0.002455989
#> 
#> $variance
#> [1] 6.031884e-06
#> 
#> $arithm_mean
#> [1] -0.6427714
#> 
#> $med
#> [1] -0.642781
#> 
#> $iqr
#> [1] 0.002662391
#> 
#> $max_value
#> [1] -0.6346151
#> 
#> $min_value
#> [1] -0.6484986
#> 
#> $max_minus_min
#> [1] 0.01388349
#> 
#> $max_to_min
#> [1] 0.9785913
#> 
#> $var_coeff
#> [1] -0.003820938
#> 
#> $rel_var
#> [1] -9.384182e-06

9 Dataset tips

tips_out <- get_simu_var_b1(dataset = "tips")
hist(tips_out)

tips_results1 <- summ_simu(tips_out)

tips_results1
#> $dataset
#> [1] "tips_out"
#> 
#> $std_dev
#> [1] 0.002283815
#> 
#> $variance
#> [1] 5.215813e-06
#> 
#> $arithm_mean
#> [1] 0.1926246
#> 
#> $med
#> [1] 0.1923049
#> 
#> $iqr
#> [1] 0.003167123
#> 
#> $max_value
#> [1] 0.1995643
#> 
#> $min_value
#> [1] 0.1877088
#> 
#> $max_minus_min
#> [1] 0.01185556
#> 
#> $max_to_min
#> [1] 1.063159
#> 
#> $var_coeff
#> [1] 0.0118563
#> 
#> $rel_var
#> [1] 2.707761e-05

10 Dataset gtcars

#saveRDS(gtcars_out, file = "gtcars_out.rds")
gtcars_rdsfile_path <- "/Users/sebastiansaueruser/Publikationen/blog_ses/sesa-blog/gtcars_out.rds"
if (file.exists(gtcars_rdsfile_path)) {
  cat("RDS file exists")
  gtcars_out <- readRDS(gtcars_rdsfile_path)
} else {
  cat("Coduncting simulation...")
  gtcars_out <- get_simu_var_b1(dataset = "gtcars")
}
hist(gtcars_out)

gtcars_results1 <- summ_simu(gtcars_out)

gtcars_results1
#> $dataset
#> [1] "gtcars_out"
#> 
#> $std_dev
#> [1] 199.5382
#> 
#> $variance
#> [1] 39815.49
#> 
#> $arithm_mean
#> [1] -21198.79
#> 
#> $med
#> [1] -21168.08
#> 
#> $iqr
#> [1] 258.0704
#> 
#> $max_value
#> [1] -20851.2
#> 
#> $min_value
#> [1] -21851.56
#> 
#> $max_minus_min
#> [1] 1000.359
#> 
#> $max_to_min
#> [1] 0.9542202
#> 
#> $var_coeff
#> [1] -0.009412714
#> 
#> $rel_var
#> [1] -1.878196

11 Dataset Boston

Boston_out <- get_simu_var_b1(dataset = "Boston")
hist(Boston_out)

Boston_results1 <- summ_simu(Boston_out)

Boston_results1
#> $dataset
#> [1] "Boston_out"
#> 
#> $std_dev
#> [1] 8.527836e-05
#> 
#> $variance
#> [1] 7.272398e-09
#> 
#> $arithm_mean
#> [1] 0.02056738
#> 
#> $med
#> [1] 0.02057132
#> 
#> $iqr
#> [1] 0.0001292766
#> 
#> $max_value
#> [1] 0.02073811
#> 
#> $min_value
#> [1] 0.02035571
#> 
#> $max_minus_min
#> [1] 0.0003823966
#> 
#> $max_to_min
#> [1] 1.018786
#> 
#> $var_coeff
#> [1] 0.004146291
#> 
#> $rel_var
#> [1] 3.535889e-07

12 Dataset TeachingRatings

TeachingRatings_out <- get_simu_var_b1(dataset = "TeachingRatings")
hist(TeachingRatings_out)

TeachingRatings_results1 <- summ_simu(TeachingRatings_out)

TeachingRatings_results1
#> $dataset
#> [1] "TeachingRatings_out"
#> 
#> $std_dev
#> [1] 6.331732e-06
#> 
#> $variance
#> [1] 4.009083e-11
#> 
#> $arithm_mean
#> [1] -1.502834e-05
#> 
#> $med
#> [1] -1.545877e-05
#> 
#> $iqr
#> [1] 8.406646e-06
#> 
#> $max_value
#> [1] 3.224198e-06
#> 
#> $min_value
#> [1] -2.743382e-05
#> 
#> $max_minus_min
#> [1] 3.065801e-05
#> 
#> $max_to_min
#> [1] -0.1175264
#> 
#> $var_coeff
#> [1] -0.4213195
#> 
#> $rel_var
#> [1] -2.667682e-06

13 Results overview

main_results <-
  bind_rows(Boston_results1,
            gtcars_results1,
            msleep_results1,
            mtcars_results1,
            penguins_results1,
            TeachingRatings_results1,
            tips_results1)

out2 <- main_results %>% 
  mutate(across(where(is.numeric), round, 3))


gt(main_results) %>% 
  fmt_number(where(is.numeric), decimals = 3)
dataset std_dev variance arithm_mean med iqr max_value min_value max_minus_min max_to_min var_coeff rel_var
Boston_out 0.000 0.000 0.021 0.021 0.000 0.021 0.020 0.000 1.019 0.004 0.000
gtcars_out 199.538 39,815.485 −21,198.794 −21,168.076 258.070 −20,851.197 −21,851.556 1,000.359 0.954 −0.009 −1.878
msleep_out 0.000 0.000 0.008 0.008 0.000 0.008 0.008 0.000 1.034 0.006 0.000
mtcars_out 0.011 0.000 −4.759 −4.760 0.014 −4.728 −4.787 0.058 0.988 −0.002 0.000
penguins_out 0.002 0.000 −0.643 −0.643 0.003 −0.635 −0.648 0.014 0.979 −0.004 0.000
TeachingRatings_out 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 −0.118 −0.421 0.000
tips_out 0.002 0.000 0.193 0.192 0.003 0.200 0.188 0.012 1.063 0.012 0.000

14 Conclusion

The simulation results are quite close to each other, the coefficient of variation is always below 5%. In situations however where the absolute values are close to zero, the relative deviations can be comparatively high.

15 Reproducibility

#> ─ Session info ───────────────────────────────────────────────────────────────────────────────────────────────────────
#>  setting  value
#>  version  R version 4.2.1 (2022-06-23)
#>  os       macOS Big Sur ... 10.16
#>  system   x86_64, darwin17.0
#>  ui       X11
#>  language (EN)
#>  collate  en_US.UTF-8
#>  ctype    en_US.UTF-8
#>  tz       Europe/Berlin
#>  date     2023-03-19
#>  pandoc   2.19.2 @ /Applications/RStudio.app/Contents/Resources/app/quarto/bin/tools/ (via rmarkdown)
#> 
#> ─ Packages ───────────────────────────────────────────────────────────────────────────────────────────────────────────
#>  package       * version    date (UTC) lib source
#>  assertthat      0.2.1      2019-03-21 [1] CRAN (R 4.2.0)
#>  backports       1.4.1      2021-12-13 [1] CRAN (R 4.2.0)
#>  base64enc       0.1-3      2015-07-28 [1] CRAN (R 4.2.0)
#>  bayesplot       1.10.0     2022-11-16 [1] CRAN (R 4.2.0)
#>  blogdown        1.16       2022-12-13 [1] CRAN (R 4.2.0)
#>  bookdown        0.33       2023-03-06 [1] CRAN (R 4.2.0)
#>  boot            1.3-28.1   2022-11-22 [1] CRAN (R 4.2.0)
#>  broom           1.0.4      2023-03-11 [1] CRAN (R 4.2.0)
#>  bslib           0.4.2      2022-12-16 [1] CRAN (R 4.2.0)
#>  cachem          1.0.7      2023-02-24 [1] CRAN (R 4.2.0)
#>  callr           3.7.3      2022-11-02 [1] CRAN (R 4.2.0)
#>  caret           6.0-93     2022-08-09 [1] CRAN (R 4.2.0)
#>  cellranger      1.1.0      2016-07-27 [1] CRAN (R 4.2.0)
#>  class           7.3-20     2022-01-16 [2] CRAN (R 4.2.1)
#>  cli             3.6.0      2023-01-09 [1] CRAN (R 4.2.0)
#>  codetools       0.2-18     2020-11-04 [2] CRAN (R 4.2.1)
#>  colorout      * 1.2-2      2022-06-13 [1] local
#>  colorspace      2.1-0      2023-01-23 [1] CRAN (R 4.2.0)
#>  colourpicker    1.2.0      2022-10-28 [1] CRAN (R 4.2.0)
#>  crayon          1.5.2      2022-09-29 [1] CRAN (R 4.2.1)
#>  crosstalk       1.2.0      2021-11-04 [1] CRAN (R 4.2.0)
#>  data.table      1.14.8     2023-02-17 [1] CRAN (R 4.2.0)
#>  DBI             1.1.3      2022-06-18 [1] CRAN (R 4.2.0)
#>  dbplyr          2.3.1      2023-02-24 [1] CRAN (R 4.2.0)
#>  devtools        2.4.5      2022-10-11 [1] CRAN (R 4.2.1)
#>  digest          0.6.31     2022-12-11 [1] CRAN (R 4.2.0)
#>  dplyr         * 1.1.0      2023-01-29 [1] CRAN (R 4.2.0)
#>  DT              0.27       2023-01-17 [1] CRAN (R 4.2.0)
#>  dygraphs        1.1.1.6    2018-07-11 [1] CRAN (R 4.2.0)
#>  ellipsis        0.3.2      2021-04-29 [1] CRAN (R 4.2.0)
#>  evaluate        0.20       2023-01-17 [1] CRAN (R 4.2.0)
#>  fansi           1.0.4      2023-01-22 [1] CRAN (R 4.2.0)
#>  fastmap         1.1.1      2023-02-24 [1] CRAN (R 4.2.0)
#>  forcats       * 1.0.0      2023-01-29 [1] CRAN (R 4.2.0)
#>  foreach         1.5.2      2022-02-02 [1] CRAN (R 4.2.0)
#>  fs              1.6.1      2023-02-06 [1] CRAN (R 4.2.0)
#>  future          1.32.0     2023-03-07 [1] CRAN (R 4.2.0)
#>  future.apply    1.10.0     2022-11-05 [1] CRAN (R 4.2.0)
#>  gargle          1.3.0      2023-01-30 [1] CRAN (R 4.2.0)
#>  generics        0.1.3      2022-07-05 [1] CRAN (R 4.2.0)
#>  ggplot2       * 3.4.1      2023-02-10 [1] CRAN (R 4.2.0)
#>  globals         0.16.2     2022-11-21 [1] CRAN (R 4.2.0)
#>  glue            1.6.2      2022-02-24 [1] CRAN (R 4.2.0)
#>  googledrive     2.0.0      2021-07-08 [1] CRAN (R 4.2.0)
#>  googlesheets4   1.0.1      2022-08-13 [1] CRAN (R 4.2.0)
#>  gower           1.0.1      2022-12-22 [1] CRAN (R 4.2.0)
#>  gridExtra       2.3        2017-09-09 [1] CRAN (R 4.2.0)
#>  gtable          0.3.1      2022-09-01 [1] CRAN (R 4.2.0)
#>  gtools          3.9.4      2022-11-27 [1] CRAN (R 4.2.0)
#>  hardhat         1.2.0      2022-06-30 [1] CRAN (R 4.2.0)
#>  haven           2.5.2      2023-02-28 [1] CRAN (R 4.2.0)
#>  highr           0.10       2022-12-22 [1] CRAN (R 4.2.0)
#>  hms             1.1.2      2022-08-19 [1] CRAN (R 4.2.0)
#>  htmltools       0.5.4      2022-12-07 [1] CRAN (R 4.2.0)
#>  htmlwidgets     1.6.1      2023-01-07 [1] CRAN (R 4.2.0)
#>  httpuv          1.6.9      2023-02-14 [1] CRAN (R 4.2.0)
#>  httr            1.4.5      2023-02-24 [1] CRAN (R 4.2.0)
#>  igraph          1.4.1      2023-02-24 [1] CRAN (R 4.2.0)
#>  inline          0.3.19     2021-05-31 [1] CRAN (R 4.2.0)
#>  ipred           0.9-14     2023-03-09 [1] CRAN (R 4.2.0)
#>  iterators       1.0.14     2022-02-05 [1] CRAN (R 4.2.0)
#>  jquerylib       0.1.4      2021-04-26 [1] CRAN (R 4.2.0)
#>  jsonlite        1.8.4      2022-12-06 [1] CRAN (R 4.2.0)
#>  knitr           1.42       2023-01-25 [1] CRAN (R 4.2.0)
#>  later           1.3.0      2021-08-18 [1] CRAN (R 4.2.0)
#>  lattice         0.20-45    2021-09-22 [2] CRAN (R 4.2.1)
#>  lava            1.7.2.1    2023-02-27 [1] CRAN (R 4.2.0)
#>  lifecycle       1.0.3      2022-10-07 [1] CRAN (R 4.2.0)
#>  listenv         0.9.0      2022-12-16 [1] CRAN (R 4.2.0)
#>  lme4            1.1-32     2023-03-14 [1] CRAN (R 4.2.0)
#>  loo             2.5.1      2022-03-24 [1] CRAN (R 4.2.0)
#>  lubridate       1.9.2      2023-02-10 [1] CRAN (R 4.2.0)
#>  magrittr        2.0.3      2022-03-30 [1] CRAN (R 4.2.0)
#>  markdown        1.5        2023-01-31 [1] CRAN (R 4.2.0)
#>  MASS            7.3-58.3   2023-03-07 [1] CRAN (R 4.2.0)
#>  Matrix          1.5-3      2022-11-11 [1] CRAN (R 4.2.0)
#>  matrixStats     0.63.0     2022-11-18 [1] CRAN (R 4.2.0)
#>  memoise         2.0.1      2021-11-26 [1] CRAN (R 4.2.0)
#>  mime            0.12       2021-09-28 [1] CRAN (R 4.2.0)
#>  miniUI          0.1.1.1    2018-05-18 [1] CRAN (R 4.2.0)
#>  minqa           1.2.5      2022-10-19 [1] CRAN (R 4.2.1)
#>  ModelMetrics    1.2.2.2    2020-03-17 [1] CRAN (R 4.2.0)
#>  modelr          0.1.10     2022-11-11 [1] CRAN (R 4.2.0)
#>  munsell         0.5.0      2018-06-12 [1] CRAN (R 4.2.0)
#>  nlme            3.1-162    2023-01-31 [1] CRAN (R 4.2.0)
#>  nloptr          2.0.3      2022-05-26 [1] CRAN (R 4.2.0)
#>  nnet            7.3-18     2022-09-28 [1] CRAN (R 4.2.0)
#>  parallelly      1.34.0     2023-01-13 [1] CRAN (R 4.2.1)
#>  pillar          1.8.1      2022-08-19 [1] CRAN (R 4.2.0)
#>  pkgbuild        1.4.0      2022-11-27 [1] CRAN (R 4.2.0)
#>  pkgconfig       2.0.3      2019-09-22 [1] CRAN (R 4.2.0)
#>  pkgload         1.3.2      2022-11-16 [1] CRAN (R 4.2.0)
#>  plyr            1.8.8      2022-11-11 [1] CRAN (R 4.2.0)
#>  prettyunits     1.1.1      2020-01-24 [1] CRAN (R 4.2.0)
#>  pROC            1.18.0     2021-09-03 [1] CRAN (R 4.2.0)
#>  processx        3.8.0      2022-10-26 [1] CRAN (R 4.2.0)
#>  prodlim         2019.11.13 2019-11-17 [1] CRAN (R 4.2.0)
#>  profvis         0.3.7      2020-11-02 [1] CRAN (R 4.2.0)
#>  promises        1.2.0.1    2021-02-11 [1] CRAN (R 4.2.0)
#>  ps              1.7.2      2022-10-26 [1] CRAN (R 4.2.0)
#>  purrr         * 1.0.1      2023-01-10 [1] CRAN (R 4.2.0)
#>  R6              2.5.1      2021-08-19 [1] CRAN (R 4.2.0)
#>  Rcpp          * 1.0.10     2023-01-22 [1] CRAN (R 4.2.0)
#>  RcppParallel    5.1.6      2023-01-09 [1] CRAN (R 4.2.0)
#>  readr         * 2.1.4      2023-02-10 [1] CRAN (R 4.2.0)
#>  readxl          1.4.2      2023-02-09 [1] CRAN (R 4.2.0)
#>  recipes         1.0.4      2023-01-11 [1] CRAN (R 4.2.0)
#>  remotes         2.4.2      2021-11-30 [1] CRAN (R 4.2.0)
#>  reprex          2.0.2      2022-08-17 [1] CRAN (R 4.2.0)
#>  reshape2        1.4.4      2020-04-09 [1] CRAN (R 4.2.0)
#>  rlang           1.0.6      2022-09-24 [1] CRAN (R 4.2.0)
#>  rmarkdown       2.20       2023-01-19 [1] CRAN (R 4.2.0)
#>  rpart           4.1.19     2022-10-21 [1] CRAN (R 4.2.0)
#>  rstan           2.21.8     2023-01-17 [1] CRAN (R 4.2.0)
#>  rstanarm      * 2.21.3     2022-04-09 [1] CRAN (R 4.2.0)
#>  rstantools      2.2.0      2022-04-08 [1] CRAN (R 4.2.0)
#>  rstudioapi      0.14       2022-08-22 [1] CRAN (R 4.2.0)
#>  rvest           1.0.3      2022-08-19 [1] CRAN (R 4.2.0)
#>  sass            0.4.5      2023-01-24 [1] CRAN (R 4.2.0)
#>  scales          1.2.1      2022-08-20 [1] CRAN (R 4.2.0)
#>  sessioninfo     1.2.2      2021-12-06 [1] CRAN (R 4.2.0)
#>  shiny           1.7.4      2022-12-15 [1] CRAN (R 4.2.0)
#>  shinyjs         2.1.0      2021-12-23 [1] CRAN (R 4.2.0)
#>  shinystan       2.6.0      2022-03-03 [1] CRAN (R 4.2.0)
#>  shinythemes     1.2.0      2021-01-25 [1] CRAN (R 4.2.0)
#>  StanHeaders     2.21.0-7   2020-12-17 [1] CRAN (R 4.2.0)
#>  stringi         1.7.12     2023-01-11 [1] CRAN (R 4.2.0)
#>  stringr       * 1.5.0      2022-12-02 [1] CRAN (R 4.2.0)
#>  survival        3.5-0      2023-01-09 [1] CRAN (R 4.2.0)
#>  teachertools    0.2.0.9000 2023-03-17 [1] local
#>  threejs         0.3.3      2020-01-21 [1] CRAN (R 4.2.0)
#>  tibble        * 3.1.8      2022-07-22 [1] CRAN (R 4.2.0)
#>  tidyr         * 1.3.0      2023-01-24 [1] CRAN (R 4.2.0)
#>  tidyselect      1.2.0      2022-10-10 [1] CRAN (R 4.2.0)
#>  tidyverse     * 1.3.2      2022-07-18 [1] CRAN (R 4.2.0)
#>  timechange      0.2.0      2023-01-11 [1] CRAN (R 4.2.0)
#>  timeDate        4022.108   2023-01-07 [1] CRAN (R 4.2.0)
#>  tzdb            0.3.0      2022-03-28 [1] CRAN (R 4.2.0)
#>  urlchecker      1.0.1      2021-11-30 [1] CRAN (R 4.2.0)
#>  usethis         2.1.6      2022-05-25 [1] CRAN (R 4.2.0)
#>  utf8            1.2.3      2023-01-31 [1] CRAN (R 4.2.0)
#>  vctrs           0.5.2      2023-01-23 [1] CRAN (R 4.2.0)
#>  withr           2.5.0      2022-03-03 [1] CRAN (R 4.2.0)
#>  xfun            0.37       2023-01-31 [1] CRAN (R 4.2.0)
#>  xml2            1.3.3      2021-11-30 [1] CRAN (R 4.2.0)
#>  xtable          1.8-4      2019-04-21 [1] CRAN (R 4.2.0)
#>  xts             0.12.2     2022-10-16 [1] CRAN (R 4.2.0)
#>  yaml            2.3.7      2023-01-23 [1] CRAN (R 4.2.0)
#>  zoo             1.8-11     2022-09-17 [1] CRAN (R 4.2.0)
#> 
#>  [1] /Users/sebastiansaueruser/Rlibs
#>  [2] /Library/Frameworks/R.framework/Versions/4.2/Resources/library
#> 
#> ──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────