1 Load packages
library(tidyverse) # data wrangling
library(rstanarm)
library(gt)
2 Motivation
stan_glm()
allows for setting a seed value thereby eliminating the variance induced by random numbers.
However, in case a seed is not used, how much variance is to be expected?
This is the research question of this analysis.
Let’s choose \(n=100\) repetitions in our simulation.
3 Model
Let’s compute a simple linear model of this type:
stan_glm(out_var ~ pred1 + pred2, data = d, refresh = 0)
.
4 Workhorse function
get_simu_var_b1 <- function(dataset, n = 100) {
all_dfs <- teachertools::teaching_df()
d <- teachertools::teaching_df(dataset)
#d <- teachertools::teaching_df("gtcars")
d_name <- attr(d, "df_name")
d_path <- all_dfs |>
dplyr::filter(datasets_names == d_name) |>
dplyr::pull(source)
preds_chosen <- attr(d, "preds_chosen")
focus_pred <- attr(d, "focus_pred")
output_var <- attr(d, "output_var")
results <- vector(mode = "numeric", length = n)
for (i in 1:n){
mod <- stan_glm(av ~ uv1 + uv2, data = d, refresh = 0)
results[i] <- coef(mod)["uv1"] |> as.numeric()
cat(paste0("i = ", i, " of ", n, "\n"))
}
return(results)
}
5 Function for summarizing the simulation results
summ_simu <- function(simu_data){
out <-
list(
dataset = deparse(substitute(simu_data)),
std_dev = sd(simu_data),
variance = var(simu_data),
arithm_mean = mean(simu_data),
med = median(simu_data),
iqr = IQR(simu_data),
max_value = max(simu_data),
min_value = min(simu_data),
max_minus_min = max(simu_data) - min(simu_data),
max_to_min = max(simu_data) / min(simu_data),
var_coeff = sd(simu_data) / mean(simu_data),
rel_var = var(simu_data) / mean(simu_data)
)
}
6 Dataset mtcars
mtcars_out <- get_simu_var_b1(dataset = "mtcars")
hist(mtcars_out)
mtcars_results1 <-
summ_simu(mtcars_out)
mtcars_results1
#> $dataset
#> [1] "mtcars_out"
#>
#> $std_dev
#> [1] 0.0113812
#>
#> $variance
#> [1] 0.0001295316
#>
#> $arithm_mean
#> [1] -4.758728
#>
#> $med
#> [1] -4.760092
#>
#> $iqr
#> [1] 0.01441278
#>
#> $max_value
#> [1] -4.728438
#>
#> $min_value
#> [1] -4.786539
#>
#> $max_minus_min
#> [1] 0.05810086
#>
#> $max_to_min
#> [1] 0.9878616
#>
#> $var_coeff
#> [1] -0.002391647
#>
#> $rel_var
#> [1] -2.72198e-05
7 Dataset msleep
msleep_out <- get_simu_var_b1(dataset = "msleep")
hist(msleep_out)
msleep_results1 <- summ_simu(msleep_out)
msleep_results1
#> $dataset
#> [1] "msleep_out"
#>
#> $std_dev
#> [1] 4.674048e-05
#>
#> $variance
#> [1] 2.184672e-09
#>
#> $arithm_mean
#> [1] 0.007708606
#>
#> $med
#> [1] 0.007704791
#>
#> $iqr
#> [1] 6.562096e-05
#>
#> $max_value
#> [1] 0.007827808
#>
#> $min_value
#> [1] 0.007570025
#>
#> $max_minus_min
#> [1] 0.0002577828
#>
#> $max_to_min
#> [1] 1.034053
#>
#> $var_coeff
#> [1] 0.006063415
#>
#> $rel_var
#> [1] 2.834069e-07
8 Dataset penguins
penguins_out <- get_simu_var_b1(dataset = "penguins")
hist(penguins_out)
penguins_results1 <- summ_simu(penguins_out)
penguins_results1
#> $dataset
#> [1] "penguins_out"
#>
#> $std_dev
#> [1] 0.002455989
#>
#> $variance
#> [1] 6.031884e-06
#>
#> $arithm_mean
#> [1] -0.6427714
#>
#> $med
#> [1] -0.642781
#>
#> $iqr
#> [1] 0.002662391
#>
#> $max_value
#> [1] -0.6346151
#>
#> $min_value
#> [1] -0.6484986
#>
#> $max_minus_min
#> [1] 0.01388349
#>
#> $max_to_min
#> [1] 0.9785913
#>
#> $var_coeff
#> [1] -0.003820938
#>
#> $rel_var
#> [1] -9.384182e-06
9 Dataset tips
tips_out <- get_simu_var_b1(dataset = "tips")
hist(tips_out)
tips_results1 <- summ_simu(tips_out)
tips_results1
#> $dataset
#> [1] "tips_out"
#>
#> $std_dev
#> [1] 0.002283815
#>
#> $variance
#> [1] 5.215813e-06
#>
#> $arithm_mean
#> [1] 0.1926246
#>
#> $med
#> [1] 0.1923049
#>
#> $iqr
#> [1] 0.003167123
#>
#> $max_value
#> [1] 0.1995643
#>
#> $min_value
#> [1] 0.1877088
#>
#> $max_minus_min
#> [1] 0.01185556
#>
#> $max_to_min
#> [1] 1.063159
#>
#> $var_coeff
#> [1] 0.0118563
#>
#> $rel_var
#> [1] 2.707761e-05
10 Dataset gtcars
#saveRDS(gtcars_out, file = "gtcars_out.rds")
gtcars_rdsfile_path <- "/Users/sebastiansaueruser/Publikationen/blog_ses/sesa-blog/gtcars_out.rds"
if (file.exists(gtcars_rdsfile_path)) {
cat("RDS file exists")
gtcars_out <- readRDS(gtcars_rdsfile_path)
} else {
cat("Coduncting simulation...")
gtcars_out <- get_simu_var_b1(dataset = "gtcars")
}
hist(gtcars_out)
gtcars_results1 <- summ_simu(gtcars_out)
gtcars_results1
#> $dataset
#> [1] "gtcars_out"
#>
#> $std_dev
#> [1] 199.5382
#>
#> $variance
#> [1] 39815.49
#>
#> $arithm_mean
#> [1] -21198.79
#>
#> $med
#> [1] -21168.08
#>
#> $iqr
#> [1] 258.0704
#>
#> $max_value
#> [1] -20851.2
#>
#> $min_value
#> [1] -21851.56
#>
#> $max_minus_min
#> [1] 1000.359
#>
#> $max_to_min
#> [1] 0.9542202
#>
#> $var_coeff
#> [1] -0.009412714
#>
#> $rel_var
#> [1] -1.878196
11 Dataset Boston
Boston_out <- get_simu_var_b1(dataset = "Boston")
hist(Boston_out)
Boston_results1 <- summ_simu(Boston_out)
Boston_results1
#> $dataset
#> [1] "Boston_out"
#>
#> $std_dev
#> [1] 8.527836e-05
#>
#> $variance
#> [1] 7.272398e-09
#>
#> $arithm_mean
#> [1] 0.02056738
#>
#> $med
#> [1] 0.02057132
#>
#> $iqr
#> [1] 0.0001292766
#>
#> $max_value
#> [1] 0.02073811
#>
#> $min_value
#> [1] 0.02035571
#>
#> $max_minus_min
#> [1] 0.0003823966
#>
#> $max_to_min
#> [1] 1.018786
#>
#> $var_coeff
#> [1] 0.004146291
#>
#> $rel_var
#> [1] 3.535889e-07
12 Dataset TeachingRatings
TeachingRatings_out <- get_simu_var_b1(dataset = "TeachingRatings")
hist(TeachingRatings_out)
TeachingRatings_results1 <- summ_simu(TeachingRatings_out)
TeachingRatings_results1
#> $dataset
#> [1] "TeachingRatings_out"
#>
#> $std_dev
#> [1] 6.331732e-06
#>
#> $variance
#> [1] 4.009083e-11
#>
#> $arithm_mean
#> [1] -1.502834e-05
#>
#> $med
#> [1] -1.545877e-05
#>
#> $iqr
#> [1] 8.406646e-06
#>
#> $max_value
#> [1] 3.224198e-06
#>
#> $min_value
#> [1] -2.743382e-05
#>
#> $max_minus_min
#> [1] 3.065801e-05
#>
#> $max_to_min
#> [1] -0.1175264
#>
#> $var_coeff
#> [1] -0.4213195
#>
#> $rel_var
#> [1] -2.667682e-06
13 Results overview
main_results <-
bind_rows(Boston_results1,
gtcars_results1,
msleep_results1,
mtcars_results1,
penguins_results1,
TeachingRatings_results1,
tips_results1)
out2 <- main_results %>%
mutate(across(where(is.numeric), round, 3))
gt(main_results) %>%
fmt_number(where(is.numeric), decimals = 3)
dataset | std_dev | variance | arithm_mean | med | iqr | max_value | min_value | max_minus_min | max_to_min | var_coeff | rel_var |
---|---|---|---|---|---|---|---|---|---|---|---|
Boston_out | 0.000 | 0.000 | 0.021 | 0.021 | 0.000 | 0.021 | 0.020 | 0.000 | 1.019 | 0.004 | 0.000 |
gtcars_out | 199.538 | 39,815.485 | −21,198.794 | −21,168.076 | 258.070 | −20,851.197 | −21,851.556 | 1,000.359 | 0.954 | −0.009 | −1.878 |
msleep_out | 0.000 | 0.000 | 0.008 | 0.008 | 0.000 | 0.008 | 0.008 | 0.000 | 1.034 | 0.006 | 0.000 |
mtcars_out | 0.011 | 0.000 | −4.759 | −4.760 | 0.014 | −4.728 | −4.787 | 0.058 | 0.988 | −0.002 | 0.000 |
penguins_out | 0.002 | 0.000 | −0.643 | −0.643 | 0.003 | −0.635 | −0.648 | 0.014 | 0.979 | −0.004 | 0.000 |
TeachingRatings_out | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | −0.118 | −0.421 | 0.000 |
tips_out | 0.002 | 0.000 | 0.193 | 0.192 | 0.003 | 0.200 | 0.188 | 0.012 | 1.063 | 0.012 | 0.000 |
14 Conclusion
The simulation results are quite close to each other, the coefficient of variation is always below 5%. In situations however where the absolute values are close to zero, the relative deviations can be comparatively high.
15 Reproducibility
#> ─ Session info ───────────────────────────────────────────────────────────────────────────────────────────────────────
#> setting value
#> version R version 4.2.1 (2022-06-23)
#> os macOS Big Sur ... 10.16
#> system x86_64, darwin17.0
#> ui X11
#> language (EN)
#> collate en_US.UTF-8
#> ctype en_US.UTF-8
#> tz Europe/Berlin
#> date 2023-03-19
#> pandoc 2.19.2 @ /Applications/RStudio.app/Contents/Resources/app/quarto/bin/tools/ (via rmarkdown)
#>
#> ─ Packages ───────────────────────────────────────────────────────────────────────────────────────────────────────────
#> package * version date (UTC) lib source
#> assertthat 0.2.1 2019-03-21 [1] CRAN (R 4.2.0)
#> backports 1.4.1 2021-12-13 [1] CRAN (R 4.2.0)
#> base64enc 0.1-3 2015-07-28 [1] CRAN (R 4.2.0)
#> bayesplot 1.10.0 2022-11-16 [1] CRAN (R 4.2.0)
#> blogdown 1.16 2022-12-13 [1] CRAN (R 4.2.0)
#> bookdown 0.33 2023-03-06 [1] CRAN (R 4.2.0)
#> boot 1.3-28.1 2022-11-22 [1] CRAN (R 4.2.0)
#> broom 1.0.4 2023-03-11 [1] CRAN (R 4.2.0)
#> bslib 0.4.2 2022-12-16 [1] CRAN (R 4.2.0)
#> cachem 1.0.7 2023-02-24 [1] CRAN (R 4.2.0)
#> callr 3.7.3 2022-11-02 [1] CRAN (R 4.2.0)
#> caret 6.0-93 2022-08-09 [1] CRAN (R 4.2.0)
#> cellranger 1.1.0 2016-07-27 [1] CRAN (R 4.2.0)
#> class 7.3-20 2022-01-16 [2] CRAN (R 4.2.1)
#> cli 3.6.0 2023-01-09 [1] CRAN (R 4.2.0)
#> codetools 0.2-18 2020-11-04 [2] CRAN (R 4.2.1)
#> colorout * 1.2-2 2022-06-13 [1] local
#> colorspace 2.1-0 2023-01-23 [1] CRAN (R 4.2.0)
#> colourpicker 1.2.0 2022-10-28 [1] CRAN (R 4.2.0)
#> crayon 1.5.2 2022-09-29 [1] CRAN (R 4.2.1)
#> crosstalk 1.2.0 2021-11-04 [1] CRAN (R 4.2.0)
#> data.table 1.14.8 2023-02-17 [1] CRAN (R 4.2.0)
#> DBI 1.1.3 2022-06-18 [1] CRAN (R 4.2.0)
#> dbplyr 2.3.1 2023-02-24 [1] CRAN (R 4.2.0)
#> devtools 2.4.5 2022-10-11 [1] CRAN (R 4.2.1)
#> digest 0.6.31 2022-12-11 [1] CRAN (R 4.2.0)
#> dplyr * 1.1.0 2023-01-29 [1] CRAN (R 4.2.0)
#> DT 0.27 2023-01-17 [1] CRAN (R 4.2.0)
#> dygraphs 1.1.1.6 2018-07-11 [1] CRAN (R 4.2.0)
#> ellipsis 0.3.2 2021-04-29 [1] CRAN (R 4.2.0)
#> evaluate 0.20 2023-01-17 [1] CRAN (R 4.2.0)
#> fansi 1.0.4 2023-01-22 [1] CRAN (R 4.2.0)
#> fastmap 1.1.1 2023-02-24 [1] CRAN (R 4.2.0)
#> forcats * 1.0.0 2023-01-29 [1] CRAN (R 4.2.0)
#> foreach 1.5.2 2022-02-02 [1] CRAN (R 4.2.0)
#> fs 1.6.1 2023-02-06 [1] CRAN (R 4.2.0)
#> future 1.32.0 2023-03-07 [1] CRAN (R 4.2.0)
#> future.apply 1.10.0 2022-11-05 [1] CRAN (R 4.2.0)
#> gargle 1.3.0 2023-01-30 [1] CRAN (R 4.2.0)
#> generics 0.1.3 2022-07-05 [1] CRAN (R 4.2.0)
#> ggplot2 * 3.4.1 2023-02-10 [1] CRAN (R 4.2.0)
#> globals 0.16.2 2022-11-21 [1] CRAN (R 4.2.0)
#> glue 1.6.2 2022-02-24 [1] CRAN (R 4.2.0)
#> googledrive 2.0.0 2021-07-08 [1] CRAN (R 4.2.0)
#> googlesheets4 1.0.1 2022-08-13 [1] CRAN (R 4.2.0)
#> gower 1.0.1 2022-12-22 [1] CRAN (R 4.2.0)
#> gridExtra 2.3 2017-09-09 [1] CRAN (R 4.2.0)
#> gtable 0.3.1 2022-09-01 [1] CRAN (R 4.2.0)
#> gtools 3.9.4 2022-11-27 [1] CRAN (R 4.2.0)
#> hardhat 1.2.0 2022-06-30 [1] CRAN (R 4.2.0)
#> haven 2.5.2 2023-02-28 [1] CRAN (R 4.2.0)
#> highr 0.10 2022-12-22 [1] CRAN (R 4.2.0)
#> hms 1.1.2 2022-08-19 [1] CRAN (R 4.2.0)
#> htmltools 0.5.4 2022-12-07 [1] CRAN (R 4.2.0)
#> htmlwidgets 1.6.1 2023-01-07 [1] CRAN (R 4.2.0)
#> httpuv 1.6.9 2023-02-14 [1] CRAN (R 4.2.0)
#> httr 1.4.5 2023-02-24 [1] CRAN (R 4.2.0)
#> igraph 1.4.1 2023-02-24 [1] CRAN (R 4.2.0)
#> inline 0.3.19 2021-05-31 [1] CRAN (R 4.2.0)
#> ipred 0.9-14 2023-03-09 [1] CRAN (R 4.2.0)
#> iterators 1.0.14 2022-02-05 [1] CRAN (R 4.2.0)
#> jquerylib 0.1.4 2021-04-26 [1] CRAN (R 4.2.0)
#> jsonlite 1.8.4 2022-12-06 [1] CRAN (R 4.2.0)
#> knitr 1.42 2023-01-25 [1] CRAN (R 4.2.0)
#> later 1.3.0 2021-08-18 [1] CRAN (R 4.2.0)
#> lattice 0.20-45 2021-09-22 [2] CRAN (R 4.2.1)
#> lava 1.7.2.1 2023-02-27 [1] CRAN (R 4.2.0)
#> lifecycle 1.0.3 2022-10-07 [1] CRAN (R 4.2.0)
#> listenv 0.9.0 2022-12-16 [1] CRAN (R 4.2.0)
#> lme4 1.1-32 2023-03-14 [1] CRAN (R 4.2.0)
#> loo 2.5.1 2022-03-24 [1] CRAN (R 4.2.0)
#> lubridate 1.9.2 2023-02-10 [1] CRAN (R 4.2.0)
#> magrittr 2.0.3 2022-03-30 [1] CRAN (R 4.2.0)
#> markdown 1.5 2023-01-31 [1] CRAN (R 4.2.0)
#> MASS 7.3-58.3 2023-03-07 [1] CRAN (R 4.2.0)
#> Matrix 1.5-3 2022-11-11 [1] CRAN (R 4.2.0)
#> matrixStats 0.63.0 2022-11-18 [1] CRAN (R 4.2.0)
#> memoise 2.0.1 2021-11-26 [1] CRAN (R 4.2.0)
#> mime 0.12 2021-09-28 [1] CRAN (R 4.2.0)
#> miniUI 0.1.1.1 2018-05-18 [1] CRAN (R 4.2.0)
#> minqa 1.2.5 2022-10-19 [1] CRAN (R 4.2.1)
#> ModelMetrics 1.2.2.2 2020-03-17 [1] CRAN (R 4.2.0)
#> modelr 0.1.10 2022-11-11 [1] CRAN (R 4.2.0)
#> munsell 0.5.0 2018-06-12 [1] CRAN (R 4.2.0)
#> nlme 3.1-162 2023-01-31 [1] CRAN (R 4.2.0)
#> nloptr 2.0.3 2022-05-26 [1] CRAN (R 4.2.0)
#> nnet 7.3-18 2022-09-28 [1] CRAN (R 4.2.0)
#> parallelly 1.34.0 2023-01-13 [1] CRAN (R 4.2.1)
#> pillar 1.8.1 2022-08-19 [1] CRAN (R 4.2.0)
#> pkgbuild 1.4.0 2022-11-27 [1] CRAN (R 4.2.0)
#> pkgconfig 2.0.3 2019-09-22 [1] CRAN (R 4.2.0)
#> pkgload 1.3.2 2022-11-16 [1] CRAN (R 4.2.0)
#> plyr 1.8.8 2022-11-11 [1] CRAN (R 4.2.0)
#> prettyunits 1.1.1 2020-01-24 [1] CRAN (R 4.2.0)
#> pROC 1.18.0 2021-09-03 [1] CRAN (R 4.2.0)
#> processx 3.8.0 2022-10-26 [1] CRAN (R 4.2.0)
#> prodlim 2019.11.13 2019-11-17 [1] CRAN (R 4.2.0)
#> profvis 0.3.7 2020-11-02 [1] CRAN (R 4.2.0)
#> promises 1.2.0.1 2021-02-11 [1] CRAN (R 4.2.0)
#> ps 1.7.2 2022-10-26 [1] CRAN (R 4.2.0)
#> purrr * 1.0.1 2023-01-10 [1] CRAN (R 4.2.0)
#> R6 2.5.1 2021-08-19 [1] CRAN (R 4.2.0)
#> Rcpp * 1.0.10 2023-01-22 [1] CRAN (R 4.2.0)
#> RcppParallel 5.1.6 2023-01-09 [1] CRAN (R 4.2.0)
#> readr * 2.1.4 2023-02-10 [1] CRAN (R 4.2.0)
#> readxl 1.4.2 2023-02-09 [1] CRAN (R 4.2.0)
#> recipes 1.0.4 2023-01-11 [1] CRAN (R 4.2.0)
#> remotes 2.4.2 2021-11-30 [1] CRAN (R 4.2.0)
#> reprex 2.0.2 2022-08-17 [1] CRAN (R 4.2.0)
#> reshape2 1.4.4 2020-04-09 [1] CRAN (R 4.2.0)
#> rlang 1.0.6 2022-09-24 [1] CRAN (R 4.2.0)
#> rmarkdown 2.20 2023-01-19 [1] CRAN (R 4.2.0)
#> rpart 4.1.19 2022-10-21 [1] CRAN (R 4.2.0)
#> rstan 2.21.8 2023-01-17 [1] CRAN (R 4.2.0)
#> rstanarm * 2.21.3 2022-04-09 [1] CRAN (R 4.2.0)
#> rstantools 2.2.0 2022-04-08 [1] CRAN (R 4.2.0)
#> rstudioapi 0.14 2022-08-22 [1] CRAN (R 4.2.0)
#> rvest 1.0.3 2022-08-19 [1] CRAN (R 4.2.0)
#> sass 0.4.5 2023-01-24 [1] CRAN (R 4.2.0)
#> scales 1.2.1 2022-08-20 [1] CRAN (R 4.2.0)
#> sessioninfo 1.2.2 2021-12-06 [1] CRAN (R 4.2.0)
#> shiny 1.7.4 2022-12-15 [1] CRAN (R 4.2.0)
#> shinyjs 2.1.0 2021-12-23 [1] CRAN (R 4.2.0)
#> shinystan 2.6.0 2022-03-03 [1] CRAN (R 4.2.0)
#> shinythemes 1.2.0 2021-01-25 [1] CRAN (R 4.2.0)
#> StanHeaders 2.21.0-7 2020-12-17 [1] CRAN (R 4.2.0)
#> stringi 1.7.12 2023-01-11 [1] CRAN (R 4.2.0)
#> stringr * 1.5.0 2022-12-02 [1] CRAN (R 4.2.0)
#> survival 3.5-0 2023-01-09 [1] CRAN (R 4.2.0)
#> teachertools 0.2.0.9000 2023-03-17 [1] local
#> threejs 0.3.3 2020-01-21 [1] CRAN (R 4.2.0)
#> tibble * 3.1.8 2022-07-22 [1] CRAN (R 4.2.0)
#> tidyr * 1.3.0 2023-01-24 [1] CRAN (R 4.2.0)
#> tidyselect 1.2.0 2022-10-10 [1] CRAN (R 4.2.0)
#> tidyverse * 1.3.2 2022-07-18 [1] CRAN (R 4.2.0)
#> timechange 0.2.0 2023-01-11 [1] CRAN (R 4.2.0)
#> timeDate 4022.108 2023-01-07 [1] CRAN (R 4.2.0)
#> tzdb 0.3.0 2022-03-28 [1] CRAN (R 4.2.0)
#> urlchecker 1.0.1 2021-11-30 [1] CRAN (R 4.2.0)
#> usethis 2.1.6 2022-05-25 [1] CRAN (R 4.2.0)
#> utf8 1.2.3 2023-01-31 [1] CRAN (R 4.2.0)
#> vctrs 0.5.2 2023-01-23 [1] CRAN (R 4.2.0)
#> withr 2.5.0 2022-03-03 [1] CRAN (R 4.2.0)
#> xfun 0.37 2023-01-31 [1] CRAN (R 4.2.0)
#> xml2 1.3.3 2021-11-30 [1] CRAN (R 4.2.0)
#> xtable 1.8-4 2019-04-21 [1] CRAN (R 4.2.0)
#> xts 0.12.2 2022-10-16 [1] CRAN (R 4.2.0)
#> yaml 2.3.7 2023-01-23 [1] CRAN (R 4.2.0)
#> zoo 1.8-11 2022-09-17 [1] CRAN (R 4.2.0)
#>
#> [1] /Users/sebastiansaueruser/Rlibs
#> [2] /Library/Frameworks/R.framework/Versions/4.2/Resources/library
#>
#> ──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────