ARM, Kap. 4 Syntax im Tidyverse-Stil

1 Pakete laden

library(tidyverse)
library(arm)  # nicht wirklich wichtig, nur für Funktion "display"
library(foreign)
library(rio)

2 Lineare Transformationen

2.1 Daten laden: kidsiq

kidsiq <- read_csv("https://raw.githubusercontent.com/sebastiansauer/2021-sose/master/data/ARM/kidsiq.csv")
## 
## ── Column specification ────────────────────────────────────────────────────────
## cols(
##   kid_score = col_double(),
##   mom_hs = col_double(),
##   mom_iq = col_double(),
##   mom_work = col_double(),
##   mom_age = col_double()
## )

2.2 lm1: Interaktionseffekt

lm1 <- lm(kid_score ~ mom_hs + mom_iq + mom_hs:mom_iq,
          data = kidsiq)
display(lm1)
## lm(formula = kid_score ~ mom_hs + mom_iq + mom_hs:mom_iq, data = kidsiq)
##               coef.est coef.se
## (Intercept)   -11.48    13.76 
## mom_hs         51.27    15.34 
## mom_iq          0.97     0.15 
## mom_hs:mom_iq  -0.48     0.16 
## ---
## n = 434, k = 4
## residual sd = 17.97, R-Squared = 0.23

2.3 lm2: Zentrieren

kidsiq <-
  kidsiq %>% 
  mutate(mom_hs_c = mom_hs - mean(mom_hs),
         mom_iq_c = mom_iq - mean(mom_iq))
lm2 <- lm(kid_score ~ mom_hs_c + mom_iq_c + mom_hs_c:mom_iq_c,
          data = kidsiq)
display(lm2)
## lm(formula = kid_score ~ mom_hs_c + mom_iq_c + mom_hs_c:mom_iq_c, 
##     data = kidsiq)
##                   coef.est coef.se
## (Intercept)       87.64     0.91  
## mom_hs_c           2.84     2.43  
## mom_iq_c           0.59     0.06  
## mom_hs_c:mom_iq_c -0.48     0.16  
## ---
## n = 434, k = 4
## residual sd = 17.97, R-Squared = 0.23
lm2a <- lm(kid_score ~ mom_hs + mom_iq_c + mom_hs:mom_iq_c,
          data = kidsiq)
display(lm2a)
## lm(formula = kid_score ~ mom_hs + mom_iq_c + mom_hs:mom_iq_c, 
##     data = kidsiq)
##                 coef.est coef.se
## (Intercept)     85.41     2.22  
## mom_hs           2.84     2.43  
## mom_iq_c         0.97     0.15  
## mom_hs:mom_iq_c -0.48     0.16  
## ---
## n = 434, k = 4
## residual sd = 17.97, R-Squared = 0.23

2.4 lm3: z-Transformation

kidsiq <-
  kidsiq %>% 
  mutate(mom_iq_z = (mom_iq - mean(mom_iq)) / sd(mom_iq))
lm3 <- lm(kid_score ~ mom_hs + mom_iq_z + mom_hs:mom_iq_c, 
          data = kidsiq)

display(lm3)
## lm(formula = kid_score ~ mom_hs + mom_iq_z + mom_hs:mom_iq_c, 
##     data = kidsiq)
##                 coef.est coef.se
## (Intercept)     85.41     2.22  
## mom_hs           2.84     2.43  
## mom_iq_z        14.53     2.23  
## mom_hs:mom_iq_c -0.48     0.16  
## ---
## n = 434, k = 4
## residual sd = 17.97, R-Squared = 0.23

3 Modelle mit log(y)

3.1 Daten laden

earn <- read_csv("https://raw.githubusercontent.com/sebastiansauer/2021-sose/master/data/ARM/heights.csv")
## 
## ── Column specification ────────────────────────────────────────────────────────
## cols(
##   earn = col_double(),
##   height1 = col_double(),
##   height2 = col_double(),
##   sex = col_double(),
##   race = col_double(),
##   hisp = col_double(),
##   ed = col_double(),
##   yearbn = col_double(),
##   height = col_double()
## )

3.2 lm4: earn_log

earn <-
  earn %>% 
  mutate(earn_log = log(earn))
lm4 <- lm(earn_log ~ height, 
          data = earn %>% filter(earn > 0))
display(lm4)
## lm(formula = earn_log ~ height, data = earn %>% filter(earn > 
##     0))
##             coef.est coef.se
## (Intercept) 5.78     0.45   
## height      0.06     0.01   
## ---
## n = 1192, k = 2
## residual sd = 0.89, R-Squared = 0.06

3.3 lm5: earn_log mit zwei Prädiktoren

earn %>% 
  count(sex)
## # A tibble: 2 x 2
##     sex     n
##   <dbl> <int>
## 1     1   748
## 2     2  1281

Leider wissen wir nicht, ob Frauen mit 1 oder mit 2 codiert sind. Gehen wir davon aus, dass Männer mit 1 kodiert sind (vgl. Buch ARM, S. 61).

earn <-
  earn %>% 
  mutate(male = case_when(
    sex == 2 ~ 0,
    sex == 1 ~ 1))
lm5 <- lm(earn_log ~ height + male,
          data = earn %>% filter(earn > 0))
display(lm5)
## lm(formula = earn_log ~ height + male, data = earn %>% filter(earn > 
##     0))
##             coef.est coef.se
## (Intercept) 8.15     0.60   
## height      0.02     0.01   
## male        0.42     0.07   
## ---
## n = 1192, k = 3
## residual sd = 0.88, R-Squared = 0.09

3.4 lm6: Mit z-Transformation und Interaktion

earn <-
  earn %>% 
  mutate(height_z = scale(height))
lm6 <- lm(earn_log ~ height_z + male + height_z:male,
          data = earn %>% filter(earn > 0))
display(lm6)
## lm(formula = earn_log ~ height_z + male + height_z:male, data = earn %>% 
##     filter(earn > 0))
##               coef.est coef.se
## (Intercept)   9.52     0.04   
## height_z      0.06     0.05   
## male          0.42     0.07   
## height_z:male 0.03     0.07   
## ---
## n = 1192, k = 4
## residual sd = 0.88, R-Squared = 0.09

Die Reihenfolge der Prädiktoren spielt keine Rolle:

lm6a <- lm(earn_log ~ height_z:male + male + height_z ,
          data = earn %>% filter(earn > 0))
display(lm6)
## lm(formula = earn_log ~ height_z + male + height_z:male, data = earn %>% 
##     filter(earn > 0))
##               coef.est coef.se
## (Intercept)   9.52     0.04   
## height_z      0.06     0.05   
## male          0.42     0.07   
## height_z:male 0.03     0.07   
## ---
## n = 1192, k = 4
## residual sd = 0.88, R-Squared = 0.09

4 LogY-LogX-Modelle

earn <-
  earn %>% 
  mutate(height_log = log(height))
lm7 <- lm(earn_log ~ height_log + male,
          data = earn %>% filter(earn > 0))
display(lm7)
## lm(formula = earn_log ~ height_log + male, data = earn %>% filter(earn > 
##     0))
##             coef.est coef.se
## (Intercept) 3.62     2.60   
## height_log  1.41     0.62   
## male        0.42     0.07   
## ---
## n = 1192, k = 3
## residual sd = 0.88, R-Squared = 0.09

5 Weitere Transformationen

5.1 Diskretisierung metrischer Prädiktoren

lm7 <- lm(kid_score ~ as.factor(mom_work),
          data = kidsiq)
display(lm7)
## lm(formula = kid_score ~ as.factor(mom_work), data = kidsiq)
##                      coef.est coef.se
## (Intercept)          82.00     2.31  
## as.factor(mom_work)2  3.85     3.09  
## as.factor(mom_work)3 11.50     3.55  
## as.factor(mom_work)4  5.21     2.70  
## ---
## n = 434, k = 4
## residual sd = 20.23, R-Squared = 0.02

6 “Buschbeispiel” - mesquite

6.1 Daten laden

mesquite <- read_csv("https://raw.githubusercontent.com/sebastiansauer/2021-sose/master/data/ARM/mesquite.csv")
## 
## ── Column specification ────────────────────────────────────────────────────────
## cols(
##   Obs = col_double(),
##   Group = col_character(),
##   Diam1 = col_double(),
##   Diam2 = col_double(),
##   TotHt = col_double(),
##   CanHt = col_double(),
##   Dens = col_double(),
##   LeafWt = col_double()
## )
glimpse(mesquite)
## Rows: 46
## Columns: 8
## $ Obs    <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, …
## $ Group  <chr> "MCD", "MCD", "MCD", "MCD", "MCD", "MCD", "MCD", "MCD", "MCD", …
## $ Diam1  <dbl> 1.8, 1.7, 2.8, 1.3, 3.3, 1.4, 1.5, 3.9, 1.8, 2.1, 0.8, 1.3, 1.2…
## $ Diam2  <dbl> 1.15, 1.35, 2.55, 0.85, 1.90, 1.40, 0.50, 2.30, 1.35, 1.60, 0.6…
## $ TotHt  <dbl> 1.30, 1.35, 2.16, 1.80, 1.55, 1.20, 1.00, 1.70, 0.80, 1.20, 0.9…
## $ CanHt  <dbl> 1.00, 1.33, 0.60, 1.20, 1.05, 1.00, 0.90, 1.30, 0.60, 0.80, 0.6…
## $ Dens   <dbl> 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ LeafWt <dbl> 401.3, 513.7, 1179.2, 308.0, 855.2, 268.7, 155.5, 1253.2, 328.0…

6.2 Lineares Modell mit allen Prädiktoren

Vgl. S. 70 in ARM; die Daten passen nicht ganz zum Buch. Die Werte weichen etwas zum Buch ab.

lm7 <- lm(LeafWt ~ ., data = mesquite)
display(lm7)
## lm(formula = LeafWt ~ ., data = mesquite)
##             coef.est coef.se
## (Intercept) -970.66   312.68
## Obs           -2.88     6.10
## GroupMCD     292.19   181.51
## Diam1        189.17   113.91
## Diam2        372.31   125.65
## TotHt       -110.57   188.39
## CanHt        358.34   212.04
## Dens         129.16    34.98
## ---
## n = 46, k = 8
## residual sd = 271.68, R-Squared = 0.85

6.3 Multiplikatives Modell

lm8 <- lm(log(LeafWt) ~ log(Diam1) + log(Diam2) + log(TotHt) + log (CanHt) + log(Dens) + Group,
          data = mesquite)

display(lm8)
## lm(formula = log(LeafWt) ~ log(Diam1) + log(Diam2) + log(TotHt) + 
##     log(CanHt) + log(Dens) + Group, data = mesquite)
##             coef.est coef.se
## (Intercept) 4.77     0.16   
## log(Diam1)  0.39     0.28   
## log(Diam2)  1.15     0.21   
## log(TotHt)  0.39     0.31   
## log(CanHt)  0.37     0.28   
## log(Dens)   0.11     0.12   
## GroupMCD    0.58     0.13   
## ---
## n = 46, k = 7
## residual sd = 0.33, R-Squared = 0.89

7 Referenz

S. auch den Post zu ARM, Kap. 3