1 Pakete laden
library(tidyverse)
library(arm) # nicht wirklich wichtig, nur für Funktion "display"
library(foreign)
library(rio)
2 Lineare Transformationen
2.1 Daten laden: kidsiq
kidsiq <- read_csv("https://raw.githubusercontent.com/sebastiansauer/2021-sose/master/data/ARM/kidsiq.csv")
##
## ── Column specification ────────────────────────────────────────────────────────
## cols(
## kid_score = col_double(),
## mom_hs = col_double(),
## mom_iq = col_double(),
## mom_work = col_double(),
## mom_age = col_double()
## )
2.2 lm1: Interaktionseffekt
lm1 <- lm(kid_score ~ mom_hs + mom_iq + mom_hs:mom_iq,
data = kidsiq)
display(lm1)
## lm(formula = kid_score ~ mom_hs + mom_iq + mom_hs:mom_iq, data = kidsiq)
## coef.est coef.se
## (Intercept) -11.48 13.76
## mom_hs 51.27 15.34
## mom_iq 0.97 0.15
## mom_hs:mom_iq -0.48 0.16
## ---
## n = 434, k = 4
## residual sd = 17.97, R-Squared = 0.23
2.3 lm2: Zentrieren
kidsiq <-
kidsiq %>%
mutate(mom_hs_c = mom_hs - mean(mom_hs),
mom_iq_c = mom_iq - mean(mom_iq))
lm2 <- lm(kid_score ~ mom_hs_c + mom_iq_c + mom_hs_c:mom_iq_c,
data = kidsiq)
display(lm2)
## lm(formula = kid_score ~ mom_hs_c + mom_iq_c + mom_hs_c:mom_iq_c,
## data = kidsiq)
## coef.est coef.se
## (Intercept) 87.64 0.91
## mom_hs_c 2.84 2.43
## mom_iq_c 0.59 0.06
## mom_hs_c:mom_iq_c -0.48 0.16
## ---
## n = 434, k = 4
## residual sd = 17.97, R-Squared = 0.23
lm2a <- lm(kid_score ~ mom_hs + mom_iq_c + mom_hs:mom_iq_c,
data = kidsiq)
display(lm2a)
## lm(formula = kid_score ~ mom_hs + mom_iq_c + mom_hs:mom_iq_c,
## data = kidsiq)
## coef.est coef.se
## (Intercept) 85.41 2.22
## mom_hs 2.84 2.43
## mom_iq_c 0.97 0.15
## mom_hs:mom_iq_c -0.48 0.16
## ---
## n = 434, k = 4
## residual sd = 17.97, R-Squared = 0.23
2.4 lm3: z-Transformation
kidsiq <-
kidsiq %>%
mutate(mom_iq_z = (mom_iq - mean(mom_iq)) / sd(mom_iq))
lm3 <- lm(kid_score ~ mom_hs + mom_iq_z + mom_hs:mom_iq_c,
data = kidsiq)
display(lm3)
## lm(formula = kid_score ~ mom_hs + mom_iq_z + mom_hs:mom_iq_c,
## data = kidsiq)
## coef.est coef.se
## (Intercept) 85.41 2.22
## mom_hs 2.84 2.43
## mom_iq_z 14.53 2.23
## mom_hs:mom_iq_c -0.48 0.16
## ---
## n = 434, k = 4
## residual sd = 17.97, R-Squared = 0.23
3 Modelle mit log(y)
3.1 Daten laden
earn <- read_csv("https://raw.githubusercontent.com/sebastiansauer/2021-sose/master/data/ARM/heights.csv")
##
## ── Column specification ────────────────────────────────────────────────────────
## cols(
## earn = col_double(),
## height1 = col_double(),
## height2 = col_double(),
## sex = col_double(),
## race = col_double(),
## hisp = col_double(),
## ed = col_double(),
## yearbn = col_double(),
## height = col_double()
## )
3.2 lm4: earn_log
earn <-
earn %>%
mutate(earn_log = log(earn))
lm4 <- lm(earn_log ~ height,
data = earn %>% filter(earn > 0))
display(lm4)
## lm(formula = earn_log ~ height, data = earn %>% filter(earn >
## 0))
## coef.est coef.se
## (Intercept) 5.78 0.45
## height 0.06 0.01
## ---
## n = 1192, k = 2
## residual sd = 0.89, R-Squared = 0.06
3.3 lm5: earn_log mit zwei Prädiktoren
earn %>%
count(sex)
## # A tibble: 2 x 2
## sex n
## <dbl> <int>
## 1 1 748
## 2 2 1281
Leider wissen wir nicht, ob Frauen mit 1 oder mit 2 codiert sind. Gehen wir davon aus, dass Männer mit 1 kodiert sind (vgl. Buch ARM, S. 61).
earn <-
earn %>%
mutate(male = case_when(
sex == 2 ~ 0,
sex == 1 ~ 1))
lm5 <- lm(earn_log ~ height + male,
data = earn %>% filter(earn > 0))
display(lm5)
## lm(formula = earn_log ~ height + male, data = earn %>% filter(earn >
## 0))
## coef.est coef.se
## (Intercept) 8.15 0.60
## height 0.02 0.01
## male 0.42 0.07
## ---
## n = 1192, k = 3
## residual sd = 0.88, R-Squared = 0.09
3.4 lm6: Mit z-Transformation und Interaktion
earn <-
earn %>%
mutate(height_z = scale(height))
lm6 <- lm(earn_log ~ height_z + male + height_z:male,
data = earn %>% filter(earn > 0))
display(lm6)
## lm(formula = earn_log ~ height_z + male + height_z:male, data = earn %>%
## filter(earn > 0))
## coef.est coef.se
## (Intercept) 9.52 0.04
## height_z 0.06 0.05
## male 0.42 0.07
## height_z:male 0.03 0.07
## ---
## n = 1192, k = 4
## residual sd = 0.88, R-Squared = 0.09
Die Reihenfolge der Prädiktoren spielt keine Rolle:
lm6a <- lm(earn_log ~ height_z:male + male + height_z ,
data = earn %>% filter(earn > 0))
display(lm6)
## lm(formula = earn_log ~ height_z + male + height_z:male, data = earn %>%
## filter(earn > 0))
## coef.est coef.se
## (Intercept) 9.52 0.04
## height_z 0.06 0.05
## male 0.42 0.07
## height_z:male 0.03 0.07
## ---
## n = 1192, k = 4
## residual sd = 0.88, R-Squared = 0.09
4 LogY-LogX-Modelle
earn <-
earn %>%
mutate(height_log = log(height))
lm7 <- lm(earn_log ~ height_log + male,
data = earn %>% filter(earn > 0))
display(lm7)
## lm(formula = earn_log ~ height_log + male, data = earn %>% filter(earn >
## 0))
## coef.est coef.se
## (Intercept) 3.62 2.60
## height_log 1.41 0.62
## male 0.42 0.07
## ---
## n = 1192, k = 3
## residual sd = 0.88, R-Squared = 0.09
5 Weitere Transformationen
5.1 Diskretisierung metrischer Prädiktoren
lm7 <- lm(kid_score ~ as.factor(mom_work),
data = kidsiq)
display(lm7)
## lm(formula = kid_score ~ as.factor(mom_work), data = kidsiq)
## coef.est coef.se
## (Intercept) 82.00 2.31
## as.factor(mom_work)2 3.85 3.09
## as.factor(mom_work)3 11.50 3.55
## as.factor(mom_work)4 5.21 2.70
## ---
## n = 434, k = 4
## residual sd = 20.23, R-Squared = 0.02
6 “Buschbeispiel” - mesquite
6.1 Daten laden
mesquite <- read_csv("https://raw.githubusercontent.com/sebastiansauer/2021-sose/master/data/ARM/mesquite.csv")
##
## ── Column specification ────────────────────────────────────────────────────────
## cols(
## Obs = col_double(),
## Group = col_character(),
## Diam1 = col_double(),
## Diam2 = col_double(),
## TotHt = col_double(),
## CanHt = col_double(),
## Dens = col_double(),
## LeafWt = col_double()
## )
glimpse(mesquite)
## Rows: 46
## Columns: 8
## $ Obs <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, …
## $ Group <chr> "MCD", "MCD", "MCD", "MCD", "MCD", "MCD", "MCD", "MCD", "MCD", …
## $ Diam1 <dbl> 1.8, 1.7, 2.8, 1.3, 3.3, 1.4, 1.5, 3.9, 1.8, 2.1, 0.8, 1.3, 1.2…
## $ Diam2 <dbl> 1.15, 1.35, 2.55, 0.85, 1.90, 1.40, 0.50, 2.30, 1.35, 1.60, 0.6…
## $ TotHt <dbl> 1.30, 1.35, 2.16, 1.80, 1.55, 1.20, 1.00, 1.70, 0.80, 1.20, 0.9…
## $ CanHt <dbl> 1.00, 1.33, 0.60, 1.20, 1.05, 1.00, 0.90, 1.30, 0.60, 0.80, 0.6…
## $ Dens <dbl> 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ LeafWt <dbl> 401.3, 513.7, 1179.2, 308.0, 855.2, 268.7, 155.5, 1253.2, 328.0…
6.2 Lineares Modell mit allen Prädiktoren
Vgl. S. 70 in ARM; die Daten passen nicht ganz zum Buch. Die Werte weichen etwas zum Buch ab.
lm7 <- lm(LeafWt ~ ., data = mesquite)
display(lm7)
## lm(formula = LeafWt ~ ., data = mesquite)
## coef.est coef.se
## (Intercept) -970.66 312.68
## Obs -2.88 6.10
## GroupMCD 292.19 181.51
## Diam1 189.17 113.91
## Diam2 372.31 125.65
## TotHt -110.57 188.39
## CanHt 358.34 212.04
## Dens 129.16 34.98
## ---
## n = 46, k = 8
## residual sd = 271.68, R-Squared = 0.85
6.3 Multiplikatives Modell
lm8 <- lm(log(LeafWt) ~ log(Diam1) + log(Diam2) + log(TotHt) + log (CanHt) + log(Dens) + Group,
data = mesquite)
display(lm8)
## lm(formula = log(LeafWt) ~ log(Diam1) + log(Diam2) + log(TotHt) +
## log(CanHt) + log(Dens) + Group, data = mesquite)
## coef.est coef.se
## (Intercept) 4.77 0.16
## log(Diam1) 0.39 0.28
## log(Diam2) 1.15 0.21
## log(TotHt) 0.39 0.31
## log(CanHt) 0.37 0.28
## log(Dens) 0.11 0.12
## GroupMCD 0.58 0.13
## ---
## n = 46, k = 7
## residual sd = 0.33, R-Squared = 0.89
7 Referenz
S. auch den Post zu ARM, Kap. 3