8 min read

Randomization in presence of an interaction effect

Load packages

library(tidyverse)
library(rockchalk)
library(MASS)
library(ggdag)

Problem statement

Assume that \(X\) and \(Y\) are correlated contingent on some third variable, \(Z\). For simplicity, assume that, if \(z=0\), \(_0=0.7\), and if \(z=1\), then \(r_1=-0.7\). This is not a causal statement.

Simulate data

Let the sample size amount to \(n=1000\).

n <- 1e03

Group A, \(z=0\):

myR <- lazyCor(X = 0.7, d = 2)
mySD <- c(1, 1)
myCov <- lazyCov(Rho = myR, Sd = mySD)

set.seed(12345)
x_z0 <- MASS::mvrnorm(n=n, mu = rep(0, 2), Sigma = myCov)


head(x_z0)
#>            [,1]       [,2]
#> [1,] -0.1098666  1.1895284
#> [2,]  0.6233152  0.6848755
#> [3,]  0.2309203 -0.4324656
#> [4,] -0.1164846 -0.7197229
#> [5,]  0.7061365  0.4110647
#> [6,] -0.9412289 -2.4109163
cor(x_z0)
#>           [,1]      [,2]
#> [1,] 1.0000000 0.6953559
#> [2,] 0.6953559 1.0000000

Group B, \(z=1\):

myR <- lazyCor(X = -0.7, d = 2)
mySD <- c(1, 1)
myCov <- lazyCov(Rho = myR, Sd = mySD)

set.seed(54321)
x_z1 <- MASS::mvrnorm(n=n, mu = rep(0, 2), Sigma = myCov)


head(x_z1)
#>            [,1]       [,2]
#> [1,] 0.11798164 -0.2118950
#> [2,] 1.34148393 -0.3697449
#> [3,] 1.68763031  0.2419436
#> [4,] 1.27139286 -1.7721640
#> [5,] 0.09414412 -0.6582934
#> [6,] 0.60327601 -1.4167803
cor(x_z1)
#>            [,1]       [,2]
#> [1,]  1.0000000 -0.6982262
#> [2,] -0.6982262  1.0000000

Enframe

x_z0_df <- data.frame(x_z0)
x_z1_df <- data.frame(x_z1)



mydata_raw <- bind_rows(x_z0_df, x_z1_df)

mydata <- mydata_raw %>% rename(x = X1, y = X2)

head(mydata)
#>            x          y
#> 1 -0.1098666  1.1895284
#> 2  0.6233152  0.6848755
#> 3  0.2309203 -0.4324656
#> 4 -0.1164846 -0.7197229
#> 5  0.7061365  0.4110647
#> 6 -0.9412289 -2.4109163

Add group information:

mydata <- mydata %>% 
  mutate(z = c(rep(0,n), rep(1, n)))

head(mydata)
#>            x          y z
#> 1 -0.1098666  1.1895284 0
#> 2  0.6233152  0.6848755 0
#> 3  0.2309203 -0.4324656 0
#> 4 -0.1164846 -0.7197229 0
#> 5  0.7061365  0.4110647 0
#> 6 -0.9412289 -2.4109163 0

Marginal correlation

We know the conditional correlation, because we simulated it. Now let’s look at the marginal (unconditional) correlation:

mydata %>% 
  dplyr::select(x, y) %>% 
  cor()
#>              x            y
#> x  1.000000000 -0.008160122
#> y -0.008160122  1.000000000

In sum, we have zero correlation.

Moderation framework

\(Z\) can be seen as a moderator in a regression framework:

lm1 <- lm(y ~ x * z, data = mydata)

summary(lm1)
#> 
#> Call:
#> lm(formula = y ~ x * z, data = mydata)
#> 
#> Residuals:
#>      Min       1Q   Median       3Q      Max 
#> -2.72085 -0.48517 -0.00496  0.49473  2.76392 
#> 
#> Coefficients:
#>              Estimate Std. Error t value Pr(>|t|)    
#> (Intercept) -0.008049   0.023002  -0.350    0.726    
#> x            0.715018   0.023301  30.686   <2e-16 ***
#> z           -0.002579   0.032532  -0.079    0.937    
#> x:z         -1.413871   0.032580 -43.397   <2e-16 ***
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#> 
#> Residual standard error: 0.7263 on 1996 degrees of freedom
#> Multiple R-squared:  0.4864, Adjusted R-squared:  0.4856 
#> F-statistic:   630 on 3 and 1996 DF,  p-value: < 2.2e-16

We see that \(Z\) is itself not associated with the outcome, \(Y\), but it changes the association of \(X\) with \(Y\).

mydata %>% 
  ggplot(aes(x = x, y = y, color = factor(z))) +
  geom_point() +
  geom_smooth(methode = "lm", se = FALSE)

Compare that to this model, which does not take the moderator into account:

lm2 <- lm(y ~ x, data = mydata)

summary(lm2)
#> 
#> Call:
#> lm(formula = y ~ x, data = mydata)
#> 
#> Residuals:
#>     Min      1Q  Median      3Q     Max 
#> -3.2100 -0.6867 -0.0082  0.6775  3.8245 
#> 
#> Coefficients:
#>              Estimate Std. Error t value Pr(>|t|)
#> (Intercept) -0.009931   0.022684  -0.438    0.662
#> x           -0.008284   0.022711  -0.365    0.715
#> 
#> Residual standard error: 1.013 on 1998 degrees of freedom
#> Multiple R-squared:  6.659e-05,  Adjusted R-squared:  -0.0004339 
#> F-statistic: 0.1331 on 1 and 1998 DF,  p-value: 0.7153
ggplot(mydata, aes(x = x, y = y)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE)

No (marginal) association, as can be seen in the above diagram.

Causal explanation

One causal model explaining this interaction could be:

dag1 <- dagitty::dagitty("dag{
x -> z
y -> z
}")

dag1
#> [1] "dag {\nx\ny\nz\nx -> z\ny -> z\n}\n"
#> attr(,"class")
#> [1] "dagitty"
dag1_tidy <- tidy_dagitty(dag1)
ggdag(dag1) +
  theme_dag_blank()

This causal setting is called a Collider Bias, see e.g. this source.

collider_triangle() %>% 
  ggdag_dseparated(controlling_for = "m") +
  theme_dag()

Controlling for m (\(Z\) in our notation) introduces a bias, that is a spurious association between x and y.

Regression according to Collider model

lm_collider <- lm(z ~ x*y, data = mydata)

summary(lm_collider)
#> 
#> Call:
#> lm(formula = z ~ x * y, data = mydata)
#> 
#> Residuals:
#>      Min       1Q   Median       3Q      Max 
#> -0.87261 -0.45349  0.02151  0.45476  1.18418 
#> 
#> Coefficients:
#>              Estimate Std. Error t value Pr(>|t|)    
#> (Intercept)  0.498183   0.009657  51.587   <2e-16 ***
#> x            0.002695   0.009669   0.279    0.780    
#> y           -0.005104   0.009541  -0.535    0.593    
#> x:y         -0.182551   0.006954 -26.252   <2e-16 ***
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#> 
#> Residual standard error: 0.4312 on 1996 degrees of freedom
#> Multiple R-squared:  0.2579, Adjusted R-squared:  0.2568 
#> F-statistic: 231.2 on 3 and 1996 DF,  p-value: < 2.2e-16

However, there’s no association (marginally) between \(X\) and \(Y\):

lm_xy <- lm(y ~ x, data = mydata)

summary(lm_xy)
#> 
#> Call:
#> lm(formula = y ~ x, data = mydata)
#> 
#> Residuals:
#>     Min      1Q  Median      3Q     Max 
#> -3.2100 -0.6867 -0.0082  0.6775  3.8245 
#> 
#> Coefficients:
#>              Estimate Std. Error t value Pr(>|t|)
#> (Intercept) -0.009931   0.022684  -0.438    0.662
#> x           -0.008284   0.022711  -0.365    0.715
#> 
#> Residual standard error: 1.013 on 1998 degrees of freedom
#> Multiple R-squared:  6.659e-05,  Adjusted R-squared:  -0.0004339 
#> F-statistic: 0.1331 on 1 and 1998 DF,  p-value: 0.7153

Effect of randomization of \(Z\)

Now let’s randomize (permutate) \(Z\) to see what happens.

mydata <- mydata %>% 
  mutate(z_permutated = sample(z, size = length(z)))

head(mydata)
#>            x          y z z_permutated
#> 1 -0.1098666  1.1895284 0            1
#> 2  0.6233152  0.6848755 0            1
#> 3  0.2309203 -0.4324656 0            0
#> 4 -0.1164846 -0.7197229 0            1
#> 5  0.7061365  0.4110647 0            0
#> 6 -0.9412289 -2.4109163 0            0
mydata %>% 
  dplyr::select(x, y, z, z_permutated) %>% 
  cor()
#>                         x            y            z z_permutated
#> x             1.000000000 -0.008160122  0.002151602  -0.05724464
#> y            -0.008160122  1.000000000 -0.040716742  -0.03281496
#> z             0.002151602 -0.040716742  1.000000000  -0.00200000
#> z_permutated -0.057244638 -0.032814956 -0.002000000   1.00000000

Unconditional correlated remains unchanged

mydata %>% 
  dplyr::select(x, y, z) %>% 
  group_by(z) %>% 
  summarise(r_marginal = cor(x,y))
#> # A tibble: 2 x 2
#>       z r_marginal
#>   <dbl>      <dbl>
#> 1     0      0.695
#> 2     1     -0.698

The permutation deleted the correlation

Within each group, that is, \(z=0\) and \(z=1\), there’s no correlation left.

mydata %>% 
  group_by(z_permutated) %>% 
  summarise(r_marginal = cor(x,y))
#> # A tibble: 2 x 2
#>   z_permutated r_marginal
#>          <dbl>      <dbl>
#> 1            0    -0.0448
#> 2            1     0.0271

Pause and ponder

Permutating (randomizing) a group variable that acts as a moderator as in the case above will erase the moderator effect.

This is due to the fact that a randomized experiment will have all the paths going into the to-be randomized variable having cut. This in turn means that the randomized variable, \(Z\) will not be biased/influenced by any other variable.

Can a fork be a moderator?

Assume the following DAG:

dag_fork <- dagitty::dagitty("dag{
x <- z
y <- z
}")

dag_fork
#> [1] "dag {\nx\ny\nz\nz -> x\nz -> y\n}\n"
#> attr(,"class")
#> [1] "dagitty"
dag_fork_tidy <- tidy_dagitty(dag_fork)
ggdag(dag_fork) +
  theme_dag_blank()

Simulate data accordingly

z <- sample(x = c(0,1), size = n, replace = TRUE)


x <- z + 0.3 * rnorm(n = n)
y <- z + 0.3 * rnorm(n = n)

Put the data together

mydata2 <- tibble(
  z = z,
  x = x,
  y = y,
  z_permuated = sample(z, size = n)
)

Regression time

lm_fork <- lm(y ~ x * z, data = mydata2)

summary(lm_fork)
#> 
#> Call:
#> lm(formula = y ~ x * z, data = mydata2)
#> 
#> Residuals:
#>     Min      1Q  Median      3Q     Max 
#> -0.9181 -0.2069  0.0067  0.1972  0.8498 
#> 
#> Coefficients:
#>             Estimate Std. Error t value Pr(>|t|)    
#> (Intercept) -0.01526    0.01334  -1.144    0.253    
#> x           -0.03727    0.04334  -0.860    0.390    
#> z            1.02056    0.04901  20.824   <2e-16 ***
#> x:z          0.04079    0.06255   0.652    0.514    
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#> 
#> Residual standard error: 0.2952 on 996 degrees of freedom
#> Multiple R-squared:  0.7512, Adjusted R-squared:  0.7504 
#> F-statistic:  1002 on 3 and 996 DF,  p-value: < 2.2e-16

No, it appears that this is not a way a fork can act as a moderator.

Can achain act as a moderator?

Assume the following DAG:

dag_chain <- dagitty::dagitty("dag{
z -> x -> y
}")

dag_chain
#> [1] "dag {\nx\ny\nz\nx -> y\nz -> x\n}\n"
#> attr(,"class")
#> [1] "dagitty"
dag_chain_tidy <- tidy_dagitty(dag_chain)
ggdag(dag_chain_tidy) +
  theme_dag_blank()

Simulate data

z1 <- sample(c(0,1), size = n, replace = TRUE, prob = c(0.3, .7))
z2 <- sample(c(0,1), size = n, replace = TRUE, prob = c(0.7, .3))


mydata3 <- tibble(
  z = c(z1, z2),
  x = z + 0.1 * rnorm(n=n),
  y = x + 0.1 * rnorm(n=n)
)

Regression time

lm_chain <- lm(y ~ x*z, data = mydata3)

summary(lm_chain)
#> 
#> Call:
#> lm(formula = y ~ x * z, data = mydata3)
#> 
#> Residuals:
#>      Min       1Q   Median       3Q      Max 
#> -0.26374 -0.07260  0.00033  0.06069  0.37606 
#> 
#> Coefficients:
#>              Estimate Std. Error t value Pr(>|t|)    
#> (Intercept) -0.003637   0.003102  -1.172    0.241    
#> x            0.987410   0.031431  31.415   <2e-16 ***
#> z            0.030316   0.032242   0.940    0.347    
#> x:z         -0.018902   0.044884  -0.421    0.674    
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#> 
#> Residual standard error: 0.09965 on 1996 degrees of freedom
#> Multiple R-squared:  0.9632, Adjusted R-squared:  0.9631 
#> F-statistic: 1.74e+04 on 3 and 1996 DF,  p-value: < 2.2e-16

Given this DAG, we cannot expect to see a moderator effect.

Debrief

It appears that a moderator is only compatible with a limited number of causal models. Secondly, we see that randomizing (permutating) erases the association, as should be expected.

There can be said much more to the relation of moderation (interaction) and causality, see eg., this paper, or this or this