
Libraries

    # load MASS before the tidyverse so that dplyr::select is not masked by MASS::select
    library(MASS)
    library(tidyverse)
    library(psych)
    library(sjPlot)
    library(ggthemes)
    library(ggplot2)
    library(purrr)
    library(broom)
    library(lm.beta)
    library(RColorBrewer)
    library(plotly)
    library(glmnet)
    library(glmnetUtils)

    source("http://www.labape.com.br/rprimi/R/cria_quartis.R")

Question

In multiple regression with X1...Xn predicting Y, each Bi or βi coefficient represents the unique effect of that variable on Y, controlling for all the other variables in the model. If β1 corresponds to the unique effect of X1, and X1 is correlated with X2, where does the shared effect go?
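Before the full simulation, here is a minimal sketch of what the question means in code (the correlations below are arbitrary illustration values, not taken from the simulation grid): when X1 and X2 are correlated, the simple slope of X1 carries the shared part, while the partial slope keeps only the unique part.

    # minimal sketch: simple vs. multiple regression slope of X1
    library(MASS)

    S <- matrix(c(1.0, 0.6, 0.5,
                  0.6, 1.0, 0.5,
                  0.5, 0.5, 1.0),
                nrow = 3,
                dimnames = list(c("X1", "X2", "Y"), c("X1", "X2", "Y")))

    d <- as.data.frame(mvrnorm(400, mu = c(0, 0, 0), Sigma = S, empirical = TRUE))

    coef(lm(Y ~ X1, data = d))        # simple slope: the full r_Y1 = .50
    coef(lm(Y ~ X1 + X2, data = d))   # partial slope: only the unique part of X1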

Data simulation

This code draws three variables, Y, X1, and X2, from a multivariate normal distribution with a specified correlation matrix var_cor.
It builds a grid of candidate correlation parameters and then simulates data with the mvrnorm function. Some combinations in the grid do not form a positive-definite correlation matrix; for those rows mvrnorm fails, so the call is wrapped in try() and the failed cases are filtered out afterwards.

  # grid of candidate correlations among Y, X1 and X2
  params <- expand.grid(
      r_Y1 = seq(0, .68, by = .05), 
      r_Y2 = seq(0, .68, by = .05),
      r_12 = seq(0, .88, by = .05)
  )
 
  simulate_3vars <- function(r_Y1, r_Y2, r_12, n = 400) {
    var_cor <- matrix(
      c(1.00,  r_12,  r_Y1,
        r_12,  1.00,  r_Y2, 
        r_Y1,  r_Y2,  1.00), 
      byrow = TRUE, nrow = 3, ncol = 3, 
      dimnames = list(
        c("X1", "X2", "Y"), 
        c("X1", "X2", "Y")
      ))
    # empirical = TRUE makes the sample correlations match var_cor exactly;
    # try() captures the error when var_cor is not positive definite
    df <- try(
      mvrnorm(n, mu = c(0, 0, 0), 
              Sigma = var_cor, empirical = TRUE) %>% as.data.frame(), 
      silent = TRUE) 
    return(df)
  }
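Because the grid combines each correlation independently, some rows imply an impossible correlation structure. A quick way to see how many rows will fail (a sketch with a hypothetical helper, not part of the original pipeline) is to check whether the implied matrix is positive definite:

  # hypothetical helper: TRUE when the implied correlation matrix is positive definite
  is_pd <- function(r_Y1, r_Y2, r_12) {
    m <- matrix(c(1,    r_12, r_Y1,
                  r_12, 1,    r_Y2,
                  r_Y1, r_Y2, 1), nrow = 3)
    min(eigen(m, only.values = TRUE)$values) > 0
  }

  params %>% mutate(pd = pmap_lgl(., is_pd)) %>% count(pd)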


  # simulate one dataset per row of the grid and record the class of the result
  # (a "try-error" when the correlation matrix was not positive definite)
  simulated_data  <- params %>% 
     mutate(
       data = pmap(., simulate_3vars),
       cl = map_chr(data, class)
     )
  
  
 
 simulated_data <- simulated_data %>%
     filter(cl == "data.frame") %>%
     mutate(
      mult_regres = map(data, ~lm(Y ~ X1 + X2, data = .)),
      # glmnetUtils supplies the formula interface to glmnet
      lasso_regres = map(data, ~glmnet(Y ~ X1 + X2, data = .)),
      beta = map(mult_regres, ~lm.beta(.)),
      beta2 = map(beta, "coefficients"),
      glance = map(mult_regres, glance)
     )

 
 
  simulated_data <- simulated_data %>%
     mutate(
      beta2 = map(beta2, t),
      beta2 = map(beta2, as.data.frame)
     )
 
  summar <- simulated_data %>% select(glance) %>% unnest(cols = glance) 
  betas <- simulated_data %>% select(beta2) %>% unnest(cols = beta2)
     
  # columns 1:3 are r_Y1, r_Y2, r_12; betas adds (Intercept), X1, X2
  df <- simulated_data %>% select(1:3) %>% bind_cols(betas, summar)

  names(df)[5:6] <- c("b_x1", "b_x2")
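
A quick sanity check on the assembled df (a sketch, not in the original script): when r_12 = 0 nothing is shared, so each beta should simply equal its zero-order correlation with Y.

  # with uncorrelated predictors, each beta equals its correlation with Y
  df %>% 
    filter(r_12 == 0) %>% 
    summarise(
      max_dev_x1 = max(abs(b_x1 - r_Y1)),
      max_dev_x2 = max(abs(b_x2 - r_Y2))
    )   # both deviations should be essentially zero
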
Results

As explained at https://stats.stackexchange.com/questions/24827/where-is-the-shared-variance-between-all-ivs-in-a-linear-multiple-regression-equ, when X1 and X2 have the same correlation with Y, as X1 and X2 “come closer and closer to being perfectly correlated, their b-values in the multiple regression come closer and closer to HALF of the b-value in the simple linear regression of either one of them. However, as X1 and X2 come closer and closer to being perfectly correlated, the STANDARD ERROR of b1 and b2 moves closer and closer to infinity, so the t-values converge on zero. So, the t-values will converge on zero (i.e., no UNIQUE linear relationship between either X1 and Y or X2 and Y), but the b-values converge to half the value of the b-values in the simple linear regression … so as the correlation between X1 and X2 approaches unity, EACH of the partial slope coefficients approaches contributing equally to the prediction of the Y value, even though neither independent variable offers any UNIQUE explanation of the dependent variable” (HTH // Phil, 2017)
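
For two predictors the standardized coefficients can be written directly in terms of the correlations (a standard result, stated here to make the limit explicit):

$$\beta_1 = \frac{r_{Y1} - r_{12}\, r_{Y2}}{1 - r_{12}^2}, \qquad
\beta_2 = \frac{r_{Y2} - r_{12}\, r_{Y1}}{1 - r_{12}^2}$$

With r_Y1 = r_Y2 = r both reduce to r / (1 + r_12), which tends to r/2 as r_12 approaches 1, exactly the “half of the simple slope” described above.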

This is shown in the simulation:

  df %>% filter(r_Y2 == .50 & r_Y1 == .50) %>%
    ggplot(aes(y=b_x1, x=b_x2, color = r_12, size=r_12)) + 
    geom_point(alpha=1/2)  +
    scale_colour_gradientn(colours = brewer.pal(7, "Paired")) +
    scale_y_continuous(breaks= seq(0.20, .55, by=.05), limits = c(.20,.55)) +
    scale_x_continuous(breaks= seq(0.20, .55, by=.05), limits = c(.20,.55)) +
    geom_vline(xintercept = .5, color = "orange") +
    geom_hline(yintercept = .5,  color = "orange") +
    theme_minimal()

Now look how crazy the βs get when r_Y1 > r_Y2 as r_12 approaches 1: the sign of b_x2 eventually flips. Here we are seeing Simpson’s paradox (see the sketch after the plot code below).

 df %>% filter(r_Y1 == .50 & r_Y2 == .25 ) %>%
    ggplot(aes(y=b_x1, x=b_x2, color = r_12, size=r_12)) + 
    geom_point(alpha=1/2)  +
    scale_colour_gradientn(colours = brewer.pal(7, "Paired")) +
    geom_vline(xintercept = .25, color = "orange") +
    geom_hline(yintercept = .50,  color = "orange") +
    scale_y_continuous(breaks= round(seq(.3, 1, by=.10), 2), limits = c(.3,1)) +
    scale_x_continuous(breaks= round(seq(-.6, .40, by=.10), 2), limits = c(-.6,.40)) +
    theme_minimal()
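
The same pattern can be checked without simulation, using the two-predictor formula above (a sketch with a hypothetical helper, not part of the original pipeline):

  # standardized betas from the correlations alone, here for r_Y1 = .50, r_Y2 = .25
  beta_from_r <- function(r_Y1, r_Y2, r_12) {
    tibble(
      r_12 = r_12,
      b_x1 = (r_Y1 - r_12 * r_Y2) / (1 - r_12^2),
      b_x2 = (r_Y2 - r_12 * r_Y1) / (1 - r_12^2)
    )
  }

  beta_from_r(.50, .25, seq(0, .85, by = .05))
  # b_x2 crosses zero once r_12 exceeds r_Y2 / r_Y1 = .5, while b_x1 keeps growing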

Now the same relationships as 3D graphs:

    colors <- brewer.pal(7, "Paired")
    
    df %>% filter(r_Y1 == .50 & r_Y2 == .25 ) %>%
      plot_ly(y=~b_x1, z=~b_x2, x=~r_12,
        type="scatter3d", 
        mode="markers", color=~r_12, 
          colors = colors,
        size = 3.5,
        opacity = 1/1.5)
   colors <- brewer.pal(7, "Paired")
    
    df %>% filter(r_Y1 == .50 & r_Y2 == .50 ) %>%
      plot_ly(y=~b_x1, z=~b_x2, x=~r_12,
        type="scatter3d", 
        mode="markers", color=~r_12, 
          colors = colors,
        size = 3.5,
        opacity = 1/1.5)

Lasso regression

# http://varianceexplained.org/broom-gallery/snippets/broom-glmnet.html
 

    tidy(simulated_data$lasso_regres[[1]])

    tidied <- tidy(simulated_data$lasso_regres[[1]]) %>% filter(term != "(Intercept)")
    
    ggplot(tidied, aes(step, estimate, group = term)) + geom_line()

    coefficients <- simulated_data$lasso_regres[[1]] %>%
     broom::tidy() %>%
     mutate(log_lambda = log(lambda)) %>%
     filter(term != "(Intercept)")
    
    plot(simulated_data$lasso_regres[[1]], xvar = "lambda", label = TRUE)
    
    ggplot(coefficients, aes(x=log_lambda, y=estimate, col=term)) +
     geom_line() + coord_cartesian(xlim = range(coefficients$log_lambda))
    
   
    tidied_cv <- tidy(simulated_data$lasso_regres[[1]])
    glance_cv <- glance(simulated_data$lasso_regres[[1]])
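
The tidied_cv / glance_cv names follow the broom gallery post, which works on a cross-validated fit. A sketch of that step (an assumption, not in the original) using cv.glmnet through the glmnetUtils formula interface on the first simulated dataset:

    # cross-validated lasso on one simulated dataset
    cv_fit <- cv.glmnet(Y ~ X1 + X2, data = simulated_data$data[[1]])

    tidy(cv_fit)     # cross-validation error estimate at each lambda
    glance(cv_fit)   # lambda.min and lambda.1se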