library(MASS)       # load before tidyverse so MASS::select does not mask dplyr::select
library(tidyverse)
library(psych)
library(sjPlot)
library(ggthemes)
library(ggplot2)
library(purrr)
library(broom)
library(lm.beta)
library(RColorBrewer)
library(plotly)
library(glmnet)
library(glmnetUtils)
source("http://www.labape.com.br/rprimi/R/cria_quartis.R")
In a multiple regression with X1...Xn predicting Y, each Bi or βi coefficient represents the unique effect of that variable on Y, controlling for all the other variables in the model. If β1 corresponds to the unique effect of X1, and X1 is correlated with X2, where does the shared effect go?
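With two predictors and standardized variables the coefficients have a closed form: β1 = (r_Y1 - r_Y2*r_12)/(1 - r_12^2) and β2 = (r_Y2 - r_Y1*r_12)/(1 - r_12^2). The small helper below (betas_from_cor is just an illustrative name, not part of the simulation code that follows) computes these textbook formulas directly from the three correlations, so the simulated betas can be checked against them.
# Standardized betas for Y ~ X1 + X2, computed directly from the correlations
betas_from_cor <- function(r_Y1, r_Y2, r_12) {
  c(beta_1 = (r_Y1 - r_Y2 * r_12) / (1 - r_12^2),
    beta_2 = (r_Y2 - r_Y1 * r_12) / (1 - r_12^2))
}
betas_from_cor(r_Y1 = .50, r_Y2 = .50, r_12 = .90)  # both equal about .26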
The code below draws 3 variables, Y, X1, and X2, from a multivariate normal distribution with a specified correlation matrix var_cor. It builds a grid of possible correlation parameters (params) and then simulates data with the mvrnorm function.
# Grid of correlations: r_Y1 = cor(Y, X1), r_Y2 = cor(Y, X2), r_12 = cor(X1, X2)
params <- expand.grid(
  r_Y1 = seq(0, .68, by = .05),
  r_Y2 = seq(0, .68, by = .05),
  r_12 = seq(0, .88, by = .05)
)
simulate_3vars <- function(r_Y1, r_Y2, r_12, n = 400) {
  # Population correlation matrix for X1, X2 and Y
  var_cor <- matrix(
    c(1.00, r_12, r_Y1,
      r_12, 1.00, r_Y2,
      r_Y1, r_Y2, 1.00),
    byrow = TRUE, nrow = 3, ncol = 3,
    dimnames = list(
      c("X1", "X2", "Y"),
      c("X1", "X2", "Y")
    ))
  # empirical = TRUE forces the sample correlations to match var_cor exactly;
  # try() returns a "try-error" when the matrix is not positive definite
  df <- try(
    mvrnorm(n, mu = c(0, 0, 0),
            Sigma = var_cor, empirical = TRUE) %>% as.data.frame(),
    silent = TRUE)
  return(df)
}
# Simulate a data set for every row of the parameter grid and record whether the
# draw succeeded ("data.frame") or failed ("try-error", non-positive-definite matrix)
simulated_data <- params %>%
  mutate(
    data = pmap(., simulate_3vars),
    cl   = map_chr(data, class)
  )
simulated_data <- simulated_data %>%
  filter(cl == "data.frame") %>%
  mutate(
    mult_regres  = map(data, ~lm(Y ~ X1 + X2, data = .)),
    lasso_regres = map(data, ~glmnet(Y ~ X1 + X2, data = .)),  # formula interface from glmnetUtils
    beta   = map(mult_regres, ~lm.beta(.)),                    # adds standardized coefficients
    beta2  = map(beta, "coefficients"),  # equal to the standardized betas here, since the simulated data are standardized
    glance = map(mult_regres, glance)
  )
simulated_data <- simulated_data %>%
  mutate(
    beta2 = map(beta2, t),            # coefficient vector -> 1-row matrix
    beta2 = map(beta2, as.data.frame) # -> 1-row data frame, ready for unnesting
  )
summar <- simulated_data %>% select(glance) %>% unnest(glance)
betas  <- simulated_data %>% select(beta2) %>% unnest(beta2)
# Bind the correlation parameters to the coefficients and fit statistics;
# columns 5 and 6 hold the X1 and X2 coefficients
df <- simulated_data %>% select(1:3) %>% bind_cols(betas, summar)
names(df)[5:6] <- c("b_x1", "b_x2")
As explained in this Stack Exchange answer (https://stats.stackexchange.com/questions/24827/where-is-the-shared-variance-between-all-ivs-in-a-linear-multiple-regression-equ), when X1 and X2 have the same correlation with Y, then as X1 and X2 “come closer and closer to being perfectly correlated, their b-values in the multiple regression come closer and closer to HALF of the b-value in the simple linear regression of either one of them. However, as X1 and X2 come closer and closer to being perfectly correlated, the STANDARD ERROR of b1 and b2 moves closer and closer to infinity, so the t-values converge on zero. So, the t-values will converge on zero (i.e., no UNIQUE linear relationship between either X1 and Y or X2 and Y), but the b-values converge to half the value of the b-values in the simple linear regression … so as the correlation between X1 and X2 approaches unity, EACH of the partial slope coefficients approaches contributing equally to the prediction of the Y value, even though neither independent variable offers any UNIQUE explanation of the dependent variable” (HTH // Phil, 2017).
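Before plotting the full grid, here is a quick numeric check of that claim using the simulate_3vars function defined above (the values r_Y1 = r_Y2 = .50 and r_12 = .90 are just an illustrative choice):
# Equal correlations with Y, nearly collinear predictors
d_check <- simulate_3vars(r_Y1 = .50, r_Y2 = .50, r_12 = .90, n = 400)
coef(lm(Y ~ X1, data = d_check))          # simple regression slope: .50
summary(lm(Y ~ X1 + X2, data = d_check))  # b1 = b2 = (.50 - .50*.90)/(1 - .90^2) = .26,
                                          # approaching .50/2, with inflated standard errors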
This is shown in the simulation below:
df %>% filter(r_Y2 == .50 & r_Y1 == .50) %>%
ggplot(aes(y=b_x1, x=b_x2, color = r_12, size=r_12)) +
geom_point(alpha=1/2) +
scale_colour_gradientn(colours = brewer.pal(7, "Paired")) +
scale_y_continuous(breaks= seq(0.20, .55, by=.05), limits = c(.20,.55)) +
scale_x_continuous(breaks= seq(0.20, .55, by=.05), limits = c(.20,.55)) +
geom_vline(xintercept = .5, color = "orange") +
geom_hline(yintercept = .5, color = "orange") +
theme_minimal()
Now look how crazy the β's get when r_Y1 > r_Y2 as r_12 approaches 1. Here we are seeing Simpson's paradox.
df %>% filter(r_Y1 == .50 & r_Y2 == .25 ) %>%
ggplot(aes(y=b_x1, x=b_x2, color = r_12, size=r_12)) +
geom_point(alpha=1/2) +
scale_colour_gradientn(colours = brewer.pal(7, "Paired")) +
geom_vline(xintercept = .25, color = "orange") +
geom_hline(yintercept = .50, color = "orange") +
scale_y_continuous(breaks= round(seq(.3, 1, by=.10), 2), limits = c(.3,1)) +
scale_x_continuous(breaks= round(seq(-.6, .40, by=.10), 2), limits = c(-.6,.40)) +
theme_minimal()
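The closed-form expression confirms what the plot shows: with r_Y1 = .50 and r_Y2 = .25, β2 = (r_Y2 - r_Y1*r_12)/(1 - r_12^2) crosses zero at r_12 = r_Y2/r_Y1 = .50 and turns negative beyond it, even though X2 correlates positively with Y. A small sketch using the betas_from_cor helper introduced earlier:
# Standardized betas as cor(X1, X2) grows, holding r_Y1 = .50 and r_Y2 = .25
sapply(c(.30, .50, .70, .85), function(r_12) betas_from_cor(.50, .25, r_12))
# beta_2 is 0 at r_12 = .50 and grows increasingly negative beyond that,
# while beta_1 climbs above the simple-regression value of .50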
Now the same relationships as a 3D graph:
colors <- brewer.pal(7, "Paired")
# 3D view of both coefficients against the X1-X2 correlation (r_Y1 = .50, r_Y2 = .25)
df %>% filter(r_Y1 == .50 & r_Y2 == .25) %>%
  plot_ly(y = ~b_x1, z = ~b_x2, x = ~r_12,
          type = "scatter3d",
          mode = "markers", color = ~r_12,
          colors = colors,
          size = 3.5,
          opacity = 1/1.5)
# Same 3D view for the symmetric case (r_Y1 = r_Y2 = .50)
df %>% filter(r_Y1 == .50 & r_Y2 == .50) %>%
  plot_ly(y = ~b_x1, z = ~b_x2, x = ~r_12,
          type = "scatter3d",
          mode = "markers", color = ~r_12,
          colors = colors,
          size = 3.5,
          opacity = 1/1.5)
# http://varianceexplained.org/broom-gallery/snippets/broom-glmnet.html
tidy(simulated_data$lasso_regres[[1]])
tidied <- tidy(simulated_data$lasso_regres[[1]]) %>% filter(term != "(Intercept)")
ggplot(tidied, aes(step, estimate, group = term)) + geom_line()
coefficients <- simulated_data$lasso_regres[[1]] %>%
  broom::tidy() %>%
  mutate(log_lambda = log(lambda)) %>%
  filter(term != "(Intercept)")
plot(simulated_data$lasso_regres[[1]], xvar = "lambda", label = TRUE)
ggplot(coefficients, aes(x = log_lambda, y = estimate, col = term)) +
  geom_line() + coord_cartesian(xlim = range(coefficients$log_lambda))
tidied_cv <- tidy(simulated_data$lasso_regres[[1]])
glance_cv <- glance(simulated_data$lasso_regres[[1]])
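The _cv names above follow the linked broom gallery, where tidy() and glance() are applied to a cross-validated lasso fit. A minimal sketch of that step, assuming the formula interface for cv.glmnet provided by glmnetUtils (cvfit is just an illustrative name):
# Cross-validated lasso on the first simulated data set
cvfit <- cv.glmnet(Y ~ X1 + X2, data = simulated_data$data[[1]])
tidy(cvfit)    # cross-validation error estimate at each lambda
glance(cvfit)  # lambda.min and lambda.1se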