# Simulate IV with varying instrument strength
simulate_iv <- function(n = 150, strength = 1.0) {
tibble(
u = rnorm(n),
z = rnorm(n),
# Treatment depends on confounder u and instrument z
d = 0.8 * u + strength * z + rnorm(n, 0, 0.3),
# True effect of d is 1.5; confounder u biases OLS upward
y = 2 + 1.5 * d + 1.2 * u + rnorm(n, 0, 0.5)
)
}
set.seed(456)
n_sims <- 300
# Simulate across many strength values to get continuous F distribution
iv_results <- map_dfr(seq(0.02, 1.5, length.out = 15), function(str) {
map_dfr(1:20, function(i) {
dat <- simulate_iv(n = 150, strength = str)
fs <- lm(d ~ z, data = dat)
iv <- ivreg(y ~ d | z, data = dat)
ols <- lm(y ~ d, data = dat)
tibble(
strength = str,
iv_est = coef(iv)["d"],
ols_est = coef(ols)["d"],
f_stat = summary(fs)$fstatistic[1]
)
})
})
# Calculate OLS bias for reference
ols_biased <- mean(iv_results$ols_est)
ggplot(iv_results, aes(x = f_stat, y = iv_est)) +
# Shade the "danger zone" where F < 10
annotate("rect", xmin = 0, xmax = 10, ymin = -2, ymax = 6,
fill = accent_coral, alpha = 0.15) +
annotate("text", x = 5, y = 5.2, label = "Danger Zone\n(F < 10)",
color = accent_coral, fontface = "bold", size = 4.5) +
# Add points
geom_point(alpha = 0.5, size = 2.5, color = slate_gray) +
# True effect line
geom_hline(yintercept = 1.5, linetype = "solid", color = secondary_teal, linewidth = 1.5) +
annotate("text", x = 80, y = 1.5 + 0.3, label = "True Effect = 1.5",
color = secondary_teal, fontface = "bold", size = 4.5, hjust = 1) +
# OLS (biased) line
geom_hline(yintercept = ols_biased, linetype = "dashed", color = accent_coral, linewidth = 1.2) +
annotate("text", x = 80, y = ols_biased + 0.3, label = paste0("OLS Estimate = ", round(ols_biased, 2), " (biased)"),
color = accent_coral, fontface = "bold", size = 4.5, hjust = 1) +
# F = 10 threshold
geom_vline(xintercept = 10, linetype = "dashed", color = slate_gray, linewidth = 1) +
# Smooth trend to show bias pattern
geom_smooth(method = "loess", se = FALSE, color = primary_blue, linewidth = 1.5, span = 0.5) +
scale_x_continuous(limits = c(0, 90), breaks = c(0, 10, 20, 40, 60, 80)) +
scale_y_continuous(limits = c(-2, 6)) +
labs(
title = "Weak Instruments: Bias Toward OLS + High Variance",
subtitle = "Each dot is one simulation. Blue curve shows the average IV estimate at each F level.",
x = "First-Stage F-Statistic",
y = "IV Estimate"
) +
theme_health_econ(base_size = 18)