# Function to estimate beta_cy from height, PRS, and outcome
binary_2sls <- function(x, y, prs, method=c("probit", "logistic", "absolute")[1]) {
if(method == "probit") {
mod <- summary(glm(y ~ prs, family=binomial(link = "probit")))
} else if(method == "logistic") {
mod <- summary(glm(y ~ prs, family=binomial(link = "logit")))
} else if(method == "absolute") {
mod <- summary(lm(y ~ prs))
}
bgy <- mod$coefficients[2, 1]
se_bgy <- mod$coefficients[2, 2]
mod <- summary(lm(x ~ prs))
bgx <- mod$coefficients[2, 1]
se_bgx <- mod$coefficients[2, 2]
beta_xy <- bgy / bgx
se_beta_xy <- sqrt((se_bgy / bgx)^2 + (bgy * se_bgx / bgx^2)^2)
return(list(beta_xy = beta_xy, se_beta_xy = se_beta_xy))
}
estimate_beta_cy2 <- function(height, prs, outcome, var_c = 0.15, h2 = 0.7, n_boot = 1000, binary_method="probit") {
stopifnot(binary_method %in% c("probit", "logistic", "absolute"))
# Detect if outcome is binary or continuous
unique_vals <- unique(outcome)
is_binary <- length(unique_vals) == 2 && all(sort(unique_vals) == c(0, 1))
# Store original outcome for prevalence calculation
outcome_original <- outcome
# Calculate prevalence if binary
if (is_binary) {
prevalence <- mean(outcome)
# Calculate liability scale threshold transformation factor
threshold <- qnorm(prevalence)
liability_scale_factor <- dnorm(threshold)
cat(sprintf("Binary outcome detected (prevalence = %.3f)\n", prevalence))
cat(sprintf("Liability scale transformation factor = %.4f\n", liability_scale_factor))
} else {
# Standardize continuous outcome
outcome <- (outcome - mean(outcome)) / sd(outcome)
cat("Continuous outcome detected - standardized to mean=0, SD=1\n")
prevalence <- NA
liability_scale_factor <- 1
}
# Use instrumental variable regression to estimate causal effect
if(is_binary) {
iv_res <- binary_2sls(height, outcome, prs, method = binary_method)
} else {
iv_res <- binary_2sls(height, outcome, prs, method = "absolute")
}
beta_xy_prs <- iv_res$beta_xy
se_beta_xy <- iv_res$se_beta_xy
# Calculate R^2 for PRS
model_prs <- lm(height ~ prs)
prs_explained <- fitted.values(model_prs)
r2_prs <- var(prs_explained) / var(height)
# Estimate beta_cy using method 1 (from covariance of height and outcome)
beta_cy_method1_observed <- (cov(height, outcome) - beta_xy_prs * var(height)) / var_c
# Method 2: Using height residual
# Get residual height (height not explained by PRS)
residual_height <- residuals(model_prs)
# Estimate beta_cy using method 2 (from residual height)
beta_cy_method2_observed <- (cov(residual_height, outcome) - beta_xy_prs_observed * var(residual_height)) / var_c
# Transform to liability scale if binary
beta_cy_method1 <- beta_cy_method1_observed / liability_scale_factor
beta_cy_method2 <- beta_cy_method2_observed / liability_scale_factor
# Parametric bootstrap using 2SLS SE for beta_xy
n <- length(height)
boot_results <- matrix(NA, n_boot, 2)
for(i in 1:n_boot) {
# Sample with replacement
idx <- sample(1:n, n, replace = TRUE)
h_boot <- height[idx]
p_boot <- prs[idx]
y_boot <- outcome[idx]
# Parametrically resample beta_xy from its sampling distribution
beta_xy_boot <- rnorm(1, mean = beta_xy_prs, sd = se_beta_xy)
beta_xy_boot <- beta_xy_boot / liability_scale_factor
# Method 1 bootstrap
boot_observed_1 <- (cov(h_boot, y_boot) - beta_xy_boot_observed * var(h_boot)) / var_c
boot_results[i, 1] <- boot_observed_1 / liability_scale_factor
# Method 2 bootstrap
model_prs_boot <- lm(h_boot ~ p_boot)
res_boot <- residuals(model_prs_boot)
boot_observed_2 <- (cov(res_boot, y_boot) - beta_xy_boot_observed * var(res_boot)) / var_c
boot_results[i, 2] <- boot_observed_2 / liability_scale_factor
}
# Calculate standard errors
se_method1 <- sd(boot_results[, 1])
se_method2 <- sd(boot_results[, 2])
# Calculate covariance between the two estimators using GLS
# cov(β̂_cy,1, β̂_cy,2) ≈ 1/(N·(σ_C²)²) * [var(Y)var(X,X_res) + var(X,Y)var(X_res,Y)]
cov_x_xres <- cov(height, residual_height)
cov_x_y <- cov(height, outcome)
cov_xres_y <- cov(residual_height, outcome)
var_y <- var(outcome)
# Covariance of the two beta_cy estimates
cov_estimators <- (1 / (n * var_c^2)) * (var_y * cov_x_xres + cov_x_y * cov_xres_y)
# Build variance-covariance matrix
var_method1 <- se_method1^2
var_method2 <- se_method2^2
Sigma <- matrix(c(var_method1, cov_estimators,
cov_estimators, var_method2), nrow = 2)
# GLS weights (inverse of variance-covariance matrix)
Sigma_inv <- solve(Sigma)
weights <- Sigma_inv %*% c(1, 1)
weight_sum <- sum(weights)
# GLS combined estimate
beta_cy_combined <- (weights[1] * beta_cy_method1 + weights[2] * beta_cy_method2) / weight_sum
# Variance of combined estimate: var(β̂_combined) = 1 / (1' Σ^(-1) 1)
var_combined <- as.numeric(1 / (t(c(1, 1)) %*% Sigma_inv %*% c(1, 1)))
se_combined <- sqrt(var_combined)
# Apply GLS weights to bootstrap distributions
boot_combined <- (weights[1] * boot_results[, 1] + weights[2] * boot_results[, 2]) / weight_sum
# Return results
return(list(
beta_xy = beta_xy_prs,
se_beta_xy = se_beta_xy,
beta_cy_method1 = beta_cy_method1,
beta_cy_method2 = beta_cy_method2,
beta_cy_combined = beta_cy_combined,
se_method1 = se_method1,
se_method2 = se_method2,
se_combined = se_combined,
cov_estimators = cov_estimators,
correlation_estimators = cov_estimators / (se_method1 * se_method2),
weight_method1 = weights[1] / weight_sum,
weight_method2 = weights[2] / weight_sum,
r2_prs = r2_prs,
is_binary = is_binary,
prevalence = if(is_binary) prevalence else NA,
outcome_scale = if(is_binary) "liability" else "standardized",
beta_cy_method1_observed = beta_cy_method1_observed,
beta_cy_method2_observed = beta_cy_method2_observed,
boot_distribution_method1 = boot_results[, 1],
boot_distribution_method2 = boot_results[, 2],
boot_distribution_combined = boot_combined
))
}