Summary stats and imputation – Gibran Hemani’s lab book

Background

Summary imputation is very slow. This very fast approximation is based on simulating summary statistics for a region given knowledge of the causal variants and an LD matrix.

library(dplyr)


Attaching package: 'dplyr'

The following objects are masked from 'package:stats':

    filter, lag

The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union

library(data.table)


Attaching package: 'data.table'

The following objects are masked from 'package:dplyr':

    between, first, last

library(MASS)


Attaching package: 'MASS'

The following object is masked from 'package:dplyr':

    select

library(ggplot2)

# Alternatives to matrix inversion (ignore just use MASS::ginv)
solve2 <- function(A) {
    eig <- eigen(A)
    eig$values <- 1 / eig$values
    return(eig$vectors %*% diag(eig$values) %*% t(eig$vectors))
}

solve3 <- function(A, lambda=1e-6) {
    solve(A + diag(nrow(A)) * lambda)
}

Get some data for testing etc

system("plink2 --bfile ~/repo/opengwas-api-internal/opengwas-api/app/ld_files/EUR --chr 1 --from-mb 10 --to-mb 10.4 --recode A --out temp")
G <- fread("temp.raw", header = TRUE)
G <- G[,-c(1:6)] %>% as.matrix
X <- G
nsnp <- ncol(X)
for(i in 1:nsnp) {
    X[,i] <- X[,i] - mean(X[,i])
}
af <- colMeans(G) / 2
save(X, af, file="1kg_region.rdata")

Load the data

load(url("https://github.com/explodecomputer/lab-book/raw/refs/heads/main/posts/2024-09-18-conditional-summary-stats/1kg_region.rdata"))

#' Basic imputation function
#' 
#' @param R The correlation matrix - must be complete for the set of SNPs that need to be imputed
#' @param ss A data frame with columns betahat2 = vector of effect estimates in the same order as R and with NAs for variants that need to be imputed; se = as with betahat2 but for available standard errors, af = allele frequencies (no missing values allowed, so use reference panel if there are missing values)
#' @param index The positions of the SNPs that are causal and will be used to generate the simulated summary statistics. This can just be the top hit.
#' 
#' @return A list with the following elements:
#' - ss: The input data frame with the imputed values added
#' - b_adj: The adjustment factor for the effect sizes
#' - se_adj: The adjustment factor for the standard errors
#' - b_cor: The correlation between the true and imputed effect sizes - this is critical for evaluation of the performance of the imputation, it should be close to 1 e.g > 0.7 would be a reasonable threshold
#' - se_cor: The correlation between the true and imputed standard errors
imp <- function(R, ss, index) {
    b <- ss$betahat2
    se <- ss$se2
    af <- ss$af
    nsnp <- length(b)
    stopifnot(ncol(R) == nsnp)
    stopifnot(nrow(R) == nsnp)
    stopifnot(length(af) == nsnp)
    stopifnot(length(se) == nsnp)
    stopifnot(all(index) %in% 1:nsnp)
    stopifnot(length(index) < nsnp)
    stopifnot(all(af > 0 & af < 1))
    stopifnot(all(!is.na(af)))
    stopifnot(all(se > 0, na.rm=TRUE))
    if(all(!is.na(b))) {
        message("No missing values in b, imputation not required")
        b_cor=1
        se_cor=1
        mod1=1
        mod2=1
    } else {
        # Calculate the diagonal matrix of variances and the inverse
        D <- diag(sqrt(2 * af * (1 - af)))
        Di <- diag(1 / diag(D))

        # Get the conditional estimates of the index SNP effects
        if(length(index) == 1) {
            bhat2 <- b[index]
        } else {
            bhat2 <- D[index,index] %*% MASS::ginv(R[index,index]) %*% Di[index,index] %*% b[index]
        }
        b2 <- rep(0, nsnp)
        b2[index] <- bhat2

        # Get the simulated effect sizes
        betahat_sim <- as.numeric(Di %*% R %*% D %*% b2)

        # Initialise the SE - this doesn't account for var(y) or sample size, but those are constants that can be obtained from regression re-scaling
        sehat <- sqrt(diag(Di))

        # Re-scale effect sizes and standard errors
        # vb <- var(b, na.rm=TRUE)
        # vse <- var(se, na.rm=TRUE)
        # mod1 <- cov(b, betahat_sim, use="pair") / vb
        mod1 <- lm(betahat_sim ~ b)$coef[2]
        # mod2 <- cov(se, sehat, use="pair") / vse
        mod2 <- lm(sehat ~ se)$coef[2]

        # Performance metrics
        # b_cor = mod1 * sqrt(vb) / sd(betahat_sim, na.rm=TRUE)
        b_cor <- cor(b, betahat_sim, use="pair")
        # se_cor = mod2 * sqrt(vse) / sd(sehat, na.rm=TRUE)
        se_cor <- cor(se, sehat, use="pair")

        # Re-scale
        betahat_sim <- betahat_sim / mod1
        sehat <- sehat / mod2

        # Fill in missing values
        b[is.na(b)] <- betahat_sim[is.na(b)]
        se[is.na(se)] <- sehat[is.na(se)]

        stopifnot(all(!is.na(b)))
        stopifnot(all(!is.na(se)))
    }

    ss$betahatimp <- b
    ss$seimp <- se
    ss$zimp <- b / se
    ss$pimp <- 2 * pnorm(-abs(ss$zimp))

    # Output
    out <- list(
        ss = ss,
        b_adj = mod1,
        se_adj = mod2,
        b_cor = b_cor,
        se_cor = se_cor,
        n_ind = length(index)
    )
    return(out)
}

Run some simulations to test the performance across different scenarios

simulate_ss <- function(X, af, ncause, sigmag, seed=1234) {
    set.seed(seed)
    nsnp <- length(af)
    nid <- nrow(X)
    b <- rep(0, nsnp)
    b[sample(1:nsnp, ncause)] <- rnorm(ncause, sd=sigmag)

    e <- rnorm(nid)
    y <- X %*% b + e 

    betahat <- sapply(1:nsnp, \(i) {cov(X[,i], y) / var(X[,i])})
    se <- sapply(1:nsnp, \(i) {sqrt(var(y) / (var(X[,i] * sqrt(nid))))})
    zhat <- betahat/se
    pval <- 2 * pnorm(-abs(zhat))

    return(tibble(betahat, b, se, zhat, pval, af))
}

generate_missing <- function(ss, frac) {
    ss <- ss %>% mutate(
        betahat2 = ifelse(runif(n()) < frac, NA, betahat),
        se2 = ifelse(is.na(betahat2), NA, se),
        zhat2 = ifelse(is.na(betahat2), NA, zhat))
    return(ss)
}

clump <- function(z, R, zthresh = qnorm(1e-5, low=F), rthresh = 0.2) {
    z <- abs(z)
    z[z < zthresh] <- NA
    k <- c()
    while(!all(is.na(z))) {
        i <- which.max(z)
        k <- c(k, i)
        z[i] <- NA
        z[which(R[i,]^2 > rthresh)] <- NA
    }
    return(k)
}

One simulation example where there are 3 causal variants and they are known and 10% of the data is masked for imputation

R <- cor(X)
ss <- simulate_ss(X, af, 3, 20)
ss <- generate_missing(ss, 0.1)
ss1 <- imp(R, ss, which(ss$b != 0))
ss1

$ss
# A tibble: 1,075 × 13
   betahat     b    se   zhat     pval    af betahat2   se2  zhat2 betahatimp
     <dbl> <dbl> <dbl>  <dbl>    <dbl> <dbl>    <dbl> <dbl>  <dbl>      <dbl>
 1   3.17      0 0.841  3.77  1.62e- 4 0.155    3.17  0.841  3.77       3.17 
 2   0.844     0 2.53   0.333 7.39e- 1 0.989    0.844 2.53   0.333      0.844
 3  -0.185     0 1.30  -0.142 8.87e- 1 0.939   -0.185 1.30  -0.142     -0.185
 4   4.21      0 2.84   1.48  1.38e- 1 0.988    4.21  2.84   1.48       4.21 
 5  -8.30      0 2.40  -3.46  5.36e- 4 0.983   -8.30  2.40  -3.46      -8.30 
 6   4.15      0 0.699  5.94  2.93e- 9 0.241    4.15  0.699  5.94       4.15 
 7 -12.6       0 1.49  -8.46  2.72e-17 0.960  -12.6   1.49  -8.46     -12.6  
 8  -1.71      0 1.99  -0.859 3.90e- 1 0.977   -1.71  1.99  -0.859     -1.71 
 9   4.21      0 2.84   1.48  1.38e- 1 0.988    4.21  2.84   1.48       4.21 
10   0.370     0 1.29   0.286 7.75e- 1 0.938    0.370 1.29   0.286      0.370
# ℹ 1,065 more rows
# ℹ 3 more variables: seimp <dbl>, zimp <dbl>, pimp <dbl>

$b_adj
        b 
0.9762786 

$se_adj
       se 
0.6521467 

$b_cor
[1] 0.9925273

$se_cor
[1] 0.9928014

$n_ind
[1] 3

Show the performance of the imputation at the missing values

ggplot(ss1$ss, aes(x=betahatimp, y=betahat)) + geom_point(aes(colour=is.na(betahat2)))

Now we can run a simulation to test the performance of the imputation across different scenarios

sim_all <- function(X, R, frac_missing, ncause, sigmag, zthresh, rthresh, seed=1234) {
    ss <- simulate_ss(X, af, ncause, sigmag, seed)
    ss <- generate_missing(ss, frac_missing)
    if(zthresh == -1) {
        index <- which(ss$b != 0)
    } else if(zthresh == -2) {
        index <- which.max(abs(ss$zhat))
    } else {
        index <- clump(ss$zhat2, R, qnorm(zthresh, low=FALSE), rthresh)
    }
    ss <- imp(R, ss, index)
    return(ss)
}

params <- expand.grid(
    frac_missing = c(0.1, 0.3),
    ncause = c(1, 2, 3),
    sigmag = c(10, 20),
    zthresh = c(-1, -2, 1e-5, 1e-8),
    rthresh = c(0.01),
    sim = 1:20
)
dim(params)

res <- lapply(1:nrow(params), \(i) {
    message(i)
    p <- params[i,]
    r <- tryCatch(sim_all(X, R, p$frac_missing, p$ncause, p$sigmag, p$zthresh, p$rthresh, seed=i), error=function(e) {return(NULL)})
    tibble(
        frac_missing = p$frac_missing,
        ncause = p$ncause,
        sigmag = p$sigmag,
        zthresh = p$zthresh,
        rthresh = p$rthresh,
        sim = p$sim,
        b_cor = r$b_cor,
        se_cor = r$se_cor,
        b_adj = r$b_adj,
        se_adj = r$se_adj
    )
}) %>% bind_rows()
save(res, file="simres.rdata")

Look at simulation results

load("simres.rdata")
res %>% mutate(
    zthresh = case_when(zthresh == -1 ~ "Known causal variants",
                        zthresh == -2 ~ "Top hit",
                        TRUE ~ paste("Clump at", zthresh))
) %>%
    ggplot(aes(x=as.factor(frac_missing), y=b_cor)) + 
        geom_boxplot(aes(fill=as.factor(zthresh))) + 
        facet_grid(ncause ~ sigmag, labeller=label_both) + 
        labs(y="Correlation between known and imputed effect sizes", x="Fraction of missing values", fill="Index variant method")

Warning: Removed 136 rows containing non-finite outside the scale range
(`stat_boxplot()`).

Summary

Using a single tophit to generate the sumstats seems to be fine even when there are multiple causal variants
Clumping with strict rsq threshold and relaxed p-value threshold to obtain index SNPs seems to be most effective
The performance doesn’t change drastically based on fraction of missing SNPs
Previous iterations showed that if doing clumping, using rsq thresh 0.2 led to major problems, so having index SNPs be in relative linkage equilibrium seems important

sessionInfo()

R version 4.4.1 (2024-06-14)
Platform: aarch64-apple-darwin20
Running under: macOS Sonoma 14.6.1

Matrix products: default
BLAS:   /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/lib/libRblas.0.dylib 
LAPACK: /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/lib/libRlapack.dylib;  LAPACK version 3.12.0

locale:
[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8

time zone: Europe/London
tzcode source: internal

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
[1] ggplot2_3.5.1     MASS_7.3-60.2     data.table_1.15.4 dplyr_1.1.4      

loaded via a namespace (and not attached):
 [1] vctrs_0.6.5       cli_3.6.2         knitr_1.47        rlang_1.1.3      
 [5] xfun_0.44         generics_0.1.3    jsonlite_1.8.8    labeling_0.4.3   
 [9] glue_1.7.0        colorspace_2.1-0  htmltools_0.5.8.1 scales_1.3.0     
[13] fansi_1.0.6       rmarkdown_2.27    grid_4.4.1        munsell_0.5.1    
[17] evaluate_0.23     tibble_3.2.1      fastmap_1.2.0     yaml_2.3.8       
[21] lifecycle_1.0.4   compiler_4.4.1    htmlwidgets_1.6.4 pkgconfig_2.0.3  
[25] farver_2.1.2      digest_0.6.35     R6_2.5.1          tidyselect_1.2.1 
[29] utf8_1.2.4        pillar_1.9.0      magrittr_2.0.3    withr_3.0.0      
[33] gtable_0.3.5      tools_4.4.1