Summary stats and imputation

Author

Gibran Hemani

Published

September 21, 2024

Background

Summary statistic imputation is very slow. This very fast approximation is based on simulating summary statistics for a region, given knowledge of the causal variants and an LD matrix.
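As a rough sketch of the core identity (a toy illustration of my own; the full function below adds conditional estimation and re-scaling): if b holds the causal effects, R the LD matrix, and D the diagonal matrix of genotype standard deviations, then the expected marginal GWAS effects are Di %*% R %*% D %*% b.

# Toy example of the identity (illustrative values, not from the analysis below)
R_toy <- matrix(c(1, 0.5, 0.5, 1), 2, 2)         # LD between two SNPs
af_toy <- c(0.3, 0.4)                            # allele frequencies
b_toy <- c(0.2, 0)                               # SNP 1 causal, SNP 2 not
D_toy <- diag(sqrt(2 * af_toy * (1 - af_toy)))   # genotype SDs
Di_toy <- diag(1 / diag(D_toy))
as.numeric(Di_toy %*% R_toy %*% D_toy %*% b_toy) # SNP 2 picks up a tagged effect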

library(dplyr)

Attaching package: 'dplyr'
The following objects are masked from 'package:stats':

    filter, lag
The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union
library(data.table)

Attaching package: 'data.table'
The following objects are masked from 'package:dplyr':

    between, first, last
library(MASS)

Attaching package: 'MASS'
The following object is masked from 'package:dplyr':

    select
library(ggplot2)

# Alternatives to matrix inversion (ignore these; just use MASS::ginv)
solve2 <- function(A) {
    # Invert via eigen decomposition (assumes A is symmetric)
    eig <- eigen(A)
    eig$values <- 1 / eig$values
    return(eig$vectors %*% diag(eig$values) %*% t(eig$vectors))
}

solve3 <- function(A, lambda=1e-6) {
    # Ridge-regularised inverse for near-singular matrices
    solve(A + diag(nrow(A)) * lambda)
}
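A quick sanity check of these alternatives against MASS::ginv on a small positive definite matrix (illustrative only; note that solve2 assumes a symmetric input):

set.seed(1)
A <- crossprod(matrix(rnorm(9), 3, 3))  # symmetric positive definite
max(abs(solve2(A) - MASS::ginv(A)))     # should be ~0
max(abs(solve3(A) %*% A - diag(3)))     # ridge version is only approximate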

Get some data for testing

system("plink2 --bfile ~/repo/opengwas-api-internal/opengwas-api/app/ld_files/EUR --chr 1 --from-mb 10 --to-mb 10.4 --recode A --out temp")
G <- fread("temp.raw", header = TRUE)
G <- G[,-c(1:6)] %>% as.matrix
X <- G
nsnp <- ncol(X)
for(i in 1:nsnp) {
    X[,i] <- X[,i] - mean(X[,i])
}
af <- colMeans(G) / 2
save(X, af, file="1kg_region.rdata")

Load the data

load(url("https://github.com/explodecomputer/lab-book/raw/refs/heads/main/posts/2024-09-18-conditional-summary-stats/1kg_region.rdata"))
#' Basic imputation function
#' 
#' @param R The correlation (LD) matrix - must be complete for the set of SNPs that need to be imputed
#' @param ss A data frame with columns: betahat2 = effect estimates in the same order as R, with NA for variants that need to be imputed; se2 = standard errors, with NA wherever betahat2 is NA; af = allele frequencies (no missing values allowed, so use a reference panel if any are missing)
#' @param index The positions of the SNPs that are causal and will be used to generate the simulated summary statistics. This can just be the top hit.
#' 
#' @return A list with the following elements:
#' - ss: The input data frame with the imputed values added
#' - b_adj: The adjustment factor for the effect sizes
#' - se_adj: The adjustment factor for the standard errors
#' - b_cor: The correlation between the true and imputed effect sizes - critical for evaluating the performance of the imputation; it should be close to 1 (e.g. > 0.7 would be a reasonable threshold)
#' - se_cor: The correlation between the true and imputed standard errors
#' - n_ind: The number of index SNPs used
imp <- function(R, ss, index) {
    b <- ss$betahat2
    se <- ss$se2
    af <- ss$af
    nsnp <- length(b)
    stopifnot(ncol(R) == nsnp)
    stopifnot(nrow(R) == nsnp)
    stopifnot(length(af) == nsnp)
    stopifnot(length(se) == nsnp)
    stopifnot(all(index %in% 1:nsnp))
    stopifnot(length(index) < nsnp)
    stopifnot(all(af > 0 & af < 1))
    stopifnot(all(!is.na(af)))
    stopifnot(all(se > 0, na.rm=TRUE))
    if(all(!is.na(b))) {
        message("No missing values in b, imputation not required")
        b_cor <- 1
        se_cor <- 1
        mod1 <- 1
        mod2 <- 1
    } else {
        # Calculate the diagonal matrix of variances and the inverse
        D <- diag(sqrt(2 * af * (1 - af)))
        Di <- diag(1 / diag(D))

        # Get the conditional estimates of the index SNP effects
        if(length(index) == 1) {
            bhat2 <- b[index]
        } else {
            bhat2 <- D[index,index] %*% MASS::ginv(R[index,index]) %*% Di[index,index] %*% b[index]
        }
        b2 <- rep(0, nsnp)
        b2[index] <- bhat2

        # Get the simulated effect sizes
        betahat_sim <- as.numeric(Di %*% R %*% D %*% b2)

        # Initialise the SE - this doesn't account for var(y) or sample size, but those are constants that can be obtained from regression re-scaling
        sehat <- sqrt(diag(Di))

        # Re-scale effect sizes and standard errors
        # vb <- var(b, na.rm=TRUE)
        # vse <- var(se, na.rm=TRUE)
        # mod1 <- cov(b, betahat_sim, use="pair") / vb
        mod1 <- lm(betahat_sim ~ b)$coef[2]
        # mod2 <- cov(se, sehat, use="pair") / vse
        mod2 <- lm(sehat ~ se)$coef[2]

        # Performance metrics
        # b_cor = mod1 * sqrt(vb) / sd(betahat_sim, na.rm=TRUE)
        b_cor <- cor(b, betahat_sim, use="pair")
        # se_cor = mod2 * sqrt(vse) / sd(sehat, na.rm=TRUE)
        se_cor <- cor(se, sehat, use="pair")

        # Re-scale
        betahat_sim <- betahat_sim / mod1
        sehat <- sehat / mod2

        # Fill in missing values
        b[is.na(b)] <- betahat_sim[is.na(b)]
        se[is.na(se)] <- sehat[is.na(se)]

        stopifnot(all(!is.na(b)))
        stopifnot(all(!is.na(se)))
    }

    ss$betahatimp <- b
    ss$seimp <- se
    ss$zimp <- b / se
    ss$pimp <- 2 * pnorm(-abs(ss$zimp))

    # Output
    out <- list(
        ss = ss,
        b_adj = mod1,
        se_adj = mod2,
        b_cor = b_cor,
        se_cor = se_cor,
        n_ind = length(index)
    )
    return(out)
}
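The index-SNP step above converts marginal estimates into joint (conditional) estimates on the standardised scale before simulating. A minimal standalone illustration with toy numbers of my own:

# Two correlated index SNPs: much of SNP 2's marginal effect tags SNP 1,
# so its joint effect is much smaller than its marginal effect
R_ix <- matrix(c(1, 0.6, 0.6, 1), 2, 2)
af_ix <- c(0.2, 0.5)
bm <- c(0.30, 0.25)  # marginal per-allele estimates
D_ix <- diag(sqrt(2 * af_ix * (1 - af_ix)))
Di_ix <- diag(1 / diag(D_ix))
D_ix %*% MASS::ginv(R_ix) %*% Di_ix %*% bm  # joint per-allele effects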

Run some simulations to test the performance across different scenarios

simulate_ss <- function(X, af, ncause, sigmag, seed=1234) {
    set.seed(seed)
    nsnp <- length(af)
    nid <- nrow(X)

    # Sample causal effects for ncause randomly chosen SNPs
    b <- rep(0, nsnp)
    b[sample(1:nsnp, ncause)] <- rnorm(ncause, sd=sigmag)

    # Generate the phenotype
    e <- rnorm(nid)
    y <- X %*% b + e

    # Marginal effect estimates, standard errors, z-scores and p-values
    betahat <- sapply(1:nsnp, \(i) cov(X[,i], y) / var(X[,i]))
    se <- sapply(1:nsnp, \(i) sqrt(var(y) / (nid * var(X[,i]))))
    zhat <- betahat / se
    pval <- 2 * pnorm(-abs(zhat))

    return(tibble(betahat, b, se, zhat, pval, af))
}

generate_missing <- function(ss, frac) {
    # Mask a random fraction of SNPs to mimic missing summary statistics
    ss <- ss %>% mutate(
        betahat2 = ifelse(runif(n()) < frac, NA, betahat),
        se2 = ifelse(is.na(betahat2), NA, se),
        zhat2 = ifelse(is.na(betahat2), NA, zhat))
    return(ss)
}

clump <- function(z, R, zthresh = qnorm(1e-5, lower.tail=FALSE), rthresh = 0.2) {
    # Greedy clumping: repeatedly take the SNP with the largest |z| and
    # remove everything correlated with it at r^2 > rthresh
    z <- abs(z)
    z[z < zthresh] <- NA
    k <- c()
    while(!all(is.na(z))) {
        i <- which.max(z)
        k <- c(k, i)
        z[i] <- NA
        z[which(R[i,]^2 > rthresh)] <- NA
    }
    return(k)
}
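A toy illustration of the greedy selection (my own numbers): SNPs 1 and 2 are strong proxies, so only the stronger of the pair survives alongside the independent SNP 4, while SNP 3 falls below the z threshold.

z_toy <- c(6, 5.8, 2, 7)
R_toy2 <- diag(4)
R_toy2[1,2] <- R_toy2[2,1] <- 0.9  # SNPs 1 and 2 in strong LD
clump(z_toy, R_toy2)               # returns 4 then 1; SNP 2 removed as a proxy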

One simulation example: there are 3 causal variants, they are known, and 10% of the data is masked for imputation

R <- cor(X)
ss <- simulate_ss(X, af, 3, 20)
ss <- generate_missing(ss, 0.1)
ss1 <- imp(R, ss, which(ss$b != 0))
ss1
$ss
# A tibble: 1,075 × 13
   betahat     b    se   zhat     pval    af betahat2   se2  zhat2 betahatimp
     <dbl> <dbl> <dbl>  <dbl>    <dbl> <dbl>    <dbl> <dbl>  <dbl>      <dbl>
 1   3.17      0 0.841  3.77  1.62e- 4 0.155    3.17  0.841  3.77       3.17 
 2   0.844     0 2.53   0.333 7.39e- 1 0.989    0.844 2.53   0.333      0.844
 3  -0.185     0 1.30  -0.142 8.87e- 1 0.939   -0.185 1.30  -0.142     -0.185
 4   4.21      0 2.84   1.48  1.38e- 1 0.988    4.21  2.84   1.48       4.21 
 5  -8.30      0 2.40  -3.46  5.36e- 4 0.983   -8.30  2.40  -3.46      -8.30 
 6   4.15      0 0.699  5.94  2.93e- 9 0.241    4.15  0.699  5.94       4.15 
 7 -12.6       0 1.49  -8.46  2.72e-17 0.960  -12.6   1.49  -8.46     -12.6  
 8  -1.71      0 1.99  -0.859 3.90e- 1 0.977   -1.71  1.99  -0.859     -1.71 
 9   4.21      0 2.84   1.48  1.38e- 1 0.988    4.21  2.84   1.48       4.21 
10   0.370     0 1.29   0.286 7.75e- 1 0.938    0.370 1.29   0.286      0.370
# ℹ 1,065 more rows
# ℹ 3 more variables: seimp <dbl>, zimp <dbl>, pimp <dbl>

$b_adj
        b 
0.9762786 

$se_adj
       se 
0.6521467 

$b_cor
[1] 0.9925273

$se_cor
[1] 0.9928014

$n_ind
[1] 3

Show the performance of the imputation at the missing values

ggplot(ss1$ss, aes(x=betahatimp, y=betahat)) + geom_point(aes(colour=is.na(betahat2)))

Now we can run a simulation to test the performance of the imputation across different scenarios

sim_all <- function(X, R, frac_missing, ncause, sigmag, zthresh, rthresh, seed=1234) {
    ss <- simulate_ss(X, af, ncause, sigmag, seed)
    ss <- generate_missing(ss, frac_missing)
    if(zthresh == -1) {
        index <- which(ss$b != 0)
    } else if(zthresh == -2) {
        index <- which.max(abs(ss$zhat))
    } else {
        index <- clump(ss$zhat2, R, qnorm(zthresh, low=FALSE), rthresh)
    }
    ss <- imp(R, ss, index)
    return(ss)
}

params <- expand.grid(
    frac_missing = c(0.1, 0.3),
    ncause = c(1, 2, 3),
    sigmag = c(10, 20),
    zthresh = c(-1, -2, 1e-5, 1e-8),
    rthresh = c(0.01),
    sim = 1:20
)
dim(params)
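[1] 960   6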

res <- lapply(1:nrow(params), \(i) {
    message(i)
    p <- params[i,]
    r <- tryCatch(sim_all(X, R, p$frac_missing, p$ncause, p$sigmag, p$zthresh, p$rthresh, seed=i), error=function(e) {return(NULL)})
    tibble(
        frac_missing = p$frac_missing,
        ncause = p$ncause,
        sigmag = p$sigmag,
        zthresh = p$zthresh,
        rthresh = p$rthresh,
        sim = p$sim,
        b_cor = r$b_cor,
        se_cor = r$se_cor,
        b_adj = r$b_adj,
        se_adj = r$se_adj
    )
}) %>% bind_rows()
save(res, file="simres.rdata")

Look at simulation results

load("simres.rdata")
res %>% mutate(
    zthresh = case_when(zthresh == -1 ~ "Known causal variants",
                        zthresh == -2 ~ "Top hit",
                        TRUE ~ paste("Clump at", zthresh))
) %>%
    ggplot(aes(x=as.factor(frac_missing), y=b_cor)) + 
        geom_boxplot(aes(fill=as.factor(zthresh))) + 
        facet_grid(ncause ~ sigmag, labeller=label_both) + 
        labs(y="Correlation between known and imputed effect sizes", x="Fraction of missing values", fill="Index variant method")
Warning: Removed 136 rows containing non-finite outside the scale range
(`stat_boxplot()`).

Summary

  • Using a single top hit to generate the sumstats seems to be fine even when there are multiple causal variants
  • Clumping with a strict rsq threshold and a relaxed p-value threshold to obtain the index SNPs seems to be most effective (see the sketch after this list)
  • The performance doesn’t change drastically with the fraction of missing SNPs
  • Previous iterations showed that clumping with an rsq threshold of 0.2 led to major problems, so having the index SNPs be in relative linkage equilibrium seems important
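For reference, the best-performing configuration above corresponds to a call along these lines (illustrative, using the functions and objects defined in this post):

# Relaxed p-value threshold (1e-5) with a strict rsq threshold (0.01)
index <- clump(ss$zhat2, R, zthresh = qnorm(1e-5, lower.tail = FALSE), rthresh = 0.01)
out <- imp(R, ss, index)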

sessionInfo()
R version 4.4.1 (2024-06-14)
Platform: aarch64-apple-darwin20
Running under: macOS Sonoma 14.6.1

Matrix products: default
BLAS:   /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/lib/libRblas.0.dylib 
LAPACK: /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/lib/libRlapack.dylib;  LAPACK version 3.12.0

locale:
[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8

time zone: Europe/London
tzcode source: internal

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
[1] ggplot2_3.5.1     MASS_7.3-60.2     data.table_1.15.4 dplyr_1.1.4      

loaded via a namespace (and not attached):
 [1] vctrs_0.6.5       cli_3.6.2         knitr_1.47        rlang_1.1.3      
 [5] xfun_0.44         generics_0.1.3    jsonlite_1.8.8    labeling_0.4.3   
 [9] glue_1.7.0        colorspace_2.1-0  htmltools_0.5.8.1 scales_1.3.0     
[13] fansi_1.0.6       rmarkdown_2.27    grid_4.4.1        munsell_0.5.1    
[17] evaluate_0.23     tibble_3.2.1      fastmap_1.2.0     yaml_2.3.8       
[21] lifecycle_1.0.4   compiler_4.4.1    htmlwidgets_1.6.4 pkgconfig_2.0.3  
[25] farver_2.1.2      digest_0.6.35     R6_2.5.1          tidyselect_1.2.1 
[29] utf8_1.2.4        pillar_1.9.0      magrittr_2.0.3    withr_3.0.0      
[33] gtable_0.3.5      tools_4.4.1