Sandwich variance estimators to control LD leakage

Author

Gibran Hemani

Published

February 10, 2023

Background

See https://explodecomputer.github.io/lab-book/posts/2022-12-16-vqtl/ for inflation issues with vQTLs.

Can sandwich estimators help https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5943197/

library(dplyr)

Attaching package: 'dplyr'
The following objects are masked from 'package:stats':

    filter, lag
The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union
library(ggplot2)
library(sandwich)
set.seed(12345)

test_drm <- function(g, y, sandwich)
{
  nom <- c(b="Estimate", se="Std. Error", tval="t value", pval="Pr(>|t|)")
  y.i <- tapply(y, g, median, na.rm=T)
  z.ij <- abs(y - y.i[g+1])
  a <- lm(z.ij ~ g)
  m1 <- a %>%
    summary %>%
    coef %>%
    as_tibble() %>%
    # rename(all_of(nom)) %>%
    slice(2) %>%
    mutate(method="drm")
  if(sandwich)
  {
    o <- sandwich::vcovHC(a, type="HC")
    m2 <- m1
    m2$`Std. Error` <- sqrt(o[2,2])
    m2$`t value` <- m2$Estimate / m2$`Std. Error`
    m2$`Pr(>|t|)` <- pnorm(m2$`t value`, lower.tail=TRUE)
    m2$method <- "drm sandwich"
    return(m2)
  }
  return(m1)
}

correlated_binomial <- function (nid, p1, p2, rho, n = 2, round = TRUE, print = FALSE) 
{
    p <- p1
    q <- p2
    a <- function(rho, p, q) {
        rho * sqrt(p * q * (1 - p) * (1 - q)) + (1 - p) * (1 - q)
    }
    a.0 <- a(rho, p, q)
    prob <- c(`(0,0)` = a.0, `(1,0)` = 1 - q - a.0, `(0,1)` = 1 - 
        p - a.0, `(1,1)` = a.0 + p + q - 1)
    if (min(prob) < 0) {
        print(prob)
        stop("Error: a probability is negative.")
    }
    n.sim <- nid
    u <- sample.int(4, n.sim * n, replace = TRUE, prob = prob)
    y <- floor((u - 1)/2)
    x <- 1 - u%%2
    x <- colSums(matrix(x, nrow = n))
    y <- colSums(matrix(y, nrow = n))
    if (round) {
        x <- round(x)
        y <- round(y)
    }
    if (print) {
        print(table(x, y))
        print(stats::cor(x, y))
    }
    return(cbind(x, y))
}

gendatp <- function(n, p1, p2, p3, r1)
{
    dat <- correlated_binomial(n, p1, p2, r1) %>% as_tibble()
    names(dat) <- c("y1", "y2")
    dat$y3 <- rbinom(n, 1, p3)
    return(dat)
}

run_simp <- function(param, i)
{
    set.seed(i*10)
    dat <- gendatp(param$n[i], param$p1[i], param$p2[i], param$p3[i], param$r1[i])
    x <- dat$y1 + rnorm(nrow(dat), sd=sd(dat$y1)/4)
    mod1 <- lm(x ~ y2 + y3, dat)
    mod2 <- lm(x ~ y2 + y3 + y2*y3, dat)
    amod <- anova(mod1, mod2)
    param$F[i] <- amod$P[2]
    o1 <- test_drm(dat$y1, x, param$sandwich[i])
    o2 <- test_drm(dat$y2, x, param$sandwich[i])
    o3 <- test_drm(dat$y3, x, param$sandwich[i])
    param$drm1[i] <- o1$`Pr(>|t|)`
    param$drm2[i] <- o2$`Pr(>|t|)`
    param$drm3[i] <- o3$`Pr(>|t|)`
    return(param[i,])
}

param <- expand.grid(
    sandwich=c(T,F),
    p1=0.1,
    p2=0.1,
    p3=0.5,
    p4=0.1,
    n=1000,
    r1=seq(0, 1, by=0.2),
    sim=1:250,
    r2=NA,
    F=NA,
    drm1=NA,
    drm2=NA,
    drm3=NA
)

resp <- lapply(1:nrow(param), function(x) run_simp(param, x)) %>% bind_rows()
str(resp)
'data.frame':   3000 obs. of  13 variables:
 $ sandwich: logi  TRUE FALSE TRUE FALSE TRUE FALSE ...
 $ p1      : num  0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1 ...
 $ p2      : num  0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1 ...
 $ p3      : num  0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 ...
 $ p4      : num  0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1 ...
 $ n       : num  1000 1000 1000 1000 1000 1000 1000 1000 1000 1000 ...
 $ r1      : num  0 0 0.2 0.2 0.4 0.4 0.6 0.6 0.8 0.8 ...
 $ sim     : int  1 1 1 1 1 1 1 1 1 1 ...
 $ r2      : logi  NA NA NA NA NA NA ...
 $ F       : num  0.4136 0.4151 0.0649 0.1828 0.3566 ...
 $ drm1    : num  0.8477 0.376 0.9038 0.0149 0.3456 ...
 $ drm2    : num  0.988716 0.12422 1 0.000167 1 ...
 $ drm3    : num  0.45884 0.14746 0.00898 0.5823 0.18002 ...
 - attr(*, "out.attrs")=List of 2
  ..$ dim     : Named int [1:13] 2 1 1 1 1 1 6 250 1 1 ...
  .. ..- attr(*, "names")= chr [1:13] "sandwich" "p1" "p2" "p3" ...
  ..$ dimnames:List of 13
  .. ..$ sandwich: chr [1:2] "sandwich=TRUE" "sandwich=FALSE"
  .. ..$ p1      : chr "p1=0.1"
  .. ..$ p2      : chr "p2=0.1"
  .. ..$ p3      : chr "p3=0.5"
  .. ..$ p4      : chr "p4=0.1"
  .. ..$ n       : chr "n=1000"
  .. ..$ r1      : chr [1:6] "r1=0.0" "r1=0.2" "r1=0.4" "r1=0.6" ...
  .. ..$ sim     : chr [1:250] "sim=  1" "sim=  2" "sim=  3" "sim=  4" ...
  .. ..$ r2      : chr "r2=NA"
  .. ..$ F       : chr "F=NA"
  .. ..$ drm1    : chr "drm1=NA"
  .. ..$ drm2    : chr "drm2=NA"
  .. ..$ drm3    : chr "drm3=NA"
ggplot(resp, aes(x=r1, y=-log10(drm2))) +
geom_boxplot(aes(fill=as.factor(r1))) +
scale_fill_brewer(type="seq") +
labs(y="DRM -log10 p", x="LD between tagging\nvariant and causal variant", fill="") +
facet_grid(. ~ sandwich)

Power

run_simp2 <- function(param, i)
{
    set.seed(i*10)
    dat <- gendatp(param$n[i], param$p1[i], param$p2[i], param$p3[i], param$r1[i])
    x <- dat$y1 + dat$y2 * dat$y3 * param$b[i] + rnorm(nrow(dat), sd=sd(dat$y1)/4)
    mod1 <- lm(x ~ y2 + y3, dat)
    mod2 <- lm(x ~ y2 + y3 + y2*y3, dat)
    amod <- anova(mod1, mod2)
    param$F[i] <- amod$P[2]
    o1 <- test_drm(dat$y1, x, param$sandwich[i])
    o2 <- test_drm(dat$y2, x, param$sandwich[i])
    o3 <- test_drm(dat$y3, x, param$sandwich[i])
    param$drm1[i] <- o1$`Pr(>|t|)`
    param$drm2[i] <- o2$`Pr(>|t|)`
    param$drm3[i] <- o3$`Pr(>|t|)`
    return(param[i,])
}

param <- expand.grid(
    sandwich=c(T,F),
    b=c(0, 0.1, 1),
    p1=0.1,
    p2=0.1,
    p3=0.5,
    p4=0.1,
    n=1000,
    r1=seq(0, 1, by=0.2),
    sim=1:250,
    r2=NA,
    F=NA,
    drm1=NA,
    drm2=NA,
    drm3=NA
)

resp2 <- lapply(1:nrow(param), function(x) run_simp2(param, x)) %>% bind_rows()
str(resp)
'data.frame':   3000 obs. of  13 variables:
 $ sandwich: logi  TRUE FALSE TRUE FALSE TRUE FALSE ...
 $ p1      : num  0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1 ...
 $ p2      : num  0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1 ...
 $ p3      : num  0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 ...
 $ p4      : num  0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1 ...
 $ n       : num  1000 1000 1000 1000 1000 1000 1000 1000 1000 1000 ...
 $ r1      : num  0 0 0.2 0.2 0.4 0.4 0.6 0.6 0.8 0.8 ...
 $ sim     : int  1 1 1 1 1 1 1 1 1 1 ...
 $ r2      : logi  NA NA NA NA NA NA ...
 $ F       : num  0.4136 0.4151 0.0649 0.1828 0.3566 ...
 $ drm1    : num  0.8477 0.376 0.9038 0.0149 0.3456 ...
 $ drm2    : num  0.988716 0.12422 1 0.000167 1 ...
 $ drm3    : num  0.45884 0.14746 0.00898 0.5823 0.18002 ...
 - attr(*, "out.attrs")=List of 2
  ..$ dim     : Named int [1:13] 2 1 1 1 1 1 6 250 1 1 ...
  .. ..- attr(*, "names")= chr [1:13] "sandwich" "p1" "p2" "p3" ...
  ..$ dimnames:List of 13
  .. ..$ sandwich: chr [1:2] "sandwich=TRUE" "sandwich=FALSE"
  .. ..$ p1      : chr "p1=0.1"
  .. ..$ p2      : chr "p2=0.1"
  .. ..$ p3      : chr "p3=0.5"
  .. ..$ p4      : chr "p4=0.1"
  .. ..$ n       : chr "n=1000"
  .. ..$ r1      : chr [1:6] "r1=0.0" "r1=0.2" "r1=0.4" "r1=0.6" ...
  .. ..$ sim     : chr [1:250] "sim=  1" "sim=  2" "sim=  3" "sim=  4" ...
  .. ..$ r2      : chr "r2=NA"
  .. ..$ F       : chr "F=NA"
  .. ..$ drm1    : chr "drm1=NA"
  .. ..$ drm2    : chr "drm2=NA"
  .. ..$ drm3    : chr "drm3=NA"
ggplot(resp2, aes(x=r1, y=-log10(drm2))) +
geom_boxplot(aes(fill=as.factor(r1))) +
scale_fill_brewer(type="seq") +
labs(y="DRM -log10 p", x="LD between tagging\nvariant and causal variant", fill="") +
facet_grid(b ~ sandwich)
Warning: Removed 2 rows containing non-finite values (`stat_boxplot()`).


sessionInfo()
R version 4.2.1 Patched (2022-09-06 r82817)
Platform: aarch64-apple-darwin20 (64-bit)
Running under: macOS Monterey 12.6.2

Matrix products: default
BLAS:   /Library/Frameworks/R.framework/Versions/4.2-arm64/Resources/lib/libRblas.0.dylib
LAPACK: /Library/Frameworks/R.framework/Versions/4.2-arm64/Resources/lib/libRlapack.dylib

locale:
[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
[1] sandwich_3.0-2 ggplot2_3.4.0  dplyr_1.0.10  

loaded via a namespace (and not attached):
 [1] RColorBrewer_1.1-3 pillar_1.8.1       compiler_4.2.1     tools_4.2.1       
 [5] digest_0.6.31      jsonlite_1.8.4     evaluate_0.19      lifecycle_1.0.3   
 [9] tibble_3.1.8       gtable_0.3.1       lattice_0.20-45    pkgconfig_2.0.3   
[13] rlang_1.0.6        DBI_1.1.3          cli_3.5.0          yaml_2.3.6        
[17] xfun_0.36          fastmap_1.1.0      withr_2.5.0        stringr_1.5.0     
[21] knitr_1.41         generics_0.1.3     vctrs_0.5.1        htmlwidgets_1.5.4 
[25] grid_4.2.1         tidyselect_1.2.0   glue_1.6.2         R6_2.5.1          
[29] fansi_1.0.3        rmarkdown_2.16     farver_2.1.1       magrittr_2.0.3    
[33] scales_1.2.1       htmltools_0.5.4    assertthat_0.2.1   colorspace_2.0-3  
[37] labeling_0.4.2     utf8_1.2.2         stringi_1.7.8      munsell_0.5.0     
[41] zoo_1.8-11