library(glue)
library(data.table)
library(dplyr)
library(ggplot2)
Background
How sensitive is DRM to incomplete LD in more realistic scenarios? Take a region of 1000 genomes data and choose a SNP at random to have an additive effect of some determined magnitude. We’d now determine how many SNPs in the region (i.e. with incomplete LD with the causal SNP) have evidence of vQTL using DRM or SVLM. How does genetic effect size and sample size relate to minimum p-value and number of positions with FDR < 0.05.
<- function(g, y)
test_drm
{<- tapply(y, g, median, na.rm=T)
y.i <- abs(y - y.i[g+1])
z.ij summary(lm(z.ij ~ g))$coef %>%
as_tibble() %>%
slice(2) %>%
mutate(method="drm")
}
<- function(g, y)
test_svlm
{<- residuals(lm(y ~ g))^2
yres summary(lm(yres ~ g))$coefficients %>%
as_tibble() %>%
slice(n=2) %>%
mutate(method="svlm")
}
Get some plink data - e.g. just 1-2k SNPs from 1000 genomes European samples
/Users/gh13047/Downloads/plink_mac_20230116/plink \
--bfile /Users/gh13047/repo/opengwas-api-internal/opengwas-api/app/ld_files/EUR \
--chr 22 \
--recode A \
--out eur22 \
--from-bp 0 --to-bp 17000000
PLINK v1.90b7 64-bit (16 Jan 2023) www.cog-genomics.org/plink/1.9/
(C) 2005-2023 Shaun Purcell, Christopher Chang GNU General Public License v3
Logging to eur22.log.
Options in effect:
--bfile /Users/gh13047/repo/opengwas-api-internal/opengwas-api/app/ld_files/EUR
--chr 22
--from-bp 0
--out eur22
--recode A
--to-bp 17000000
16384 MB RAM detected; reserving 8192 MB for main workspace.
1734 out of 8550156 variants loaded from .bim file.
503 people (0 males, 0 females, 503 ambiguous) loaded from .fam.
Ambiguous sex IDs written to eur22.nosex .
Using 1 thread (no multithreaded calculations invoked).
Before main variant filters, 503 founders and 0 nonfounders present.
Calculating allele frequencies... 0%1%2%3%4%5%6%7%8%9%10%11%12%13%14%15%16%17%18%19%20%21%22%23%24%25%26%27%28%29%30%31%32%33%34%35%36%37%38%39%40%41%42%43%44%45%46%47%48%49%50%51%52%53%54%55%56%57%58%59%60%61%62%63%64%65%66%67%68%69%70%71%72%73%74%75%76%77%78%79%80%81%82%83%84%85%86%87%88%89%90%91%92%93%94%95%96%97%98%99% done.
Total genotyping rate is in [0.9999995, 1).
1734 variants and 503 people pass filters and QC.
Note: No phenotypes present.
--recode A to eur22.raw ... 0%1%2%3%4%5%6%7%8%9%10%11%12%13%14%15%16%17%18%19%20%21%22%23%24%25%26%27%28%29%30%31%32%33%34%35%36%37%38%39%40%41%42%43%44%45%46%47%48%49%50%51%52%53%54%55%56%57%58%59%60%61%62%63%64%65%66%67%68%69%70%71%72%73%74%75%76%77%78%79%80%81%82%83%84%85%86%87%88%89%90%91%92%93%94%95%96%97%98%99%done.
Read in genotype data
<- fread("eur22.raw")
geno <- geno[,1:6]
fam <- as.matrix(geno[,-c(1:6)])
geno 1:10,1:10] geno[
rs587616822_G rs62224609_C rs4965031_A rs587646183_C rs139918843_C
[1,] 0 0 1 0 0
[2,] 0 1 0 0 0
[3,] 0 1 0 0 0
[4,] 0 0 0 0 0
[5,] 0 0 0 0 0
[6,] 0 1 0 0 0
[7,] 0 0 1 0 0
[8,] 0 0 0 0 0
[9,] 0 1 0 0 0
[10,] 0 0 1 0 0
rs376238049_T rs200777521_A rs587701155_A rs80167676_T rs915675_A
[1,] 0 0 0 0 1
[2,] 1 0 0 0 1
[3,] 1 0 0 0 1
[4,] 0 0 0 0 0
[5,] 0 0 0 0 0
[6,] 1 0 0 0 1
[7,] 0 0 1 0 1
[8,] 0 0 0 0 1
[9,] 0 1 0 1 0
[10,] 0 0 0 0 0
dim(geno)
[1] 503 1734
<- cor(geno)
cormat dim(cormat)
[1] 1734 1734
<- function(y, x)
fast_assoc
{<- is.finite(y) & is.finite(x)
index <- sum(index)
n <- y[index]
y <- x[index]
x <- var(x)
vx <- stats::cov(y, x)/vx
bhat <- mean(y) - bhat * mean(x)
ahat <- (bhat * vx)^2/(vx * var(y))
rsq <- rsq * (n - 2)/(1 - rsq)
fval <- sqrt(fval)
tval <- abs(bhat/tval)
se <- stats::pf(fval, 1, n - 2, lower.tail = FALSE)
p return(list(ahat = ahat, bhat = bhat, se = se, fval = fval, pval = p, n = n))
}
<- function(g, y)
test_drm
{<- tapply(y, g, median, na.rm=T)
y.i <- abs(y - y.i[g+1])
z.ij fast_assoc(z.ij, g) %>% as_tibble()
}
<- function(rsq, geno, n, cormat)
sim_full
{<- sample(1:ncol(geno), 1)
i <- as.numeric(scale(geno[,i])) * sqrt(rsq) + rnorm(nrow(geno), 0, sqrt(1-rsq))
y <- lapply(1:ncol(geno), function(i)
res
{test_drm(geno[1:n,i], y[1:n])
%>% bind_rows() %>% mutate(snp=1:n(), ldrsq=cormat[i,]^2)
}) return(res)
%>%
res mutate(fdr = p.adjust(pval, "fdr")) %>%
summarise(
vqtl = which.min(pval),
minp = min(pval, na.rm=T),
nfdr = sum(fdr < 0.05, na.rm=T),
vqtl_ldrsq = ldrsq[vqtl]
%>%
) mutate(rsq = rsq, n = n, qtl=i, af=sum(geno[,i])/(2*nrow(geno))) %>%
ungroup()
}set.seed(1234)
<- sim_full(0.5, geno, 500, cormat)
o %>% ggplot(., aes(x=ldrsq, y = -log10(pval))) +
o geom_point() +
geom_smooth()
`geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
Determining spurious effects
Test DRM on each SNP in region for varying sample sizes and additive variance explained in the region
Simulation here: https://github.com/explodecomputer/mz-gwas/blob/main/scripts/drm_sims.r
load(url("https://github.com/explodecomputer/mz-gwas/raw/main/scripts/drm_sims.rdata"))
ggplot(res %>% filter(minp < 5e-8), aes(x=as.factor(rsq), y=nfdr)) +
geom_boxplot(aes(fill=as.factor(n)))
ggplot(res %>% filter(minp < 5e-8), aes(x=as.factor(rsq), y=-log10(minp))) +
geom_boxplot(aes(fill=as.factor(n)))
ggplot(res %>% filter(minp < 5e-10), aes(x=af, y=vqtl_ldrsq)) +
geom_point()
Summary
- SVLM and DGLM suffer equally from the issue
- The problem is likely a function of F statistic and LD - i.e. as sample sizes get larger, smaller effects additive become more relevant
- With n=500 additive effect of rsq = 0.1 is enough to generate a spurious significant vQTL
unlink("eur22*")
sessionInfo()
R version 4.2.3 Patched (2023-03-15 r84020)
Platform: aarch64-apple-darwin20 (64-bit)
Running under: macOS Monterey 12.6.2
Matrix products: default
BLAS: /Library/Frameworks/R.framework/Versions/4.2-arm64/Resources/lib/libRblas.0.dylib
LAPACK: /Library/Frameworks/R.framework/Versions/4.2-arm64/Resources/lib/libRlapack.dylib
locale:
[1] en_GB.UTF-8/en_GB.UTF-8/en_GB.UTF-8/C/en_GB.UTF-8/en_GB.UTF-8
attached base packages:
[1] stats graphics grDevices utils datasets methods base
other attached packages:
[1] ggplot2_3.4.0 dplyr_1.0.10 data.table_1.14.2 glue_1.6.2
loaded via a namespace (and not attached):
[1] pillar_1.8.1 compiler_4.2.3 tools_4.2.3 digest_0.6.31
[5] lattice_0.20-45 nlme_3.1-162 jsonlite_1.8.4 evaluate_0.19
[9] lifecycle_1.0.3 tibble_3.1.8 gtable_0.3.1 mgcv_1.8-42
[13] pkgconfig_2.0.3 rlang_1.0.6 Matrix_1.5-3 cli_3.5.0
[17] DBI_1.1.3 yaml_2.3.6 xfun_0.36 fastmap_1.1.0
[21] withr_2.5.0 stringr_1.5.0 knitr_1.41 generics_0.1.3
[25] vctrs_0.5.1 htmlwidgets_1.5.4 grid_4.2.3 tidyselect_1.2.0
[29] R6_2.5.1 fansi_1.0.3 rmarkdown_2.16 farver_2.1.1
[33] magrittr_2.0.3 scales_1.2.1 htmltools_0.5.4 splines_4.2.3
[37] assertthat_0.2.1 colorspace_2.0-3 labeling_0.4.2 utf8_1.2.2
[41] stringi_1.7.8 munsell_0.5.0