library(MASS)
## calculation of asymptotic covariance matrix for Odds ratio
### convert logit values to probabilities
expit <- function(x){exp(x)/(1+exp(x))}
## calculate asymptotic version of (X^t W X)
asymp_var_logistic <- function(n,G_prob,Gamma_0,Gamma_1){
N <- length(Gamma_0)
disease_probs <- expit(outer(Gamma_0,c(1,1,1)) + outer(Gamma_1,c(0,1,2)))
diag_weights <- disease_probs*(1-disease_probs)
a <- n*apply(G_prob*diag_weights,1,sum)
b <- n*apply(G_prob*diag_weights*matrix(rep(c(0,1,2),N),byrow=TRUE,nrow=N),1,sum)
c <- b
d <- n*apply(G_prob*diag_weights*matrix(rep(c(0,1,4),N),byrow=TRUE,nrow=N),1,sum)
## invert matrix and take element for Gamma1
return(a/(a*d-b*c))
}
asymp_var_linear <- function(n,G_prob,sigma=1){
N <- nrow(G_prob)
a <- n*apply(G_prob,1,sum)
b <- n*apply(G_prob*matrix(rep(c(0,1,2),N),byrow=TRUE,nrow=N),1,sum)
c <- b
d <- n*apply(G_prob*matrix(rep(c(0,1,4),N),byrow=TRUE,nrow=N),1,sum)
## invert matrix and take element for Gamma1
return(a/(a*d-b*c)*sigma^2)
}
asymp_cov_linear_linear <- function(n_overlap,n_gx,n_gy,G_prob,sigma_x=1,sigma_y=1,cor_xy=.5){
N <- nrow(G_prob)
cov_xy <- cor_xy*sigma_x*sigma_y
a <- apply(G_prob,1,sum)
b <- apply(G_prob*matrix(rep(c(0,1,2),N),byrow=TRUE,nrow=N),1,sum)
c <- b
d <- apply(G_prob*matrix(rep(c(0,1,4),N),byrow=TRUE,nrow=N),1,sum)
## invert matrix and take element for Gamma1
return(n_overlap*(a/(a*d-b*c))*cov_xy/(n_gx*n_gy))
}
asymp_cov_linear_logistic <- function(n_overlap,n_gx,n_gy,G_prob,cor_xy=0,sigma=1,Gamma_0,Gamma_1,prev=0.01){
N <- length(Gamma_0)
cov_xy <- cor_xy*sigma*sqrt(prev*(1-prev))
disease_probs <- expit(outer(Gamma_0,c(1,1,1)) + outer(Gamma_1,c(0,1,2)))
diag_weights <- disease_probs*(1-disease_probs)
a <- apply(G_prob*diag_weights,1,sum)
b <- apply(G_prob*diag_weights*matrix(rep(c(0,1,2),N),byrow=TRUE,nrow=N),1,sum)
c <- b
d <- apply(G_prob*diag_weights*matrix(rep(c(0,1,4),N),byrow=TRUE,nrow=N),1,sum)
## invert matrix and take element for Gamma1
return(n_overlap*(a/(a*d-b*c))*cov_xy/(n_gx*n_gy))
}
################################################## Linear Logistic case ###################################################
## parameters
N <- 100000 # number of snps to simulate (assumed in LD and HWE)
prev <- 0.1 # disease prevalence
gamma1 <- rnorm(N,mean=0,sd=0.3) # true instrument strengths
beta <- 0.1 # causal effect
OR <- exp(beta*gamma1) ## true odds ratio (G-Y association)
MAF <- seq(from=.01,to=.5,length=N) # minor allele frequency
n_G_X <- 100000 # sample size G-X (# individuals genotyped on G with X phenotype)
n_G_Y <- 200000 # sample size G-Y (# individuals genotyped on G with Y phenotype)
overlap <- 100000 # shared sample size
sigma_x=1 #variation in trait
cor_xy=0.1 # correlation between x and y on same person
## assuming HWE
G_prob <- cbind(MAF^2,2*MAF*(1-MAF),(1-MAF)^2)
## find Gamma0 for logistic model so prevalence is 0.1
Gamma1 <- log(OR) # G, Y association on log-odds scale
myfunc <- function(MAF,Gamma0, Gamma1, prev){
MAF^2*expit(Gamma0+Gamma1*2)+2*MAF*(1-MAF)*expit(Gamma0+Gamma1)+(1-MAF)^2*expit(Gamma0)-prev
}
Gamma0 <- numeric(N)
for(i in 1:N) Gamma0[i] <- uniroot(myfunc,Gamma1=Gamma1[i],MAF=MAF[i],prev=prev,lower=-10, upper=10)$root
var_Gamma_y <- asymp_var_logistic(n_G_Y,G_prob,Gamma0,Gamma1)
var_gamma_x <- asymp_var_linear(n_G_X,G_prob,sigma=sigma_x)
cov_gamma_x_Gamma_y <- asymp_cov_linear_logistic(overlap, n_G_X,n_G_Y,G_prob,cor_xy=cor_xy,sigma=sigma_x,Gamma_0=Gamma0,Gamma_1=Gamma1,prev=prev)
## now what if G-X and G-Y GWASs share sample overlap (simulate joint association)
## simulate summary statistics
## create array, first 2 dimensions represent individual covariance matrices, 3rd dimension indexing SNPs
library(MASS)
cov_array <- array(dim=c(2,2,length(var_gamma_x)))
cov_array[1,1,] <- var_gamma_x
cov_array[2,1,] <- cov_gamma_x_Gamma_y
cov_array[1,2,] <- cov_array[2,1,]
cov_array[2,2,] <- var_Gamma_y
summary_stats <- apply(cov_array,3,function(x){mvrnorm(n=1,mu=c(0,0),Sigma=x)})
summary_stats <- t(summary_stats + rbind(gamma1,Gamma1))
## checking correlation
cor(summary_stats-cbind(gamma1,Gamma1))
#> gamma1 Gamma1
#> gamma1 1.00000000 0.07241586
#> Gamma1 0.07241586 1.00000000
################################################## Linear Linear case ###################################################
## parameters
N <- 100000 # number of snps to simulate (assumed in LD and HWE)
prev <- 0.1 # disease prevalence
beta <- 0.1 # causal effect of X and Y
gamma1 <- rnorm(N,mean=0,sd=0.3) # true instrument strengths
Gamma1 <- beta*gamma1 # true G/Y associations
MAF <- seq(from=.01,to=.5,length=N) # minor allele frequency
n_G_X <- 100000 # sample size G-X
n_G_Y <- 200000 # sample size G-Y
overlap <- 100000 # shared sample size (must be less than or equal to min(n_G_X,n_G_Y))
sigma_x=1 #variation in trait
sigma_y=1 #variation in outcome
cor_xy=0.1 # correlation between x and y on same person # note if there is no confounding this will be beta*sqrt(var_X)/sqrt(var_Y)
## assuming HWE
G_prob <- cbind(MAF^2,2*MAF*(1-MAF),(1-MAF)^2)
var_Gamma_y <- asymp_var_linear(n_G_Y,G_prob,sigma=sigma_y)
var_gamma_x <- asymp_var_linear(n_G_X,G_prob,sigma=sigma_x)
cov_gamma_x_Gamma_y <- asymp_cov_linear_linear(overlap, n_G_X,n_G_Y,G_prob,sigma_x=sigma_x,sigma_y=sigma_y,cor_xy=cor_xy)
## now what if G-X and G-Y GWASs share sample overlap (simulate joint association)
## simulate summary statistics
## create array, first 2 dimensions represent individual covariance matrices, 3rd dimension indexing SNPs
cov_array <- array(dim=c(2,2,length(var_gamma_x)))
cov_array[1,1,] <- var_gamma_x
cov_array[2,1,] <- cov_gamma_x_Gamma_y
cov_array[1,2,] <- cov_array[2,1,]
cov_array[2,2,] <- var_Gamma_y
summary_stats <- apply(cov_array,3,function(x){mvrnorm(n=1,mu=c(0,0),Sigma=x)})
summary_stats <- t(summary_stats + rbind(gamma1,Gamma1))
## checking correlation
cor(summary_stats-cbind(gamma1,Gamma1))
#> gamma1 Gamma1
#> gamma1 1.00000000 0.06836868
#> Gamma1 0.06836868 1.00000000