load(url("https://github.com/explodecomputer/lab-book/raw/refs/heads/main/posts/2024-09-18-conditional-summary-stats/1kg_region.rdata"))
library(MASS)
library(dplyr)
<- function(A) {
solve2 <- eigen(A)
eig $values <- 1 / eig$values
eigreturn(eig$vectors %*% diag(eig$values) %*% t(eig$vectors))
}
<- function(A, lambda=1e-6) {
solve3 solve(A + diag(nrow(A)) * lambda)
}
<- function(n) {
sim_mat <- qr.Q(qr(matrix(rnorm(n^2), n)))
p <- crossprod(p, p*(n:1))
Sigma <- diag(Sigma)
v <- diag(1/sqrt(v)) %*% Sigma %*% diag(1/sqrt(v))
Sigma <- v / max(v) / 2
v <- (1 + sqrt(1-2*v)) / 2
af return(list(R=Sigma, af=af))
}# sim_mat(100)[[1]][1:10,1:10]
<- function(X, af, ncause, sigmag, seed=1234) {
simulate_ss set.seed(seed)
<- length(af)
nsnp <- nrow(X)
nid <- rep(0, nsnp)
b sample(1:nsnp, ncause)] <- rnorm(ncause, sd=sigmag)
b[
<- rnorm(nid)
e <- X %*% b + e
y print(mean(y))
<- sapply(1:nsnp, \(i) {cov(X[,i], y) / var(X[,i])})
betahat <- sapply(1:nsnp, \(i) {sqrt(var(y) / (var(X[,i] * sqrt(nid))))})
se <- betahat/se
zhat <- 2 * pnorm(-abs(zhat))
pval
<- MASS::ginv(t(X) %*% X) %*% t(X) %*% y
betaexp # betaexp <- solve3(t(X) %*% X) %*% t(X) %*% y
return(tibble(betahat, betaexp, b, se, zhat, pval, af))
}
<- rbind(X, X, X, X)
X
<- simulate_ss(X, af, 20, 0.1)
ss
plot(ss$betahat)
plot(ss$betaexp)
Background
Data generating model
\[ y = Xb + e \]
where \(y\) is the phenotype, \(X\) is the genotype matrix, \(b\) is the effect sizes and \(e\) is the error term. X is centred to have each column mean zero to avoid need to handle intercept terms.
The expected associations between each variant and y is
\[ \beta = (X^T X)^{-1} X^T y \]
plot(ss$betaexp, ss$betahat)
This is equivalent to
\[ \beta = D^{-1} R D b \]
where \(D\) is a diagonal matrix with the diagonal elements being the variance of the columns of \(X\), \(R\) is the correlation matrix of \(X\) and \(b\) is a vector of true effect sizes. Expect that \(b\) is quite sparse, and can be estimated from finemapping methods.
<- diag(2 * af * (1 - af))
D <- diag(1 / diag(D))
Di <- cor(X)
R $betahat2 <- sqrt(Di) %*% R %*% D %*% ss$b
ssplot(ss$betahat2, ss$betaexp)
\[ \beta = D^{-1} R D b \]
where \(D\) is a diagonal matrix with the diagonal elements being the variance of the columns of \(X\), \(R\) is the correlation matrix of \(X\) and \(b\) is a vector of true effect sizes. Expect that \(b\) is quite sparse, and can be estimated from finemapping methods.
If there are \(K\) causal variants, we would like to obtain \(K\) versions of the summary statistics where each \(k\)th version is obtained by removing the effects of all other causal variants.
\[ \hat{b}_{(v)} = D_{(v)} R^{-1}_{(v)} D^{-1}_{(v)} \beta_{(v)} \]
where \(\hat{b}_{(v)}\) is the vector true causal effect estimates for the \(K\) causal variants only.
So the region-wide associations generated from all causal variants can be converted into a set of \(K\) regional associations, where each represents the causal effect of one variant and with all other causal effects removed.
\[ \hat{b}_{(k)} = \beta - \hat{b}_{(v)} \]
<- function(n, nid) {
sim_mat <- qr.Q(qr(matrix(rnorm(n^2), n)))
p <- crossprod(p, p*(n:1))
Sigma <- diag(Sigma)
v <- diag(1/sqrt(v)) %*% Sigma %*% diag(1/sqrt(v))
Sigma <- v / max(v) / 2
v <- (1 + sqrt(1-2*v)) / 2
af <- mvtnorm::rmvnorm(nid, rep(0, n), Sigma)
X return(list(X=X, R=Sigma, af=af))
}
<- sim_mat(100, 500)
m <- m$X
X
<- rep(0, ncol(X))
b sample(1:ncol(X), 3)] <- rnorm(3, sd=10)
b[<- X %*% b + rnorm(nrow(X))
y
b<- solve(t(X) %*% X) %*% t(X) %*% y
bexp plot(bexp)
<- sapply(1:ncol(X), \(i) {cov(X[,i], y) / var(X[,i])})
bhat plot(bhat)
plot(bhat ~ bexp)
<- cor(X)
R 1:10,1:10]
R[
<- diag(apply(X, 2, var))
D
plot(
c(sqrt(D) %*% R %*% sqrt(D))*(nrow(X)-1),
c(t(X) %*% X)
)
plot(
c(t(X) %*% y),
%*% sqrt(D) * (nrow(X)-1)
bhat
)
<- solve(t(X) %*% X)
Xi
<- solve(R)
Ri
# sim_mat(100)[[1]][1:10,1:10]
I want there to be 3 causal variants.
I will create summary statistics for all variants. Then SS for each variant removed.
I want to recapitulate each variant removed.
<- sim_mat(100, 500)
m <- m$X
X <- rep(0, ncol(X))
b sample(1:ncol(X), 3)] <- rnorm(3, sd=10)
b[<- rnorm(nrow(X))
e <- X %*% b + e
y
# Generate summary stats for all variants
<- sapply(1:ncol(X), \(i) {cov(X[,i], y) / var(X[,i])})
bhat
# Expected summary stats
<- diag(apply(X, 2, var))
D <- diag(1 / diag(D))
Di <- cor(X)
R <- Di %*% R %*% D %*% b
bexp
# Generate summary stats for each variant on its own
<- which(b != 0)
ind <- lapply(ind, \(i) {
bhatcond <- b[i]
bk <- rep(0, ncol(X))
b <- bk
b[i] <- X %*% b + e
y <- sapply(1:ncol(X), \(i) {cov(X[,i], y) / var(X[,i])})
bhat return(bhat)
})
<- lapply(1:length(bhatcond), \(i) {
bhatcondl tibble(b=bhatcond[[i]], i, pos=1:length(b))
%>% bind_rows()
})
%>% ggplot(aes(x=pos, y=b)) + geom_point() + facet_grid(i ~ .)
bhatcondl
# simulate summary stats excluding each variant
<- lapply(seq_along(ind), \(i) {
bcond <- b
bt <- 0
bt[ind[i]] <- Di %*% R %*% D %*% bt
bsub - bsub
bhat
})
<- lapply(1:length(bcond), \(i) {
bcondl tibble(b=bcond[[i]], i, pos=1:length(b), what="bcond")
%>% bind_rows()
})
%>% ggplot(aes(x=pos, y=b)) + geom_point() + facet_grid(i ~ .)
bcondl
inner_join(bhatcondl, bcondl, by=c("i", "pos")) %>% ggplot(aes(x=b.x, y=b.y)) + geom_point() + geom_abline() + facet_grid(i ~ .)
lapply(seq_along(ind), \(i) {
cor(bhatcond[[i]], bcond[[i]])
})
Do it again with genotype matrix
load(url("https://github.com/explodecomputer/lab-book/raw/refs/heads/main/posts/2024-09-18-conditional-summary-stats/1kg_region.rdata"))
<- rep(0, ncol(X))
b sample(1:ncol(X), 3)] <- rnorm(3, sd=10)
b[<- rnorm(nrow(X))
e <- X %*% b + e
y
# Generate summary stats for all variants
<- sapply(1:ncol(X), \(i) {cov(X[,i], y) / var(X[,i])})
bhat plot(bhat)
# Expected summary stats
<- diag(apply(X, 2, var))
D <- diag(1 / diag(D))
Di <- cor(X)
R <- sqrt(Di) %*% R %*% sqrt(D) %*% b
bexp plot(bexp)
plot(bhat ~ bexp)
# Generate summary stats for each variant on its own
<- which(b != 0)
ind <- lapply(ind, \(i) {
bhatcond <- b[i]
bk <- rep(0, ncol(X))
b <- bk
b[i] <- X %*% b + e
y <- sapply(1:ncol(X), \(i) {cov(X[,i], y) / var(X[,i])})
bhat return(bhat)
})
<- lapply(1:length(bhatcond), \(i) {
bhatcondl tibble(b=bhatcond[[i]], i, pos=1:length(b))
%>% bind_rows()
})
%>% ggplot(aes(x=pos, y=b)) + geom_point() + facet_grid(i ~ .)
bhatcondl
# simulate summary stats excluding each variant
<- lapply(seq_along(ind), \(i) {
bcond <- b
bt <- 0
bt[ind[i]] <- sqrt(Di) %*% R %*% sqrt(D) %*% bt
bsub - bsub
bhat
})
<- lapply(1:length(bcond), \(i) {
bcondl tibble(b=bcond[[i]], i, pos=1:length(b), what="bcond")
%>% bind_rows()
})
%>% ggplot(aes(x=pos, y=b)) + geom_point() + facet_grid(i ~ .)
bcondl
inner_join(bhatcondl, bcondl, by=c("i", "pos")) %>% ggplot(aes(x=b.x, y=b.y)) + geom_point() + geom_abline() + facet_grid(i ~ .)
lapply(seq_along(ind), \(i) {
cor(bhatcond[[i]], bcond[[i]])
})
<- function(bhat, R, af, index) {
generate_marginals <- 2 * af * (1 - af)
v <- diag(sqrt(v))
D <- diag(1 / diag(D))
Di
if(length(index) == 1) {
<- bhat[index]
bhat2 else {
} <- (D[index,index] %*% MASS::ginv(R[index,index]) %*% Di[index,index] %*% bhat[index]) %>% drop()
bhat2
}<- rep(0, nsnp)
b2 <- bhat2
b2[index]
<- lapply(seq_along(index), \(i) {
bcond <- b2
bt <- 0
bt[ind[i]] <- (sqrt(Di) %*% R %*% sqrt(D) %*% bt) %>% drop()
bsub - bsub
bhat
})return(bcond)
}
<- generate_marginals(bhat, R, af, which(b != 0))
bcond
<- lapply(1:length(bcond), \(i) {
bcondl tibble(b=bcond[[i]], i, pos=1:length(b), what="bcond")
%>% bind_rows()
})
%>% ggplot(aes(x=pos, y=b)) + geom_point() + facet_grid(i ~ .)
bcondl
inner_join(bhatcondl, bcondl, by=c("i", "pos")) %>% ggplot(aes(x=b.x, y=b.y)) + geom_point() + geom_abline() + facet_grid(i ~ .)
True
sstrue <- lapply(1:length(i), (j) { bb <- ba bb[i[-j]] <- 0 print(which(bb != 0)) beta_ld <- as.numeric(diag(1/xvar) %% R %% diag(xvar) %*% bb) tibble(j, beta_ld, pos = 1:length(beta_ld)) }) %>% bind_rows()
sstrue
ggplot(sstrue, aes(x = pos, y=abs(beta_ld))) + geom_point() + facet_grid(j ~ .)
cor(sstrue\(beta_ld, ss\)bhj)
inner_join(sstrue, ss, by=c(“j”, “pos”)) %>% ggplot(aes(x=(beta_ld), y=(bhj))) + geom_point() + geom_abline() + facet_grid(j ~ .)
sessionInfo()