library(dplyr)
library(MASS)
<- function(m, f, rho) {
generate_assortment stopifnot(length(m) == length(f))
require(MASS)
<- mvrnorm(n = length(m), mu=c(0,0), Sigma=matrix(c(1,rho,rho,1), 2,2))
mvdat <- rank(mvdat[ , 1], ties.method = "first")
rm <- rank(mvdat[ , 2], ties.method = "first")
rf <- order(m)
m_order <- order(f)
f_order return(tibble(m = m_order[rm], f=f_order[rf]))
}
<- function(betas, nfam, h2, rho) {
make_families <- length(betas)
npoly <- sapply(1:npoly, \(i) rbinom(nfam, 1, 0.5))
g_mother1 <- sapply(1:npoly, \(i) rbinom(nfam, 1, 0.5))
g_mother2 <- sapply(1:npoly, \(i) rbinom(nfam, 1, 0.5))
g_father1 <- sapply(1:npoly, \(i) rbinom(nfam, 1, 0.5))
g_father2 <- cbind(diag(nfam), g_mother1)
g_mother1 <- cbind(matrix(0, nfam, nfam), g_mother2)
g_mother2 <- cbind(diag(nfam), g_father1)
g_father1 <- cbind(matrix(0, nfam, nfam), g_father2)
g_father2
<- (g_mother1[,(nfam+1):ncol(g_mother1)] + g_mother2[,(nfam+1):ncol(g_mother2)]) %*% betas
prs_mother <- (g_father1[,(nfam+1):ncol(g_father1)] + g_father2[,(nfam+1):ncol(g_father2)]) %*% betas
prs_father <- scale(prs_mother) + rnorm(nfam, 0, sqrt(1 - h2))
y_mother <- scale(prs_father) + rnorm(nfam, 0, sqrt(1 - h2))
y_father
<- generate_assortment(y_mother, y_father, rho)
m <- g_mother1[m$m,]
g_mother1 <- g_mother2[m$m,]
g_mother2 <- g_father1[m$f,]
g_father1 <- g_father2[m$f,]
g_father2 <- y_mother[m$m]
y_mother <- y_father[m$f]
y_father <- prs_mother[m$m]
prs_mother <- prs_father[m$f]
prs_father return(list(g_mother1 = g_mother1, g_mother2 = g_mother2, g_father1 = g_father1, g_father2 = g_father2, x = tibble(y_mother, y_father, prs_mother, prs_father)))
}
<- rnorm(100)
betas <- make_families(betas = betas, nfam = 100, nchild = 100, h2 = 0.8, rho = 0.4, ngen = 100)
dat
str(dat)
cor(dat$x$y_mother, dat$x$y_father)
cor(dat$x$y_mother, dat$x$prs_mother)^2
Background
Generate families with polygenic effects plus null rare variants. Allow for assortative mating. See if there is confounding of rare variants by polygenic effects.
Simulations
<- function(dat, betas, h2, rho) {
create_child <- nrow(dat$x)
nfam <- ncol(dat$g_mother1)
nsnp <- matrix(0, nfam, nsnp)
sib1_m <- matrix(0, nfam, nsnp)
sib1_f <- matrix(0, nfam, nsnp)
sib2_m <- matrix(0, nfam, nsnp)
sib2_f for(i in 1:nsnp) {
<- sample(c(TRUE, FALSE), nfam, replace=TRUE)
ind <- dat$g_mother1[ind, i]
sib1_m[ind, i] !ind, i] <- dat$g_mother2[!ind, i]
sib1_m[<- sample(c(TRUE, FALSE), nfam, replace=TRUE)
ind <- dat$g_father1[ind, i]
sib1_f[ind, i] !ind, i] <- dat$g_father2[!ind, i]
sib1_f[<- sample(c(TRUE, FALSE), nfam, replace=TRUE)
ind <- dat$g_mother1[ind, i]
sib2_m[ind, i] !ind, i] <- dat$g_mother2[!ind, i]
sib2_m[<- sample(c(TRUE, FALSE), nfam, replace=TRUE)
ind <- dat$g_father1[ind, i]
sib2_f[ind, i] !ind, i] <- dat$g_father2[!ind, i]
sib2_f[
}
<- (sib1_m[,(nfam+1):nsnp] + sib1_f[,(nfam+1):nsnp]) %*% betas
prs_sib1 <- (sib2_m[,(nfam+1):nsnp] + sib2_f[,(nfam+1):nsnp]) %*% betas
prs_sib2 <- scale(prs_sib1) + rnorm(nfam, 0, sqrt(1 - h2))
y_sib1 <- scale(prs_sib2) + rnorm(nfam, 0, sqrt(1 - h2))
y_sib2
# This doesn't exclude inbreeding
<- generate_assortment(y_sib1, y_sib2, rho)
m <- sib1_m[m$m,]
sib1_m <- sib1_f[m$m,]
sib1_f <- sib2_m[m$f,]
sib2_m <- sib2_f[m$f,]
sib2_f <- y_sib1[m$m]
y_sib1 <- y_sib2[m$f]
y_sib2 <- prs_sib1[m$m]
prs_sib1 <- prs_sib2[m$f]
prs_sib2 return(list(g_mother1 = sib1_m, g_mother2 = sib1_f, g_father1 = sib2_m, g_father2 = sib2_f, x = tibble(y_mother=y_sib1, y_father=y_sib2, prs_mother=prs_sib1, prs_father=prs_sib2)))
}
<- create_child(dat, betas, h2 = 0.8, rho = 0.4)
dat2 str(dat2)
cor(dat2$x$y_mother, dat2$x$y_father)
cor(dat2$x$prs_mother, dat2$x$prs_father)
cor(dat2$x$y_mother, dat2$x$prs_mother)^2
<- rnorm(100)
betas <- make_families(betas = betas, nfam = 100, h2 = 0.8, rho = 0.4)
dat for(i in 1:10) {
<- create_child(dat, betas, h2 = 0.8, rho = 0.4)
dat }
<- function(dat) {
collapse_dat <- dat$g_mother1 + dat$g_mother2
g_mother <- dat$g_father1 + dat$g_father2
g_father <- rbind(g_mother, g_father)
g <- tibble(y = c(dat$x$y_mother, dat$x$y_father), prs = c(dat$x$prs_mother, dat$x$prs_father))
x return(list(g = g, x = x))
}
<- collapse_dat(dat) dat2
<- function(l) {
collapse_dat2 <- do.call(rbind, lapply(l, \(x) x$g))
g <- do.call(rbind, lapply(1:length(l), \(i) l[[i]]$x %>% mutate(gen=i)))
x return(list(g = g, x = x))
}
<- collapse_dat2(list(collapse_dat(dat), collapse_dat(create_child(dat, betas, h2 = 0.8, rho = 0.4))))
temp str(temp)
<- function(y, x) {
fastAssoc <- is.finite(y) & is.finite(x)
index <- sum(index)
n <- y[index]
y <- x[index]
x <- var(x)
vx <- cov(y, x) / vx
bhat <- mean(y) - bhat * mean(x)
ahat # fitted <- ahat + x * bhat
# residuals <- y - fitted
# SSR <- sum((residuals - mean(residuals))^2)
# SSF <- sum((fitted - mean(fitted))^2)
<- (bhat * vx)^2 / (vx * var(y))
rsq <- rsq * (n-2) / (1-rsq)
fval <- sqrt(fval)
tval <- sum(x) / n / 2
af <- abs(bhat / tval)
se
# Fval <- (SSF) / (SSR/(n-2))
# pval <- pf(Fval, 1, n-2, lowe=F)
<- pf(fval, 1, n-2, lowe=F)
p return(list(
ahat=ahat, bhat=bhat, se=se, fval=fval, pval=p, af = af
))
}
<- function(y, g) {
gwas <- matrix(0, ncol(g), 6)
out for(i in 1:ncol(g))
{<- fastAssoc(y, g[,i])
o <- unlist(o)
out[i, ]
}<- as.data.frame(out)
out names(out) <- names(o)
return(out)
}
gwas(dat2$x$prs, dat2$g)
<- function(g, x) {
genotype_means lapply(1:ncol(g), \(i) {
tibble(g = g[,i], x = x) %>% group_by(g) %>% summarise(snp=i, mean = mean(x), sd = sd(x), n = n())
%>% bind_rows()
})
}
<- function(pvector) {
qqplot <- pvector[!is.na(pvector) & !is.nan(pvector) & !is.null(pvector) & is.finite(pvector) & pvector<1 & pvector>0]
pvector <- -log10(sort(pvector, decreasing=FALSE))
o <- -log10(ppoints(length(pvector)))
e <- qchisq(1-pvector, 1)
cs <- median(cs, na.rm=TRUE) / qchisq(0.5, 1)
lambda plot(e, o, xlab = "Expected", ylab = "Observed")
abline(0, 1)
return(lambda)
}
genotype_means(dat2$g, dat2$x$prs)
<- function(betas, nfam, h2, rho, ngen) {
whole_sim <- make_families(betas = betas, nfam = nfam, h2 = h2, rho = rho)
dat <- list()
l <- list()
l2 <- collapse_dat(dat)
dat2 1]] <- gwas(dat2$x$prs, dat2$g) %>% mutate(gen = 1)
l[[1]] <- l[[1]]
l2[[for(i in 1:ngen) {
message(i)
<- create_child(dat, betas, h2 = h2, rho = rho)
dat print(cor(dat$x$y_mother, dat$x$y_father))
<- collapse_dat(dat)
dat3 <- collapse_dat2(list(dat2, dat3))
dat2 +1]] <- gwas(dat2$x$prs, dat2$g) %>% mutate(gen = i+1)
l[[i+1]] <- gwas(dat3$x$prs, dat3$g) %>% mutate(gen = i+1)
l2[[i
}<- bind_rows(l)
res <- bind_rows(l2)
res2 <- genotype_means(dat2$g, dat2$x$prs)
m return(list(res, res2, m))
}
<- whole_sim(betas = rnorm(100), nfam = 1000, h2 = 0.8, rho = 0.4, ngen = 3) out
qqplot(-log10(out[[2]]$pval[1:1000]))
<- whole_sim(betas = rnorm(100), nfam = 10000, h2 = 0, rho = 0, ngen = 3)
out2 qqplot(-log10(out2[[2]]$pval[10001:11000]))
qqplot(-log10(out2[[2]]$pval[1:10000]))
<- whole_sim(betas = rnorm(100), nfam = 10000, h2 = 0.8, rho = 0.4, ngen = 3)
out qqplot(-log10(out[[2]]$pval[10001:11000]))
qqplot(-log10(out[[2]]$pval[1:10000]))
- Find families
- Mean PRS per family
- For families with large PRS, do they have rare variants?
- Do those rare variants associate with PRS?
<- function(g) {
make_grm <- colMeans(g) / 2
f <- scale(g)
g <- matrix(0, nrow(g), nrow(g))
m <- sum(f > 0)
npol for(i in 1:nrow(g)) {
for(j in 1:i) {
<- sum(g[i,] * g[j,], na.rm=TRUE) / npol
x # print(x)
<- x
m[i,j] <- m[i,j]
m[j,i]
}
}# hist(m[lower.tri(m)], breaks=100)
return(m)
}
<- function(m) {
greedy_remove_relateds <- nrow(m)
n <- rep(TRUE, n)
keep
<- tibble(
b id=1:n,
rel=sapply(1:n, \(i) sum(m[i,] > 0.2) - 1)
%>% arrange(desc(rel)) %>%
) filter(rel > 0)
diag(m) <- 0
while(nrow(b) > 0) {
message(nrow(b))
$id[1]] <- FALSE
keep[b$id[1],] <- 0
m[b$id[1]] <- 0
m[,b<- tibble(
b id=1:n,
rel=sapply(1:n, \(i) sum(m[i,] > 0.2) - 1)
%>% arrange(desc(rel)) %>%
) filter(rel > 0)
print(head(b))
}return(keep)
}
<- rnorm(100)
betas <- make_families(betas = betas, nfam = 100, h2 = 0.8, rho = 0.4)
dat <- create_child(dat, betas, h2 = 0.8, rho = 0.4)
dat2 <- create_child(dat2, betas, h2 = 0.8, rho = 0.4)
dat3 <- create_child(dat3, betas, h2 = 0.8, rho = 0.4)
dat4 <- create_child(dat4, betas, h2 = 0.8, rho = 0.4)
dat5 <- collapse_dat2(lapply(list(dat, dat2, dat3, dat4, dat5), collapse_dat))
cdat str(cdat)
str(dat2)
<- cdat$g
g <- collapse_dat(dat)$g
g <- collapse_dat(dat2)$g
g <- make_grm(g)
m
<- greedy_remove_relateds(m)
k <- m[k, k]
m dim(m)
dim(m)
1:10,1:10]
g[apply(g, 2, mean)
sum(g[1,] * g[2,]) / length(g[1,])
<- graph_
gr
library(ggplot2)
hist(m[lower.tri(m)], breaks=100)
<- function(cdat) {
find_families <- unique(cdat$x$gen)
gens lapply(gens, function(i) {
<- make_grm(cdat$g[cdat$x$gen == i,])
m diag(m) <- 0
< 0.2] <- 0
m[m >= 0.2] <- 1
m[m <- graph_from_adjacency_matrix(m)
gr <- cluster_walktrap(gr)
rw $membership
rw<- cdat$x[cdat$x$gen == i,] %>% mutate(fid = rw$membership)
x <- x %>% group_by(fid) %>% summarise(mean_prs = mean(prs), n = n(), se_prs = sd(prs) / sqrt(n())) %>% arrange(desc(mean_prs)) %>% filter(n > 1) %>% mutate(fam=1:n(), gen=i)
xs return(xs)
%>% bind_rows()
})
}
<- collapse_dat2(lapply(list(dat, dat2), collapse_dat))
cdat find_families(cdat)
<- collapse_dat2(lapply(list(dat, dat2, dat3, dat4, dat5), collapse_dat))
cdat <- find_families(cdat)
f table(f$gen)
ggplot(f, aes(x=mean_prs, y=fam)) +
geom_point() +
geom_errorbarh(aes(xmin=mean_prs - 1.96 * se_prs, xmax=mean_prs + 1.96 * se_prs), height=0) +
facet_grid(. ~ gen)
<- function(ngen, nfam, npoly, h2, rho) {
fam_sim <- rnorm(npoly)
betas <- list()
l 1]] <- make_families(betas = betas, nfam = nfam, h2 = h2, rho = rho)
l[[for(i in 2:ngen) {
<- create_child(l[[i-1]], betas, h2 = h2, rho = rho)
l[[i]]
}<- collapse_dat2(lapply(l, collapse_dat))
cdat <- find_families(cdat) %>% mutate(npoly=npoly, nfam=nfam, h2=h2, rho=rho)
xs return(xs)
}
<- fam_sim(11, 500, 100, 0.8, 0)
a
a
%>% group_by(gen) %>%
a filter(row_number()==1 | row_number()==n()) %>%
ggplot(., aes(y=mean_prs, x=gen)) +
geom_point() +
geom_errorbar(aes(ymin=mean_prs - 1.96 * se_prs, ymax=mean_prs + 1.96 * se_prs), width=0)
<- expand.grid(
param ngen = 10,
nfam = 250,
npoly = 100,
h2 = 0.8,
rho = c(0, 0.2, 0.4, 0.6, 0.8),
sims = 1:10
)
<- lapply(1:nrow(param), \(i) {
o message(i)
fam_sim(param$ngen[i], param$nfam[i], param$npoly[i], param$h2[i], param$rho[i])
%>% bind_rows() })
<- rnorm(100)
betas <- make_families(betas = betas, nfam = 100, h2 = 0.8, rho = 0.4)
dat <- create_child(dat, betas, h2 = 0.8, rho = 0.4)
dat2 <- collapse_dat2(list(collapse_dat(dat), collapse_dat(dat2)))
cdat str(cdat)
$x %>% mutate(s=prs_mother+prs_father) %>% mutate(fid=1:n()) %>% arrange(desc(s))
dat
hist(dat$x$prs_mother, breaks=100)
sessionInfo()