library(dplyr)
Attaching package: 'dplyr'
The following objects are masked from 'package:stats':
filter, lag
The following objects are masked from 'package:base':
intersect, setdiff, setequal, union
library(tidyr)
Gibran Hemani
February 23, 2024
Selection bias where X influences inclusion in the data can distort genetic associations. Is this also true for family based genetic associations?
Attaching package: 'dplyr'
The following objects are masked from 'package:stats':
filter, lag
The following objects are masked from 'package:base':
intersect, setdiff, setequal, union
# from https://github.com/MRCIEU/mrtwin_power/blob/master/scripts/sib_mr_functions.r
make_families <- function(af, nfam) {
nsnp <- length(af)
dads <- matrix(0, nfam, nsnp)
mums <- matrix(0, nfam, nsnp)
sibs1 <- matrix(0, nfam, nsnp)
sibs2 <- matrix(0, nfam, nsnp)
ibd <- matrix(0, nfam, nsnp)
ibs <- matrix(0, nfam, nsnp)
for(i in 1:nsnp)
{
dad1 <- rbinom(nfam, 1, af[i]) + 1
dad2 <- (rbinom(nfam, 1, af[i]) + 1) * -1
mum1 <- rbinom(nfam, 1, af[i]) + 1
mum2 <- (rbinom(nfam, 1, af[i]) + 1) * -1
dadindex <- sample(c(TRUE, FALSE), nfam, replace=TRUE)
dadh <- rep(NA, nfam)
dadh[dadindex] <- dad1[dadindex]
dadh[!dadindex] <- dad2[!dadindex]
mumindex <- sample(c(TRUE, FALSE), nfam, replace=TRUE)
mumh <- rep(NA, nfam)
mumh[mumindex] <- mum1[mumindex]
mumh[!mumindex] <- mum2[!mumindex]
sib1 <- cbind(dadh, mumh)
dadindex <- sample(c(TRUE, FALSE), nfam, replace=TRUE)
dadh <- rep(NA, nfam)
dadh[dadindex] <- dad1[dadindex]
dadh[!dadindex] <- dad2[!dadindex]
mumindex <- sample(c(TRUE, FALSE), nfam, replace=TRUE)
mumh <- rep(NA, nfam)
mumh[mumindex] <- mum1[mumindex]
mumh[!mumindex] <- mum2[!mumindex]
sib2 <- cbind(dadh, mumh)
ibd[,i] <- (as.numeric(sib1[,1] == sib2[,1]) + as.numeric(sib1[,2] == sib2[,2])) / 2
sibs1[,i] <- rowSums(abs(sib1) - 1)
sibs2[,i] <- rowSums(abs(sib2) - 1)
dads[,i] <- dad1 - 1 + abs(dad2) - 1
mums[,i] <- mum1 - 1 + abs(mum2) - 1
# l[[i]] <- (sum(sib1[,1] == sib2[,1]) / nsnp + sum(sib1[,2] == sib2[,2]) / nsnp) / 2
}
# This may not be correct - getting some really large values
ibs <- scale(sibs1) * scale(sibs2)
# Just count how many alleles are in common
ibs_unw <- abs(abs(sibs1 - sibs2) - 2) / 2
sibs1 <- as_tibble(sibs1)
sibs2 <- as_tibble(sibs2)
sdat <- bind_rows(
tibble(fid = 1:nfam, iid = paste0(1:nfam, "a"), sibs1),
tibble(fid = 1:nfam, iid = paste0(1:nfam, "b"), sibs2)
)
return(sdat)
return(list(dads=dads, mums=mums, sibs1=sibs1, sibs2=sibs2, ibd=ibd, ibs=ibs, ibs_unw=ibs_unw))
}
DAG
g1 -> x
g2 -> x
x -> s
Warning: The `x` argument of `as_tibble.matrix()` must have unique column names if
`.name_repair` is omitted as of tibble 2.0.0.
ℹ Using compatibility `.name_repair`.
dat$x <- dat$V1 * 0.4 + dat$V2 * -0.4 + rnorm(nrow(dat))
dat$s <- rbinom(nrow(dat), 1, plogis(dat$x))
table(dat$s)
0 1
106442 93558
Estimation in populations
Call:
lm(formula = x ~ V1, data = dat, subset = grepl("a", dat$iid))
Residuals:
Min 1Q Median 3Q Max
-4.4397 -0.7020 -0.0030 0.7069 4.7084
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -0.477390 0.005022 -95.07 <2e-16 ***
V1 0.398992 0.004738 84.21 <2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 1.039 on 99998 degrees of freedom
Multiple R-squared: 0.06622, Adjusted R-squared: 0.06621
F-statistic: 7091 on 1 and 99998 DF, p-value: < 2.2e-16
Call:
lm(formula = x ~ V1, data = dat, subset = grepl("a", dat$iid) &
dat$s)
Residuals:
Min 1Q Median 3Q Max
-3.7922 -0.6350 -0.0020 0.6344 4.2493
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 0.062212 0.006989 8.901 <2e-16 ***
V1 0.318548 0.006195 51.417 <2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 0.9372 on 46899 degrees of freedom
Multiple R-squared: 0.05336, Adjusted R-squared: 0.05334
F-statistic: 2644 on 1 and 46899 DF, p-value: < 2.2e-16
Estimation in families
a <- subset(dat, grepl("a", dat$iid))
b <- subset(dat, grepl("b", dat$iid))
ab <- inner_join(a, b, by="fid")
ab
# A tibble: 100,000 × 11
fid iid.x V1.x V2.x x.x s.x iid.y V1.y V2.y x.y s.y
<int> <chr> <dbl> <dbl> <dbl> <int> <chr> <dbl> <dbl> <dbl> <int>
1 1 1a 2 2 0.974 1 1b 1 2 -0.220 1
2 2 2a 2 0 0.174 1 2b 1 1 0.211 0
3 3 3a 1 2 0.404 1 3b 1 1 -0.351 0
4 4 4a 0 2 -0.00896 0 4b 0 1 0.381 1
5 5 5a 1 2 -1.86 0 5b 0 2 -0.682 0
6 6 6a 1 2 -0.433 0 6b 0 2 0.161 1
7 7 7a 1 1 -0.189 0 7b 1 2 0.578 0
8 8 8a 1 2 0.0850 0 8b 2 2 -0.726 0
9 9 9a 2 2 1.25 0 9b 2 1 0.236 0
10 10 10a 0 2 -0.479 0 10b 1 2 -0.0461 0
# ℹ 99,990 more rows
Call:
lm(formula = I(x.x - x.y) ~ I(V1.x - V1.y), data = ab)
Residuals:
Min 1Q Median 3Q Max
-6.3807 -0.9730 0.0080 0.9722 5.6960
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 0.005116 0.004559 1.122 0.262
I(V1.x - V1.y) 0.397261 0.006595 60.238 <2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 1.442 on 99998 degrees of freedom
Multiple R-squared: 0.03502, Adjusted R-squared: 0.03501
F-statistic: 3629 on 1 and 99998 DF, p-value: < 2.2e-16
Call:
lm(formula = I(x.x - x.y) ~ I(V1.x - V1.y), data = ab, subset = ab$s.x ==
1 & ab$s.y == 1)
Residuals:
Min 1Q Median 3Q Max
-5.2012 -0.8784 0.0009 0.8807 5.4372
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 2.302e-05 8.759e-03 0.003 0.998
I(V1.x - V1.y) 3.258e-01 1.257e-02 25.928 <2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 1.302 on 22087 degrees of freedom
Multiple R-squared: 0.02954, Adjusted R-squared: 0.02949
F-statistic: 672.3 on 1 and 22087 DF, p-value: < 2.2e-16
The selection bias is the same using within family and between family
R version 4.3.2 (2023-10-31)
Platform: aarch64-apple-darwin20 (64-bit)
Running under: macOS Ventura 13.6
Matrix products: default
BLAS: /Library/Frameworks/R.framework/Versions/4.3-arm64/Resources/lib/libRblas.0.dylib
LAPACK: /Library/Frameworks/R.framework/Versions/4.3-arm64/Resources/lib/libRlapack.dylib; LAPACK version 3.11.0
locale:
[1] en_GB.UTF-8/en_GB.UTF-8/en_GB.UTF-8/C/en_GB.UTF-8/en_GB.UTF-8
time zone: Europe/London
tzcode source: internal
attached base packages:
[1] stats graphics grDevices utils datasets methods base
other attached packages:
[1] tidyr_1.3.0 dplyr_1.1.4
loaded via a namespace (and not attached):
[1] digest_0.6.33 utf8_1.2.4 R6_2.5.1 fastmap_1.1.1
[5] tidyselect_1.2.0 xfun_0.41 magrittr_2.0.3 glue_1.6.2
[9] tibble_3.2.1 knitr_1.45 pkgconfig_2.0.3 htmltools_0.5.7
[13] rmarkdown_2.25 generics_0.1.3 lifecycle_1.0.4 cli_3.6.1
[17] fansi_1.0.5 vctrs_0.6.4 compiler_4.3.2 purrr_1.0.2
[21] tools_4.3.2 pillar_1.9.0 evaluate_0.23 yaml_2.3.7
[25] rlang_1.1.2 jsonlite_1.8.7 htmlwidgets_1.6.3