Selection bias influencing family-based GWAS

Author

Gibran Hemani

Published

February 23, 2024

Background

Selection bias where X influences inclusion in the data can distort genetic associations. Is this also true for family based genetic associations?

library(dplyr)

Attaching package: 'dplyr'
The following objects are masked from 'package:stats':

    filter, lag
The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union
library(tidyr)
# from https://github.com/MRCIEU/mrtwin_power/blob/master/scripts/sib_mr_functions.r
make_families <- function(af, nfam) {
    nsnp <- length(af)
    dads <- matrix(0, nfam, nsnp)
    mums <- matrix(0, nfam, nsnp)
    sibs1 <- matrix(0, nfam, nsnp)
    sibs2 <- matrix(0, nfam, nsnp)
    ibd <- matrix(0, nfam, nsnp)
    ibs <- matrix(0, nfam, nsnp)
    for(i in 1:nsnp)
    {
        dad1 <- rbinom(nfam, 1, af[i]) + 1
        dad2 <- (rbinom(nfam, 1, af[i]) + 1) * -1
        mum1 <- rbinom(nfam, 1, af[i]) + 1
        mum2 <- (rbinom(nfam, 1, af[i]) + 1) * -1

        dadindex <- sample(c(TRUE, FALSE), nfam, replace=TRUE)
        dadh <- rep(NA, nfam)
        dadh[dadindex] <- dad1[dadindex]
        dadh[!dadindex] <- dad2[!dadindex]

        mumindex <- sample(c(TRUE, FALSE), nfam, replace=TRUE)
        mumh <- rep(NA, nfam)
        mumh[mumindex] <- mum1[mumindex]
        mumh[!mumindex] <- mum2[!mumindex]

        sib1 <- cbind(dadh, mumh)

        dadindex <- sample(c(TRUE, FALSE), nfam, replace=TRUE)
        dadh <- rep(NA, nfam)
        dadh[dadindex] <- dad1[dadindex]
        dadh[!dadindex] <- dad2[!dadindex]

        mumindex <- sample(c(TRUE, FALSE), nfam, replace=TRUE)
        mumh <- rep(NA, nfam)
        mumh[mumindex] <- mum1[mumindex]
        mumh[!mumindex] <- mum2[!mumindex]

        sib2 <- cbind(dadh, mumh)

        ibd[,i] <- (as.numeric(sib1[,1] == sib2[,1]) + as.numeric(sib1[,2] == sib2[,2])) / 2


        sibs1[,i] <- rowSums(abs(sib1) - 1)
        sibs2[,i] <- rowSums(abs(sib2) - 1)
        dads[,i] <- dad1 - 1 + abs(dad2) - 1
        mums[,i] <- mum1 - 1 + abs(mum2) - 1

        # l[[i]] <- (sum(sib1[,1] == sib2[,1]) / nsnp + sum(sib1[,2] == sib2[,2]) / nsnp) / 2

    }

    # This may not be correct - getting some really large values
    ibs <- scale(sibs1) * scale(sibs2)

    # Just count how many alleles are in common
    ibs_unw <- abs(abs(sibs1 - sibs2) - 2) / 2

    sibs1 <- as_tibble(sibs1)
    sibs2 <- as_tibble(sibs2)
    sdat <- bind_rows(
        tibble(fid = 1:nfam, iid = paste0(1:nfam, "a"), sibs1),
        tibble(fid = 1:nfam, iid = paste0(1:nfam, "b"), sibs2)
    )
    return(sdat)

    return(list(dads=dads, mums=mums, sibs1=sibs1, sibs2=sibs2, ibd=ibd, ibs=ibs, ibs_unw=ibs_unw))

}

DAG

g1 -> x
g2 -> x
x -> s
dat <- make_families(c(0.4, 0.6), 100000)
Warning: The `x` argument of `as_tibble.matrix()` must have unique column names if
`.name_repair` is omitted as of tibble 2.0.0.
ℹ Using compatibility `.name_repair`.
dat$x <- dat$V1 * 0.4 + dat$V2 * -0.4 + rnorm(nrow(dat))
dat$s <- rbinom(nrow(dat), 1, plogis(dat$x))
table(dat$s)

     0      1 
106442  93558 

Estimation in populations

# No selection
summary(lm(x ~ V1, dat, subset=grepl("a", dat$iid)))

Call:
lm(formula = x ~ V1, data = dat, subset = grepl("a", dat$iid))

Residuals:
    Min      1Q  Median      3Q     Max 
-4.4397 -0.7020 -0.0030  0.7069  4.7084 

Coefficients:
             Estimate Std. Error t value Pr(>|t|)    
(Intercept) -0.477390   0.005022  -95.07   <2e-16 ***
V1           0.398992   0.004738   84.21   <2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 1.039 on 99998 degrees of freedom
Multiple R-squared:  0.06622,   Adjusted R-squared:  0.06621 
F-statistic:  7091 on 1 and 99998 DF,  p-value: < 2.2e-16
# With selection
summary(lm(x ~ V1, dat, subset=grepl("a", dat$iid) & dat$s))

Call:
lm(formula = x ~ V1, data = dat, subset = grepl("a", dat$iid) & 
    dat$s)

Residuals:
    Min      1Q  Median      3Q     Max 
-3.7922 -0.6350 -0.0020  0.6344  4.2493 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept) 0.062212   0.006989   8.901   <2e-16 ***
V1          0.318548   0.006195  51.417   <2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.9372 on 46899 degrees of freedom
Multiple R-squared:  0.05336,   Adjusted R-squared:  0.05334 
F-statistic:  2644 on 1 and 46899 DF,  p-value: < 2.2e-16

Estimation in families

a <- subset(dat, grepl("a", dat$iid))
b <- subset(dat, grepl("b", dat$iid))
ab <- inner_join(a, b, by="fid")
ab
# A tibble: 100,000 × 11
     fid iid.x  V1.x  V2.x      x.x   s.x iid.y  V1.y  V2.y     x.y   s.y
   <int> <chr> <dbl> <dbl>    <dbl> <int> <chr> <dbl> <dbl>   <dbl> <int>
 1     1 1a        2     2  0.974       1 1b        1     2 -0.220      1
 2     2 2a        2     0  0.174       1 2b        1     1  0.211      0
 3     3 3a        1     2  0.404       1 3b        1     1 -0.351      0
 4     4 4a        0     2 -0.00896     0 4b        0     1  0.381      1
 5     5 5a        1     2 -1.86        0 5b        0     2 -0.682      0
 6     6 6a        1     2 -0.433       0 6b        0     2  0.161      1
 7     7 7a        1     1 -0.189       0 7b        1     2  0.578      0
 8     8 8a        1     2  0.0850      0 8b        2     2 -0.726      0
 9     9 9a        2     2  1.25        0 9b        2     1  0.236      0
10    10 10a       0     2 -0.479       0 10b       1     2 -0.0461     0
# ℹ 99,990 more rows
# No selection
summary(lm(I(x.x - x.y) ~ I(V1.x - V1.y), ab))

Call:
lm(formula = I(x.x - x.y) ~ I(V1.x - V1.y), data = ab)

Residuals:
    Min      1Q  Median      3Q     Max 
-6.3807 -0.9730  0.0080  0.9722  5.6960 

Coefficients:
               Estimate Std. Error t value Pr(>|t|)    
(Intercept)    0.005116   0.004559   1.122    0.262    
I(V1.x - V1.y) 0.397261   0.006595  60.238   <2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 1.442 on 99998 degrees of freedom
Multiple R-squared:  0.03502,   Adjusted R-squared:  0.03501 
F-statistic:  3629 on 1 and 99998 DF,  p-value: < 2.2e-16
# With selection
summary(lm(I(x.x - x.y) ~ I(V1.x - V1.y), ab, subset=ab$s.x==1 & ab$s.y==1))

Call:
lm(formula = I(x.x - x.y) ~ I(V1.x - V1.y), data = ab, subset = ab$s.x == 
    1 & ab$s.y == 1)

Residuals:
    Min      1Q  Median      3Q     Max 
-5.2012 -0.8784  0.0009  0.8807  5.4372 

Coefficients:
                Estimate Std. Error t value Pr(>|t|)    
(Intercept)    2.302e-05  8.759e-03   0.003    0.998    
I(V1.x - V1.y) 3.258e-01  1.257e-02  25.928   <2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 1.302 on 22087 degrees of freedom
Multiple R-squared:  0.02954,   Adjusted R-squared:  0.02949 
F-statistic: 672.3 on 1 and 22087 DF,  p-value: < 2.2e-16

Summary

The selection bias is the same using within family and between family


sessionInfo()
R version 4.3.2 (2023-10-31)
Platform: aarch64-apple-darwin20 (64-bit)
Running under: macOS Ventura 13.6

Matrix products: default
BLAS:   /Library/Frameworks/R.framework/Versions/4.3-arm64/Resources/lib/libRblas.0.dylib 
LAPACK: /Library/Frameworks/R.framework/Versions/4.3-arm64/Resources/lib/libRlapack.dylib;  LAPACK version 3.11.0

locale:
[1] en_GB.UTF-8/en_GB.UTF-8/en_GB.UTF-8/C/en_GB.UTF-8/en_GB.UTF-8

time zone: Europe/London
tzcode source: internal

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
[1] tidyr_1.3.0 dplyr_1.1.4

loaded via a namespace (and not attached):
 [1] digest_0.6.33     utf8_1.2.4        R6_2.5.1          fastmap_1.1.1    
 [5] tidyselect_1.2.0  xfun_0.41         magrittr_2.0.3    glue_1.6.2       
 [9] tibble_3.2.1      knitr_1.45        pkgconfig_2.0.3   htmltools_0.5.7  
[13] rmarkdown_2.25    generics_0.1.3    lifecycle_1.0.4   cli_3.6.1        
[17] fansi_1.0.5       vctrs_0.6.4       compiler_4.3.2    purrr_1.0.2      
[21] tools_4.3.2       pillar_1.9.0      evaluate_0.23     yaml_2.3.7       
[25] rlang_1.1.2       jsonlite_1.8.7    htmlwidgets_1.6.3