Comparing expected vs observed

Author

Gibran Hemani

Published

April 29, 2024

Background

Is binomial test appropriate?

set.seed(100)
library(dplyr)

Attaching package: 'dplyr'
The following objects are masked from 'package:stats':

    filter, lag
The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union
library(ggplot2)


#' Simulate GWAS summary data discovery and replication
#'
#' @param n1 The number of individuals in the first sample.
#' @param n2 The number of individuals in the second sample.
#' @param nsnp The number of causal SNPs (Single Nucleotide Polymorphisms) to simulate.
#' @param nflip The number of SNPs to set to have 0 effect in the second group.
#' @param afshared A logical value indicating whether the allele frequencies should be shared between the two groups.
#' @param bsd The standard deviation of the distribution of effect sizes.
#'
#' @return Data frame
sim <- function(n1, n2, nsnp, nflip=1, afshared=FALSE, bsd=1) {
    # different allele frequencies per study
    af1 <- runif(nsnp, 0.01, 0.99)
    if(afshared) {
        af2 <- af1
    } else {
        af2 <- runif(nsnp, 0.01, 0.99)
    }

    # identifical effect sizes across studies
    b1 <- rnorm(nsnp, sd=bsd)
    b2 <- b1

    # make one of the effects different
    if(nflip > 0) {
        b1[1:nflip] <- 0
    }

    # Assume variance of trait is the same across studies
    se1 <- 1 / sqrt(2 * af1 * (1-af1) * n1)
    se2 <- 1 / sqrt(2 * af2 * (1-af2) * n2)

    dat <- tibble(
        af1, af2, b1, b2, se1, se2,
        bhat1 = rnorm(nsnp, b1, se1),
        bhat2 = rnorm(nsnp, b2, se2),
        pval1 = 2 * pnorm(-abs(bhat1/se1)),
        pval2 = 2 * pnorm(-abs(bhat2/se2)),
        r21 = b1^2 * af1 * (1-af1) * 2,
        r22 = b2^2 * af2 * (1-af2) * 2,
    )
    return(dat)
}

prop_overlap <- function(b_disc, b_rep, se_disc, se_rep, alpha) {
  p_sign <- pnorm(-abs(b_disc) / se_disc) * pnorm(-abs(b_disc) / se_rep) + ((1 - pnorm(-abs(b_disc) / se_disc)) * (1 - pnorm(-abs(b_disc) / se_rep)))
  p_sig <- pnorm(-abs(b_disc) / se_rep + qnorm(alpha / 2)) + (1 - pnorm(-abs(b_disc) / se_rep - qnorm(alpha / 2)))
  p_rep <- pnorm(abs(b_rep) / se_rep, lower.tail = FALSE)
  res <- tibble::tibble(
    nsnp = length(b_disc),
    metric = c("Sign", "Sign", "P-value", "P-value"),
    datum = c("Expected", "Observed", "Expected", "Observed"),
    value = c(sum(p_sign, na.rm = TRUE), sum(sign(b_disc) == sign(b_rep)), sum(p_sig, na.rm = TRUE), sum(p_rep < alpha, na.rm = TRUE))
  ) %>%
    dplyr::group_by(metric) %>%
      dplyr::do({
        x <- .
        if(.$nsnp[1] > 0) {
          bt <- binom.test(
            x=.$value[.$datum == "Observed"], 
            n=.$nsnp[1], 
            p=.$value[.$datum == "Expected"] / .$nsnp[1]
          )$p.value
          x$pdiff <- bt
        }
        x
      })
  return(list(res = res, variants = dplyr::tibble(sig = p_sig, sign = p_sign)))
}

sim_test <- function(res, nboot=1000) {
    res$res$pdiff_sim <- 0
    res$res$pdiff_sim_np <- 0
    bootsig <- sapply(1:nboot, \(i) {
        rbinom(nrow(res$variants), 1, res$variants$sig) %>% sum
    })
    res$res$pdiff_sim[1:2] <- pnorm(res$res$value[2], mean=mean(bootsig), sd=sd(bootsig), lower.tail=FALSE)
    res$res$pdiff_sim_np[1:2] <- sum(res$res$value[2] < bootsig) / nboot
    bootsign <- sapply(1:nboot, \(i) {
        rbinom(nrow(res$variants), 1, res$variants$sign) %>% sum
    })
    res$res$pdiff_sim[3:4] <- pnorm(res$res$value[4], mean=mean(bootsign), sd=sd(bootsign), lower.tail=TRUE)
    res$res$pdiff_sim_np[3:4] <- sum(res$res$value[4] > bootsign) / nboot
    return(res)
}
s <- sim(10000, 1000, 100, nflip=0)
p <- prop_overlap(s$b1, s$b2, s$se1, s$se2, 0.05)
p <- sim_test(p)
p
$res
# A tibble: 4 × 7
# Groups:   metric [2]
   nsnp metric  datum    value pdiff pdiff_sim pdiff_sim_np
  <int> <chr>   <chr>    <dbl> <dbl>     <dbl>        <dbl>
1   100 P-value Expected  86.4 0.559     0.112         0.08
2   100 P-value Observed  89   0.559     0.112         0.08
3   100 Sign    Expected  97.6 0.182     0.969         0.94
4   100 Sign    Observed 100   0.182     0.969         0.94

$variants
# A tibble: 100 × 2
     sig  sign
   <dbl> <dbl>
 1 1.00  1.00 
 2 1     1    
 3 0.996 1.00 
 4 1     1    
 5 1     1    
 6 1.00  1    
 7 1.00  1.00 
 8 1.00  1.00 
 9 1.00  1.00 
10 0.669 0.992
# ℹ 90 more rows
param <- expand.grid(
    n1 = c(10000),
    n2 = c(1000, 10000),
    nsnp = c(100),
    nflip = c(rep(0, 10), 1:20),
    afshared = c(FALSE),
    nsim=1:100
)
dim(param)

res <- lapply(1:nrow(param), \(i) {
    s <- sim(param$n1[i], param$n2[i], param$nsnp[i], nflip=param$nflip[i], afshared=param$afshared[i])
    p <- prop_overlap(s$b1, s$b2, s$se1, s$se2, 0.05)
    res <- sim_test(p)

    pa <- bind_cols(param[i,], res$res %>% select(-nsnp))
    return(pa)
}) %>% bind_rows()
head(res)   
load("res.rdata")
res %>% 
    as_tibble() %>%
    filter(datum == "Observed") %>%
    tidyr::pivot_longer(cols=c(pdiff, pdiff_sim, pdiff_sim_np), names_to="type", values_to="p") %>%
    group_by(nflip, afshared, metric, n1, n2, nsnp, type) %>%
    summarise(n=n(), p = sum(p < 0.05)/n()) %>%
    ggplot(aes(x=nflip, y=p, colour=type)) +
        geom_point() +
        geom_line() +
        facet_grid(n2 ~ metric)
`summarise()` has grouped output by 'nflip', 'afshared', 'metric', 'n1', 'n2',
'nsnp'. You can override using the `.groups` argument.

res %>% 
    as_tibble() %>%
    filter(datum == "Observed") %>%
    group_by(nflip, metric) %>%
    summarise(n=n(), pdiff = sum(pdiff < 0.05)/n(), pdiff_sim = sum(pdiff_sim < 0.05)/n())
`summarise()` has grouped output by 'nflip'. You can override using the
`.groups` argument.
# A tibble: 42 × 5
# Groups:   nflip [21]
   nflip metric      n  pdiff pdiff_sim
   <dbl> <chr>   <int>  <dbl>     <dbl>
 1     0 P-value  2000 0.0005     0.085
 2     0 Sign     2000 0.0105     0    
 3     1 P-value   200 0          0.35 
 4     1 Sign      200 0          0    
 5     2 P-value   200 0.005      0.78 
 6     2 Sign      200 0          0    
 7     3 P-value   200 0.025      0.96 
 8     3 Sign      200 0          0.02 
 9     4 P-value   200 0.2        0.975
10     4 Sign      200 0          0.06 
# ℹ 32 more rows
res %>% 
    as_tibble() %>%
    filter(datum == "Observed") %>%
    group_by(nflip, metric) %>%
    summarise(n=n(), pdiff = sum(pdiff < 0.05)/n(), pdiff_sim = sum(pdiff_sim < 0.05)/n()) %>%
    tidyr::pivot_longer(cols=c(pdiff, pdiff_sim), names_to="type", values_to="p") %>%
    ggplot(aes(x=nflip, y=p, colour=type)) +
        geom_point() +
        facet_grid(. ~ metric)
`summarise()` has grouped output by 'nflip'. You can override using the
`.groups` argument.

res %>%
    as_tibble() %>%
    filter(datum == "Observed" & nflip == 0) %>%
    tidyr::pivot_longer(cols=c(pdiff, pdiff_sim, pdiff_sim_np), names_to="type", values_to="p") %>%
    group_by(metric, type, n1, n2, nsnp) %>%
    summarise(n=n(), p = sum(p < 0.05)/n()) %>%
    ggplot(aes(x=type, y=p, colour=metric)) +
        geom_point() +
        facet_grid(n2 ~ .)
`summarise()` has grouped output by 'metric', 'type', 'n1', 'n2'. You can
override using the `.groups` argument.


sessionInfo()
R version 4.4.0 (2024-04-24)
Platform: x86_64-apple-darwin20
Running under: macOS Ventura 13.6

Matrix products: default
BLAS:   /Library/Frameworks/R.framework/Versions/4.4-x86_64/Resources/lib/libRblas.0.dylib 
LAPACK: /Library/Frameworks/R.framework/Versions/4.4-x86_64/Resources/lib/libRlapack.dylib;  LAPACK version 3.12.0

locale:
[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8

time zone: Europe/London
tzcode source: internal

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
[1] ggplot2_3.5.1 dplyr_1.1.4  

loaded via a namespace (and not attached):
 [1] vctrs_0.6.5       cli_3.6.2         knitr_1.46        rlang_1.1.3      
 [5] xfun_0.43         purrr_1.0.2       generics_0.1.3    jsonlite_1.8.8   
 [9] labeling_0.4.3    glue_1.7.0        colorspace_2.1-0  htmltools_0.5.8.1
[13] scales_1.3.0      fansi_1.0.6       rmarkdown_2.26    grid_4.4.0       
[17] munsell_0.5.1     evaluate_0.23     tibble_3.2.1      fastmap_1.1.1    
[21] yaml_2.3.8        lifecycle_1.0.4   compiler_4.4.0    htmlwidgets_1.6.4
[25] pkgconfig_2.0.3   tidyr_1.3.1       farver_2.1.1      digest_0.6.35    
[29] R6_2.5.1          tidyselect_1.2.1  utf8_1.2.4        pillar_1.9.0     
[33] magrittr_2.0.3    withr_3.0.0       tools_4.4.0       gtable_0.3.5