Gibran Hemani’s lab book - Bootstrap p-values example

Background

Example of using bootstrapping to get p-values

This function

Simulates some data
Gets the linear regression p-value
Gets the bootstrap p-value

library(dplyr)


Attaching package: 'dplyr'

The following objects are masked from 'package:stats':

    filter, lag

The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union

library(purrr)

#' Simulate simple dataset and compare p-values from linear regression and bootstrap
#' 
#' @param n Number of observations
#' @param b True effect size
#' @param nboot Number of bootstrap samples
#' @param sim Simulation number
#' 
#' 
#' @return A tibble with p-values
boot_sim <- function(n, b, nboot, sim=1) {
    # Simple simulation of y being influenced by x
    x <- rnorm(n)
    y <- rnorm(n) + b * x

    # Linear model
    mod <- summary(lm(y ~ x))
    bhat <- mod$coefficients[2, 1]
    pval <- mod$coefficients[2, 4]

    # Get p-value using bootstrap
    # Generate a null distribution of the slope
    # Do this by resampling x
    bboot <- numeric(nboot)
    for(i in 1:nboot) {
        x1 <- sample(x)
        bboot[i] <- lm(y ~ x1)$coefficients[2]
    }

    # This is a non-parametric p-value
    pvalboot_nonpara <- sum(abs(bboot) > abs(b)) / nboot

    # This is a parametric p-value
    pvalboot_para <- pnorm(bhat, mean=mean(bboot), sd=sd(bboot), low=F)

    # Compare parametric pval, non para pval and linear model pval
    tibble(
        b=b,
        n=n,
        sim=sim,
        nboot=nboot,
        pval = pval,
        pvalboot_nonpara = pvalboot_nonpara,
        pvalboot_para = pvalboot_para
    )
}

Example run of this simulation

boot_sim(1000, 0.3, 1000)

# A tibble: 1 × 7
      b     n   sim nboot     pval pvalboot_nonpara pvalboot_para
  <dbl> <dbl> <dbl> <dbl>    <dbl>            <dbl>         <dbl>
1   0.3  1000     1  1000 1.50e-18                0      1.10e-18

To evaluate performance do it over a range of simulation values

param <- expand.grid(
    n = c(100, 1000, 10000),
    b = c(0.1, 0.3, 0.5),
    nboot = c(1000),
    sim=1:10
)

Run the simulation over the parameter grid

o <- pmap(param, boot_sim) %>% bind_rows()

Compare the p-values

plot(-log10(o$pval), -log10(o$pvalboot_para))
abline(0, 1)

sessionInfo()

R version 4.4.1 (2024-06-14)
Platform: aarch64-apple-darwin20
Running under: macOS Sonoma 14.6.1

Matrix products: default
BLAS:   /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/lib/libRblas.0.dylib 
LAPACK: /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/lib/libRlapack.dylib;  LAPACK version 3.12.0

locale:
[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8

time zone: Europe/London
tzcode source: internal

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
[1] purrr_1.0.2 dplyr_1.1.4

loaded via a namespace (and not attached):
 [1] digest_0.6.37     utf8_1.2.4        R6_2.5.1          fastmap_1.2.0    
 [5] tidyselect_1.2.1  xfun_0.48         magrittr_2.0.3    glue_1.8.0       
 [9] tibble_3.2.1      knitr_1.48        pkgconfig_2.0.3   htmltools_0.5.8.1
[13] rmarkdown_2.27    generics_0.1.3    lifecycle_1.0.4   cli_3.6.3        
[17] fansi_1.0.6       vctrs_0.6.5       compiler_4.4.1    tools_4.4.1      
[21] pillar_1.9.0      evaluate_1.0.1    yaml_2.3.10       rlang_1.1.4      
[25] jsonlite_1.8.9    htmlwidgets_1.6.4