2024-08-22-parse-sample-sizes

Author

Gibran Hemani

Published

September 22, 2024

Background

library(dplyr)

Attaching package: 'dplyr'
The following objects are masked from 'package:stats':

    filter, lag
The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union
#' Sum the sample sizes reported in a free-text sample description
#'
#' The description is split into components on ", ". If any component
#' contains the word "cases", only those components are counted (so a
#' component such as "360 cases and controls" is included); otherwise every
#' component is counted, on the assumption that the trait is continuous.
#' From each counted component the first whitespace-separated token that
#' parses as a number is taken, and these numbers are summed.
#'
#' @param x A character string describing the samples, e.g.
#'   "6,524 cases, 24,001 controls". Only the first element of `x` is parsed.
#' @return A single numeric value: the summed sample size, or 0 when `x` is
#'   empty or no counted component contains a numeric token.
fn <- function(x) {
    # Nothing to parse: report zero samples rather than failing on `[[1]]`.
    if (length(x) == 0) {
        return(0)
    }

    # Remove thousands separators, i.e. a comma sitting directly between
    # digits ("25,453" -> "25453"). Commas followed by a space are untouched.
    cleaned <- gsub("(\\d+),(?=\\d+)", "\\1", x, perl = TRUE)

    # Split the description into its sample components on ", ".
    parts <- strsplit(cleaned, ", ")[[1]]

    # Keep components that contain the word 'cases'.
    selected <- grep("cases", parts, value = TRUE)
    # If none, probably a continuous trait: count every component instead.
    if (length(selected) == 0) {
        selected <- parts
    }

    # First numeric token of each component (NA when it has none).
    # vapply() guarantees a numeric vector even for zero components, where
    # sapply() would hand sum() an empty list and make it error.
    counts <- vapply(selected, function(component) {
        tokens <- strsplit(component, " ")[[1]]
        # Most tokens are words, so the "NAs introduced by coercion" warning
        # is expected here; nothing else is silenced.
        values <- suppressWarnings(as.numeric(tokens))
        values <- values[!is.na(values)]
        if (length(values) == 0) NA_real_ else values[[1]]
    }, numeric(1), USE.NAMES = FALSE)

    sum(counts, na.rm = TRUE)
}

# Example: a case/control description. Only the components mentioning 'cases'
# are summed (25453 + 360 + 6524); the controls and the PMID entry are ignored.
x <- "25,453 European ancestry cases, 58,113 European ancestry controls, 360 cases and controls, PMID:25056061, 6,524 cases, 24,001 controls"
fn(x)
[1] 32337
# A component that mentions both cases and controls still matches 'cases',
# so its count is included in full.
x <- "360 cases and controls"
fn(x)
[1] 360

sessionInfo()
R version 4.4.1 (2024-06-14)
Platform: aarch64-apple-darwin20
Running under: macOS Sonoma 14.6.1

Matrix products: default
BLAS:   /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/lib/libRblas.0.dylib 
LAPACK: /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/lib/libRlapack.dylib;  LAPACK version 3.12.0

locale:
[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8

time zone: Europe/London
tzcode source: internal

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
[1] dplyr_1.1.4

loaded via a namespace (and not attached):
 [1] digest_0.6.35     utf8_1.2.4        R6_2.5.1          fastmap_1.2.0    
 [5] tidyselect_1.2.1  xfun_0.44         magrittr_2.0.3    glue_1.7.0       
 [9] tibble_3.2.1      knitr_1.47        pkgconfig_2.0.3   htmltools_0.5.8.1
[13] rmarkdown_2.27    generics_0.1.3    lifecycle_1.0.4   cli_3.6.2        
[17] fansi_1.0.6       vctrs_0.6.5       compiler_4.4.1    tools_4.4.1      
[21] pillar_1.9.0      evaluate_0.23     yaml_2.3.8        rlang_1.1.3      
[25] jsonlite_1.8.8    htmlwidgets_1.6.4