Simulate trait clusters

Author

Gibran Hemani

Published

May 15, 2025

Background

Need a function that will create GWAS summary statistics for multiple traits - There is a set of underlying latent traits - There may be pleiotropic or causal relationships between the latent traits - A number of observed traits arise from each latent trait

Sim

library(simulateGP)
library(GWASBrewer)
library(dplyr)

Attaching package: 'dplyr'
The following objects are masked from 'package:stats':

    filter, lag
The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union
library(ggplot2)

G_from_df <- function(df) {
    stopifnot(inherits(df, "data.frame"))
    stopifnot(nrow(df) >0)
    stopifnot(all(c("i", "j", "eff") %in% names(df)))
    nodes <- unique(c(df$i, df$j))
    message(nrow(df), " causal relationships specified amongst ", length(nodes), " traits")
    G <- matrix(0, length(nodes), length(nodes))
    rownames(G) <- nodes
    colnames(G) <- nodes
    for(x in 1:nrow(df)) {
        G[df$i[x], df$j[x]] <- df$eff[x]
    }
    return(G)
}


#' @param latent_causal_matrix For L latent traits, the causal relationships between them. Should be a DAG
#' @param pleio_min Mimumum fraction of h2 due to pleiotropy. How deviant should proximal traits be to the underlying latent trait? Proximal traits will have the same h2 as the latent trait, but some fraction of that h2 will be from other variants. 
#' @param pleio_max Maximum fraction of h2 due to pleiotropy
#' @param n_proximal_traits Vector of length L saying how many traits for each latent trait
#' @param nid Vector of size sum(n_proximal_traits). Sample size of each proximal trait
#' @param nsnp Vector of size L saying how many SNPs for each latent trait
#' @param h2 Vector of size L saying variance explained in proximal trait by SNPs
pie_hat_sim <- function(latent_causal_matrix, pleio_min, pleio_max, n_proximal_traits, nid, nsnp, h2) {
    n_latent <- nrow(latent_causal_matrix)
    a <- lapply(1:n_latent, \(trait) {
        latent_names <- unique(c(latent_causal_matrix$i, latent_causal_matrix$j))
        tibble(
            i = latent_names[trait], 
            j = paste0(latent_names[trait], 1:n_proximal_traits[trait]), 
            eff=runif(n_proximal_traits[trait], 1-pleio_max, 1-pleio_min),
            pi = 0.1,
            h2 = h2[trait]
        )
    }) %>% bind_rows()

    J <- sum(nsnp)
    latent_causal_matrix$pi <- nsnp / J
    latent_causal_matrix$h2 <- h2

    mat <- G_from_df(bind_rows(latent_causal_matrix, a))

    o <- sim_mv(
        G = mat,
        N = c(rep(500000, n_latent), nid), 
        J = sum(nsnp),
        h2 = c(latent_causal_matrix$h2, a$h2),
        pi = c(latent_causal_matrix$pi, a$pi),
        est_s = TRUE,
        af = function(n) {runif(n, 0.01, 0.99)}
    )
    dat <- lapply(1:ncol(o$beta_hat), \(i) {
        as_tibble(o$snp_info) %>%
        mutate(
            trait = colnames(o$direct_trait_effects)[i],
            bhat = o$beta_hat[,i],
            se = o$se_beta_hat[,i],
            pval = pnorm(abs(bhat)/se, low=F)
        )
    }) %>% bind_rows()
    return(dat)
}

plot_pair <- function(dats, exposure, outcome) {
    e <- subset(dats, pval < 5e-8 & trait == exposure)
    o <- subset(dats, SNP %in% e$SNP & trait == outcome)
    tibble(eb=e$bhat, ese=e$se, ob=o$bhat, ose=o$se) %>%
    ggplot(aes(x=eb, y=ob)) +
    geom_errorbar(aes(ymin=ob - ose * 1.96, ymax=ob+ose*1.96), colour="grey") +
    geom_errorbarh(aes(xmin=eb - ese * 1.96, xmax=eb+ese*1.96), colour="grey") +
    geom_smooth(method="lm") +
    geom_point()
}

Use the simulation function to generate a dataset of summary statistics for each latent trait plus all the proxy traits.

# Specify the causal relationship amongst the latent traits
# Every latent trait must be included in this matrix
# If a latent trait isn't causally related to anything else just give it a 0 causal effect e.g. for Z here
latent_causal_matrix <- dplyr::tribble(
    ~i, ~j, ~eff,
    "Y", "X", 0.25,
    "X", "A", 0.24,
    "Y", "A", 0.34,
    "Z", "A", 0
)
dat <- pie_hat_sim(
    latent_causal_matrix,
    pleio_min = 0.1,
    pleio_max = 0.2,
    n_proximal_traits = c(3, 5, 10, 8),
    nid = runif(3+5+10+8, 50000, 150000), 
    nsnp = c(200, 250, 300, 200), 
    h2 = c(0.08, 0.1, 0.1, 0.07)
)
30 causal relationships specified amongst 30 traits
SNP effects provided for 950 SNPs and 30 traits.
str(dat)
tibble [28,500 × 6] (S3: tbl_df/tbl/data.frame)
 $ SNP  : int [1:28500] 1 2 3 4 5 6 7 8 9 10 ...
 $ AF   : num [1:28500] 0.054 0.647 0.505 0.827 0.845 ...
 $ trait: chr [1:28500] "Y" "Y" "Y" "Y" ...
 $ bhat : num [1:28500] 0.00258 0.0179 -0.00121 -0.00651 -0.00249 ...
 $ se   : num [1:28500] 0.00443 0.00209 0.002 0.00264 0.00276 ...
 $ pval : num [1:28500] 2.80e-01 5.92e-18 2.73e-01 6.83e-03 1.84e-01 ...

e.g. Plot the betas for one proximal trait against another

plot_pair(dat, "Y1", "A2")
`geom_smooth()` using formula = 'y ~ x'

Plot for different proximal traits for the same latent trait

plot_pair(dat, "Z1", "Z8")
`geom_smooth()` using formula = 'y ~ x'

Simulate again but with no pleiotropy

latent_causal_matrix <- dplyr::tribble(
    ~i, ~j, ~eff,
    "Y", "X", 0.25,
    "X", "A", 0.24,
    "Y", "A", 0.34
)
dat <- pie_hat_sim(
    latent_causal_matrix,
    pleio_min = 0,
    pleio_max = 0,
    n_proximal_traits = c(3, 5, 10),
    nid = runif(3+5+10, 50000, 150000), 
    nsnp = c(200, 250, 300), 
    h2 = c(0.08, 0.1, 0.1)
)
21 causal relationships specified amongst 21 traits
SNP effects provided for 750 SNPs and 21 traits.
plot_pair(dat, "Y1", "Y3")
`geom_smooth()` using formula = 'y ~ x'


sessionInfo()
R version 4.5.0 (2025-04-11)
Platform: aarch64-apple-darwin20
Running under: macOS Sonoma 14.6.1

Matrix products: default
BLAS:   /Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/lib/libRblas.0.dylib 
LAPACK: /Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/lib/libRlapack.dylib;  LAPACK version 3.12.1

locale:
[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8

time zone: Europe/London
tzcode source: internal

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
[1] ggplot2_3.5.2         dplyr_1.1.4           GWASBrewer_0.3.0.0233
[4] simulateGP_0.1.3     

loaded via a namespace (and not attached):
 [1] Matrix_1.7-3       gtable_0.3.6       jsonlite_2.0.0     compiler_4.5.0    
 [5] tidyselect_1.2.1   Rcpp_1.0.14        tidyr_1.3.1        splines_4.5.0     
 [9] scales_1.4.0       yaml_2.3.10        fastmap_1.2.0      lattice_0.22-7    
[13] R6_2.6.1           labeling_0.4.3     generics_0.1.3     knitr_1.50        
[17] htmlwidgets_1.6.4  MASS_7.3-65        tibble_3.2.1       pillar_1.10.2     
[21] RColorBrewer_1.1-3 rlang_1.1.6        xfun_0.52          cli_3.6.5         
[25] withr_3.0.2        magrittr_2.0.3     mgcv_1.9-3         digest_0.6.37     
[29] grid_4.5.0         nlme_3.1-168       lifecycle_1.0.4    vctrs_0.6.5       
[33] evaluate_1.0.3     glue_1.8.0         farver_2.1.2       rmarkdown_2.29    
[37] purrr_1.0.4        tools_4.5.0        pkgconfig_2.0.3    htmltools_0.5.8.1