2025-01-21-clustering-prs

Author

Gibran Hemani

Published

March 20, 2025

Background

If a trait is a composite of different sub-traits then is it possible to recover those sub-traits by identifying clustering variants into scores and then clustering scores?

library(dplyr)

Attaching package: 'dplyr'
The following objects are masked from 'package:stats':

    filter, lag
The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union
library(simulateGP)


n <- 100000
nclust <- 5
nsnp_per_trait <- 10
ntrait_per_clust <- 10

make_cluster <- function(n, nsnp_per_trait, ntrait_per_clust) {
    
    G <- list()
    for(i in 1:ntrait_per_clust) {
        g <- matrix(rbinom(n * nsnp_per_trait, 2, 0.3), nrow=n)
        G[[i]] <- g
        trait <- scale(g) %*% rnorm(nsnp_per_trait)
        if(i == 1) {
            trait_all <- trait
        } else {
            trait_all <- cbind(trait_all, trait)
        }
    }
    G <- do.call(cbind, G)

    return(list(G=G, T=trait_all))
}

make_disease <- function(n, nclust, nsnp_per_trait, ntrait_per_clust) {
    
    L <- list()
    for(i in 1:nclust) {
        L[[i]] <- make_cluster(n, nsnp_per_trait, ntrait_per_clust)
    }
    
    T <- lapply(L, \(x) x$T) %>% do.call(cbind, .)
    D <- tibble(
        liability = as.matrix(T) %*% rnorm(ncol(T)) %>% drop(),
        disease = rbinom(n, 1, plogis(liability))
    )
    G <- lapply(L, \(x) x$G) %>% do.call(cbind, .)
    return(list(G=G, T=T, D=D))
}

gwas <- function(y, g) {
    (cov(y, g) / apply(g, 2, var)) %>% drop()
}

make_effects_matrix <- function(dat) {
    o <- lapply(1:ncol(dat$T), \(i) {
        simulateGP::gwas(dat$T[,i], dat$G)$fval
    }) %>% do.call(cbind, .)
    return(o)
}
dat <- make_disease(n, nclust, nsnp_per_trait, ntrait_per_clust=1)
effects_matrix <- make_effects_matrix(dat)

dim(effects_matrix)
dim(gwas(dat$D$disease, dat$G))
effects_matrix[1:5, 1:5]
o[1:5, 1:5]

a <- princomp(sqrt(effects_matrix))
s <- a$scores
s[s < 0] <- NA

plot(s[,1], s[,3], col=dat$D$disease+1)



str(dat)
table(dat$D$disease)

sessionInfo()
R version 4.4.3 (2025-02-28)
Platform: aarch64-apple-darwin20
Running under: macOS Sonoma 14.6.1

Matrix products: default
BLAS:   /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/lib/libRblas.0.dylib 
LAPACK: /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/lib/libRlapack.dylib;  LAPACK version 3.12.0

locale:
[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8

time zone: Europe/London
tzcode source: internal

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
[1] simulateGP_0.1.3 dplyr_1.1.4     

loaded via a namespace (and not attached):
 [1] digest_0.6.37     utf8_1.2.4        R6_2.5.1          fastmap_1.2.0    
 [5] tidyselect_1.2.1  xfun_0.48         magrittr_2.0.3    glue_1.8.0       
 [9] tibble_3.2.1      knitr_1.48        pkgconfig_2.0.3   htmltools_0.5.8.1
[13] rmarkdown_2.27    generics_0.1.3    lifecycle_1.0.4   cli_3.6.3        
[17] fansi_1.0.6       vctrs_0.6.5       compiler_4.4.3    tools_4.4.3      
[21] pillar_1.9.0      evaluate_1.0.1    yaml_2.3.10       rlang_1.1.4      
[25] jsonlite_1.8.9    htmlwidgets_1.6.4