Generating LD matrices • simulateGP

library(simulateGP)
library(jsonlite)
library(data.table)
library(tidyverse)

Obtain independent LD regions

Use https://bitbucket.org/nygcresearch/ldetect-data from Berisa and Pickrell (2016) - a list of independent LD regions for Africans, Europeans and Asians.

Note: The Africans dataset has a couple of None values that I am interpolating in a simple way to avoid errors.

a <- data.table::fread("https://bitbucket.org/nygcresearch/ldetect-data/raw/ac125e47bf7ff3e90be31f278a7b6a61daaba0dc/AFR/fourier_ls-all.bed")
midpoint <- round((108823642 + 111048570)/2)
a$stop[a$stop=="None"] <- midpoint
a$start[a$start=="None"] <- midpoint
a$start <- as.numeric(a$start)
a$stop <- as.numeric(a$stop)
a$pop <- "AFR"

b <- data.table::fread("https://bitbucket.org/nygcresearch/ldetect-data/raw/ac125e47bf7ff3e90be31f278a7b6a61daaba0dc/ASN/fourier_ls-all.bed")
b$pop <- "EAS"

c <- data.table::fread("https://bitbucket.org/nygcresearch/ldetect-data/raw/ac125e47bf7ff3e90be31f278a7b6a61daaba0dc/EUR/fourier_ls-all.bed")
c$pop <- "EUR"

ldetect <- dplyr::bind_rows(a,b,c)

# Avoid overlap between regions
ldetect$stop <- ldetect$stop - 1

This ldetect object is saved as a data object in this package.

Generate LD matrix objects

For each region create an .rds object that contains a list of map and ld

Create a generate_ldobj_config.json file:

{
    "dl_dir": "/path/to/downloads"
}

Setup directories

conf <- read_json("generate_ldobj_config.json")
dir.create(conf$dl_dir)
dir.create(file.path(conf$dl_dir, "EUR_1kg_hm3"))
dir.create(file.path(conf$dl_dir, "EAS_1kg_hm3"))
dir.create(file.path(conf$dl_dir, "AFR_1kg_hm3"))
setwd(conf$dl_dir)

Get the 1000 genomes files

wget -O 1kg.v3.tgz http://fileserve.mrcieu.ac.uk/ld/1kg.v3.tgz
tar xzvf 1kg.v3.tgz
rm 1kg.v3.tgz
wget https://github.com/MRCIEU/gwasglue/raw/master/inst/hapmap3/hapmap3_autosome.snplist.gz
gunzip hapmap3_autosome.snplist.gz
plink --bfile EUR --extract hapmap3_autosome.snplist --make-bed --keep-allele-order --out EUR_1kg_hm3
plink --bfile EAS --extract hapmap3_autosome.snplist --make-bed --keep-allele-order --out EAS_1kg_hm3
plink --bfile AFR --extract hapmap3_autosome.snplist --make-bed --keep-allele-order --out AFR_1kg_hm3

Generate matrices for each population

data(ldetect)

map_eas <- generate_ldobj(
    outdir=file.path(conf$dl_dir, "EAS_1kg_hm3"), 
    bfile=file.path(conf$dl_dir, "EAS_1kg_hm3"), 
    regions=subset(ldetect, pop=="EAS"),
    nthreads=16
)

map_afr <- generate_ldobj(
    outdir=file.path(conf$dl_dir, "AFR_1kg_hm3"), 
    bfile=file.path(conf$dl_dir, "AFR_1kg_hm3"), 
    regions=subset(ldetect, pop=="AFR"),
    nthreads=16
)

map_eur <- generate_ldobj(
    outdir=file.path(conf$dl_dir, "EUR_1kg_hm3"), 
    bfile=file.path(conf$dl_dir, "EUR_1kg_hm3"), 
    regions=subset(ldetect, pop=="EUR"),
    nthreads=16
)

Package them up

cmd <- paste0("cd ", conf$dl_dir, "; tar cvf EUR_1kg_hm3_ldobj.tar EUR_1kg_hm3")
system(cmd)

cmd <- paste0("cd ", conf$dl_dir, "; tar cvf EAS_1kg_hm3_ldobj.tar EAS_1kg_hm3")
system(cmd)

cmd <- paste0("cd ", conf$dl_dir, "; tar cvf AFR_1kg_hm3_ldobj.tar AFR_1kg_hm3")
system(cmd)

Try it out for generating summary data

Using just one region with just one causal variant. Read in a regional LD matrix

set.seed(1234)
fn <- list.files(file.path(conf$dl_dir, "EAS_1kg_hm3"), full.names=TRUE) %>% 
    grep("ldobj_chr", ., value=TRUE) %>% 
    {.[7]}
ldobj_eas <- readRDS(fn)

Generate the LD-aware effects from a single causal variant

params <- ldobj_eas$map %>%
    generate_gwas_params(h2=0.01, Pi=1/nrow(.)) %>%
    add_ld_to_params(ldobj=ldobj_eas)

Add some random noise for a sample size of 100000 and plot

ss <- params %>%
    generate_gwas_ss(100000)

ggplot(ss, aes(x=pos, y=-log10(pval))) +
geom_point()

Now try whole genome with 100 causal variants - from files - takes less than 2 minutes for HapMap3 with 1 thread

# Generate effects
params <- map_eas %>% 
    generate_gwas_params(h2=0.01, Pi=100/nrow(.)) %>%
    add_ld_to_params(ldobjdir = file.path(conf$dl_dir, "EAS_1kg_hm3"), nthreads=16)

# Generate sample estimates
ss <- params %>%
    generate_gwas_ss(10000000)

# Plot
ggplot(ss, aes(x=pos, y=-log10(pval))) +
geom_point() +
facet_grid(. ~ chr, scale="free_x", space="free_x")