Use https://bitbucket.org/nygcresearch/ldetect-data from Berisa and Pickrell (2016) - a list of independent LD regions for Africans, Europeans and Asians.
Note: The Africans dataset has a couple of None values that I am interpolating in a simple way to avoid errors.
a <- data.table::fread("https://bitbucket.org/nygcresearch/ldetect-data/raw/ac125e47bf7ff3e90be31f278a7b6a61daaba0dc/AFR/fourier_ls-all.bed")
midpoint <- round((108823642 + 111048570)/2)
a$stop[a$stop=="None"] <- midpoint
a$start[a$start=="None"] <- midpoint
a$start <- as.numeric(a$start)
a$stop <- as.numeric(a$stop)
a$pop <- "AFR"
b <- data.table::fread("https://bitbucket.org/nygcresearch/ldetect-data/raw/ac125e47bf7ff3e90be31f278a7b6a61daaba0dc/ASN/fourier_ls-all.bed")
b$pop <- "EAS"
c <- data.table::fread("https://bitbucket.org/nygcresearch/ldetect-data/raw/ac125e47bf7ff3e90be31f278a7b6a61daaba0dc/EUR/fourier_ls-all.bed")
c$pop <- "EUR"
ldetect <- dplyr::bind_rows(a,b,c)
# Avoid overlap between regions
ldetect$stop <- ldetect$stop - 1
This ldetect
object is saved as a data object in this
package.
For each region create an .rds
object that contains a
list of map
and ld
Create a generate_ldobj_config.json
file:
{
"dl_dir": "/path/to/downloads"
}
Setup directories
conf <- read_json("generate_ldobj_config.json")
dir.create(conf$dl_dir)
dir.create(file.path(conf$dl_dir, "EUR_1kg_hm3"))
dir.create(file.path(conf$dl_dir, "EAS_1kg_hm3"))
dir.create(file.path(conf$dl_dir, "AFR_1kg_hm3"))
setwd(conf$dl_dir)
Get the 1000 genomes files
wget -O 1kg.v3.tgz http://fileserve.mrcieu.ac.uk/ld/1kg.v3.tgz
tar xzvf 1kg.v3.tgz
rm 1kg.v3.tgz
wget https://github.com/MRCIEU/gwasglue/raw/master/inst/hapmap3/hapmap3_autosome.snplist.gz
gunzip hapmap3_autosome.snplist.gz
plink --bfile EUR --extract hapmap3_autosome.snplist --make-bed --keep-allele-order --out EUR_1kg_hm3
plink --bfile EAS --extract hapmap3_autosome.snplist --make-bed --keep-allele-order --out EAS_1kg_hm3
plink --bfile AFR --extract hapmap3_autosome.snplist --make-bed --keep-allele-order --out AFR_1kg_hm3
Generate matrices for each population
data(ldetect)
map_eas <- generate_ldobj(
outdir=file.path(conf$dl_dir, "EAS_1kg_hm3"),
bfile=file.path(conf$dl_dir, "EAS_1kg_hm3"),
regions=subset(ldetect, pop=="EAS"),
nthreads=16
)
map_afr <- generate_ldobj(
outdir=file.path(conf$dl_dir, "AFR_1kg_hm3"),
bfile=file.path(conf$dl_dir, "AFR_1kg_hm3"),
regions=subset(ldetect, pop=="AFR"),
nthreads=16
)
map_eur <- generate_ldobj(
outdir=file.path(conf$dl_dir, "EUR_1kg_hm3"),
bfile=file.path(conf$dl_dir, "EUR_1kg_hm3"),
regions=subset(ldetect, pop=="EUR"),
nthreads=16
)
Package them up
Using just one region with just one causal variant. Read in a regional LD matrix
set.seed(1234)
fn <- list.files(file.path(conf$dl_dir, "EAS_1kg_hm3"), full.names=TRUE) %>%
grep("ldobj_chr", ., value=TRUE) %>%
{.[7]}
ldobj_eas <- readRDS(fn)
Generate the LD-aware effects from a single causal variant
params <- ldobj_eas$map %>%
generate_gwas_params(h2=0.01, Pi=1/nrow(.)) %>%
add_ld_to_params(ldobj=ldobj_eas)
Add some random noise for a sample size of 100000 and plot
ss <- params %>%
generate_gwas_ss(100000)
ggplot(ss, aes(x=pos, y=-log10(pval))) +
geom_point()
Now try whole genome with 100 causal variants - from files - takes less than 2 minutes for HapMap3 with 1 thread
# Generate effects
params <- map_eas %>%
generate_gwas_params(h2=0.01, Pi=100/nrow(.)) %>%
add_ld_to_params(ldobjdir = file.path(conf$dl_dir, "EAS_1kg_hm3"), nthreads=16)
# Generate sample estimates
ss <- params %>%
generate_gwas_ss(10000000)
# Plot
ggplot(ss, aes(x=pos, y=-log10(pval))) +
geom_point() +
facet_grid(. ~ chr, scale="free_x", space="free_x")