library(simulateGP)
library(TwoSampleMR)
#> TwoSampleMR version 0.5.10 
#> [>] New: Option to use non-European LD reference panels for clumping etc
#> [>] Some studies temporarily quarantined to verify effect allele
#> [>] See news(package='TwoSampleMR') and https://gwas.mrcieu.ac.uk for further details
#> 
#> Warning:
#> You are running an old version of the TwoSampleMR package.
#> This version:   0.5.10
#> Latest version: 0.5.11
#> Please consider updating using remotes::install_github('MRCIEU/TwoSampleMR')
#> 
#> Attaching package: 'TwoSampleMR'
#> The following objects are masked from 'package:simulateGP':
#> 
#>     allele_frequency, contingency, get_population_allele_frequency

A convenient approach to performing Mendelian randomisation is to use GWAS summary data, often known as summary MR or 2-sample MR. This figure outlines the approach:

2smr

In this vignette we simulate some examples of how to simulate the underlying GWAS summary data for 2-sample MR.

Generating summary data from individual level data

Here we want to achieve the following:

  1. Simulate some genetic or confounding variables
  2. Simulate exposures that are influenced by (1)
  3. Simulate the outcomes that are influenced by (1) and (2)
  4. Get the SNP-exposure and SNP-outcome effects and standard errors for these simulations, for use in further analyses e.g. using the TwoSampleMR package

Here is how to do 1-3:

# Genotypes for 20 SNPs and 10000 individuals, where the SNPs have MAF = 0.5:
g <- make_geno(10000, 20, 0.5)

# These SNPs instrument some exposure, and together explain 5% of the variance
effs <- choose_effects(20, 0.05)

# Create X
x <- make_phen(effs, g)

# Check that the SNPs explain 5% of the variance in x
sum(cor(x, g)^2)
#> [1] 0.05320766

# Create Y, which is negatively influenced by x, explaining 10% of the variance in Y#
# so if the variances of x and y are both 1 then the effect size is about -0.31
beta_xy <- -sqrt(0.1)
y <- make_phen(beta_xy, x)

We now have an X and Y, and the genotypes. To perform 2-sample MR on this we need to obtain the summary data.

dat <- get_effs(x, y, g)

Perform MR

TwoSampleMR::mr(dat)
#> Analysing 'X' on 'Y'
#>   id.exposure id.outcome outcome exposure                    method nsnp
#> 1           X          Y       Y        X                  MR Egger   20
#> 2           X          Y       Y        X           Weighted median   20
#> 3           X          Y       Y        X Inverse variance weighted   20
#> 4           X          Y       Y        X               Simple mode   20
#> 5           X          Y       Y        X             Weighted mode   20
#>            b         se         pval
#> 1 -0.1191640 0.08034709 1.553421e-01
#> 2 -0.2402189 0.06345669 1.533651e-04
#> 3 -0.2607820 0.04651527 2.066081e-08
#> 4 -0.3590511 0.08356491 3.893170e-04
#> 5 -0.2385588 0.05929477 7.265316e-04

The effect size is approx -0.31 as we simulated.

More complex example

Let’s simulate an instance where there are two traits that have different effects on the outcome, but they share some of the same instruments

# Simulate 100 genotypes
g <- make_geno(100000, 80, 0.5)

# Choose effect sizes for instruments for each trait
effs1 <- choose_effects(50, 0.05)
effs2 <- choose_effects(50, 0.05)

# Create X1 and X2, where they overlap some variants
x1 <- make_phen(effs1, g[,1:50])
x2 <- make_phen(effs2, g[,31:80])

# Create Y - x1 has a -0.3 influence on it and x2 has a +0.3 influence on it
y <- make_phen(c(-0.3, 0.3), cbind(x1, x2))

# Perform separate MR on each
dat1 <- get_effs(x1, y, g)
dat2 <- get_effs(x2, y, g)

TwoSampleMR::mr(subset(dat1, pval.exposure < 5e-8))
#> Analysing 'X' on 'Y'
#>   id.exposure id.outcome outcome exposure                    method nsnp
#> 1           X          Y       Y        X                  MR Egger   32
#> 2           X          Y       Y        X           Weighted median   32
#> 3           X          Y       Y        X Inverse variance weighted   32
#> 4           X          Y       Y        X               Simple mode   32
#> 5           X          Y       Y        X             Weighted mode   32
#>            b         se         pval
#> 1 -0.1873725 0.07415657 1.702220e-02
#> 2 -0.2560702 0.02451385 1.529361e-25
#> 3 -0.2228823 0.02767159 7.976943e-16
#> 4 -0.2715727 0.04546517 1.324346e-06
#> 5 -0.2715727 0.04066437 1.809455e-07
TwoSampleMR::mr(subset(dat2, pval.exposure < 5e-8))
#> Analysing 'X' on 'Y'
#>   id.exposure id.outcome outcome exposure                    method nsnp
#> 1           X          Y       Y        X                  MR Egger   32
#> 2           X          Y       Y        X           Weighted median   32
#> 3           X          Y       Y        X Inverse variance weighted   32
#> 4           X          Y       Y        X               Simple mode   32
#> 5           X          Y       Y        X             Weighted mode   32
#>           b         se         pval
#> 1 0.2763200 0.08400630 2.570070e-03
#> 2 0.2973715 0.02535335 9.043432e-32
#> 3 0.2516120 0.03440694 2.615787e-13
#> 4 0.3245394 0.03784075 1.096375e-09
#> 5 0.3087809 0.02339750 2.914096e-14

# Do multivariable MR
# First get the effects for x1, x2 and y, and put them in mv format
mvdat <- make_mvdat(list(x1, x2), y, g)
#> There are 2 exposures
#> Harmonising x1 (x1) and y (y)

# Perform MV MR
TwoSampleMR::mv_multiple(mvdat)
#> $result
#>   id.exposure exposure id.outcome outcome nsnp          b         se
#> 1          x1       x1          y       y   32 -0.2796164 0.01272162
#> 2          x2       x2          y       y   32  0.2961061 0.01287924
#>            pval
#> 1 4.512174e-107
#> 2 5.741421e-117

Generating summary data directly

For large scale analyses it is more convenient to just generate the summary data directly. For this we need to specify the genotype effects on \(x\), determine the standard errors expected based on the sample size, and then determine the genotype effect on \(y\) according to causal effects of \(x\) and pleiotropic paths, and determine the sample sizes for \(y\) also. We can additionally specify the extent of sample overlap.

No sample overlap case

  1. Choose true effects of instruments on \(x\)
  2. Generate summary data for \(x\)
  3. Determine effects of instruments on \(y\)
  4. Generate summary data for \(y\)
  5. Perform two-sample MR

Choose some number of SNPs and the effect allele frequencies and the variance explained in \(x\) by the SNPs

nsnp <- 100
af <- runif(100)
rsq_gx <- 0.4

Generate some effects that correspond to these values


b_gx <- dplyr::tibble(af=af, snp=1:nsnp) %>%
    generate_gwas_params(h2=rsq_gx)

Generate sample effects for 1 million samples

nid_x <- 1000000
bhat_gx <- generate_gwas_ss(b_gx, nid=nid_x)

Assume \(x\) has an effect on \(y\) of -0.3

b_xy <- -0.3

Simulate some balanced pleiotropy - assume that 50% of the instruments have a direct effect on \(y\) and they explain 2% of the variance through this path

plei_index <- sample(1:nsnp, nsnp*0.5)
b_gy_plei <- dplyr::tibble(af=af[plei_index], snp=plei_index) %>%
    generate_gwas_params(h2=0.02)

Sum the genetic effects on \(y\)

b_gy <- b_gx
b_gy$beta <- b_gy$beta * b_xy
b_gy$beta[plei_index] <- b_gy$beta[plei_index] + b_gy_plei$beta

Generate summary data for g-y associations, assuming sample size of 800000

nid_y <- 800000
bhat_gy <- generate_gwas_ss(b_gy, nid=nid_y)

Perform MR

dat <- merge_exp_out(bhat_gx, bhat_gy)
TwoSampleMR::mr(dat)
#> Analysing 'X' on 'Y'
#>   id.exposure id.outcome outcome exposure                    method nsnp
#> 1           X          Y       Y        X                  MR Egger  100
#> 2           X          Y       Y        X           Weighted median  100
#> 3           X          Y       Y        X Inverse variance weighted  100
#> 4           X          Y       Y        X               Simple mode  100
#> 5           X          Y       Y        X             Weighted mode  100
#>            b          se         pval
#> 1 -0.2420559 0.040902499 4.803362e-08
#> 2 -0.3001481 0.004199798 0.000000e+00
#> 3 -0.2818603 0.022631250 1.322205e-35
#> 4 -0.2887265 0.016955286 3.532470e-31
#> 5 -0.2887265 0.016077724 6.554287e-33

Sample overlap case

It is of course possible to simulate varying degrees of sample overlap by simulating two individual-level datasets with the same underlying DAG, and then selecting which individuals contribute to the exposure and outcome effects e.g.

nid <- 10000
nsnp <- 100
g <- make_geno(nid, nsnp, 0.5)
u <- rnorm(nid)
beta_gx <- choose_effects(nsnp, 0.05)
x <- make_phen(c(beta_gx, 0.1), cbind(g, u))
y <- make_phen(c(0.3, 0.1), cbind(x, u))

Complete sample overlap

dat <- get_effs(x, y, g)

No sample overlap

id1 <- 1:(nid/2)
id2 <- (nid/2+1):nid
exp <- gwas(x[id1], g[id1,])
out <- gwas(y[id2], g[id2,])
dat <- merge_exp_out(exp, out)

Partial sample overlap

id1 <- 1:round(nid/1.5)
id2 <- round(nid/1.5+1):nid
exp <- gwas(x[id1], g[id1,])
out <- gwas(y[id2], g[id2,])
dat <- merge_exp_out(exp, out)

An approach to doing this directly, bypassing simulation of individuals, is also implemented in the summary_set function e.g.

nsnp <- 100
af <- runif(nsnp, 0.01, 0.99)
beta_gx <- dplyr::tibble(af=af, snp=1:nsnp) %>%
    generate_gwas_params(h2=0.5)
beta_gy <- beta_gx
beta_gy$beta <- beta_gy$beta * 0.3

dat <- summary_set(
    beta_gx = beta_gx$beta,
    beta_gy = beta_gy$beta,
    af = beta_gx$af,
    n_gx = 10000,
    n_gy = 10000,
    n_overlap = 10000,
    cor_xy = 0.6,
    prev_y = NA,
    sigma_x = 1,
    sigma_y = 1
)

dat %>%
    dplyr::filter(pval.exposure < 5e-8) %>%
    TwoSampleMR::mr(., method_list="mr_ivw")
#> Analysing 'X' on 'Y'
#>   id.exposure id.outcome outcome exposure                    method nsnp
#> 1           X          Y       Y        X Inverse variance weighted   39
#>           b        se         pval
#> 1 0.2751496 0.0152435 7.848903e-73