Gibran Hemani’s lab book - Heterogeneity in cell-type interaction model

Background

Two models

Estimate mQTL in purified cells. Marginal effects obtained per cell type. Can estimate heterogeneity between of mQTL effect estimates across cell types
Estimate G x celltype proportion interaction for each celltype. Interaction is a deviation fron mean effect. Quite a complex model Can we just estimate heterogeneity between interaction terms?

library(ggplot2)
library(dplyr)


Attaching package: 'dplyr'

The following objects are masked from 'package:stats':

    filter, lag

The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union

sim <- function(nc, n, betas = runif(nc, -2, 2)) {
    g <- rbinom(n, 2, 0.4)
    
    m <- sapply(1:nc, function(i)
    {
        g * betas[i] + rnorm(n)
    })
    # for each individual sample cell type proportions
    cellprop <- sapply(1:n, function(x) {a <- runif(nc); a/sum(a)}) %>% t()
    # weighted sum
    M <- (scale(m) * cellprop) %>% rowSums

    res <- lapply(1:nc, function(i)
    {
        o <- summary(lm(M ~ g * cellprop[,i]))
        tibble(cell=i, b=o$coef[4,1], se=o$coef[4,2], pval=o$coef[4,4], method="interaction")
    }) %>% bind_rows()

    res2 <- lapply(1:nc, function(i)
    {
        o <- summary(lm(m[,i] ~ g))
        tibble(cell=i, b=o$coef[2,1], se=o$coef[2,2], pval=o$coef[2,4], method="marginal")
    }) %>% bind_rows()

    betas <- tibble(cell=1:nc, b=betas, method="true")
    

    # cellprop <- as_tibble(cellprop)
    # m <- as_tibble(m)
    # names(m) <- paste0("m", 1:nc)
    # names(cellprop) <- paste0("cp", 1:nc)
    # cellprop$M <- M
    # cellprop$g <- g

    # cellprop <- bind_cols(cellprop, m)

    # return(tibble(int=res, marg=res2, betas))

    # heterogeneity
    het <- bind_rows(
        fixed_effects_meta_analysis(res$b, res$se) %>% 
            as_tibble() %>%
            slice_head(n=1) %>%
            mutate(method="interaction"),
        fixed_effects_meta_analysis(res2$b, res2$se) %>% 
            as_tibble() %>%
            slice_head(n=1) %>%
            mutate(method="marginal"),
    )

    effs <- bind_rows(res, res2, betas)
    return(list(effs=effs, het=het))
}

# Provide vector of interaction betas and their standard errors
fixed_effects_meta_analysis <- function(beta_vec, se_vec) {
    w <- 1 / se_vec^2
    beta <- sum(beta_vec * w, na.rm=T) / sum(w, na.rm=T)
    se <- sqrt(1 / sum(w, na.rm=T))
    pval <- pnorm(abs(beta / se), lower.tail = FALSE)
    Qj <- w * (beta-beta_vec)^2
    Q <- sum(Qj, na.rm=T)
    Qdf <- sum(!is.na(beta_vec))-1
    if(Qdf == 0) Q <- 0
    Qjpval <- pchisq(Qj, 1, lower.tail=FALSE)
    Qpval <- pchisq(Q, Qdf, lower.tail=FALSE)
    pv <- pnorm(abs(beta_vec)/se_vec, lower.tail=FALSE)
    min_pv <- min(pv, na.rm=T)
    return(list(beta=beta, se=se, Q=Q, Qdf=Qdf, Qpval=Qpval, Qj=Qj, Qjpval=Qjpval, min_pv=min_pv))
}

param <- expand.grid(
    nc=5,
    n=seq(100,1000, by=50),
    bsd=0.4,
    nsim=1:100
)
dim(param)

[1] 1900    4

r <- lapply(1:nrow(param), function(i) {
    o <- sim(param$nc[i], param$n[i], rnorm(param$nc[i], sd=param$bsd[i]))
    bind_rows(
        param[i,] %>% mutate(Qpval=o$het$Qpval[1], method="interaction"),
        param[i,] %>% mutate(Qpval=o$het$Qpval[2], method="marginal")
    )
}) %>% bind_rows()

r %>% group_by(n, method) %>%
    summarise(pow=sum(Qpval < 0.05)/n()) %>%
    ggplot(., aes(x=n, y=pow)) +
    geom_point(aes(colour=method)) +
    geom_line(aes(colour=method))

`summarise()` has grouped output by 'n'. You can override using the `.groups`
argument.

ggplot(r, aes(x=as.factor(n), y=-log10(Qpval))) +
    geom_boxplot(aes(colour=method))

Summary

Heterogeneity estimate seems ok
Interaction model is very low power compared to marginal cell estimates

Ignore

d <- sim(2, 1000)

sim(5, 1000, b=rnorm(5, sd=0.4))
sim(2, 10000, c(1,2))


n <- 10000
nc <- 2
betas <- c(1,2)
g <- rbinom(n, 2, 0.4)

m <- sapply(1:nc, function(i) {
    g * betas[i] + rnorm(n)
})
# for each individual sample cell type proportions
cp <- sapply(1:n, function(x) {a <- runif(nc); a/sum(a)}) %>% t()
# weighted sum
M <- (scale(m) * cp) %>% rowSums

cov(M, g * cp[,1]) / var(g*cp[,1])

summary(lm(M ~ g*cp[,1]))

summary(lm(M ~ g))
cov(M, g) / var(g)

summary(lm(M ~ cp[,1]))
cov(M, cp[,1]) / var(cp[,1])

summary(lm(M ~ g*cp[,1]))
cov(M, g*cp[,1]) / var(g*cp[,1])

Mr <- residuals(lm(M ~ g+cp[,1]))
summary(lm(Mr ~ g:cp[,1]))
cov(Mr, g*cp[,1]) / var(g*cp[,1])




summary(lm(M ~ g:cp[,1]))
cov(M, cp[,1]) / var(cp[,1])


summary(lm(M ~ g * cp1, d))
summary(lm(m1 ~ g, d))

sim2 <- function(nc, n, betas = runif(nc, -2, 2)) {
    g <- rbinom(n, 2, 0.4)
    
    m <- sapply(1:nc, function(i)
    {
        g * betas[i] + rnorm(n)
    })
    # for each individual sample cell type proportions
    cellprop <- sapply(1:n, function(x) {a <- runif(nc); a/sum(a)}) %>% t()
    # weighted sum
    M <- (scale(m) * cellprop) %>% rowSums

    res <- lapply(1:nc, function(i)
    {
        o <- summary(lm(M ~ g * cellprop[,i]))
        tibble(cell=i, b=o$coef[4,1], se=o$coef[4,2], pval=o$coef[4,4], method="interaction")
    }) %>% bind_rows()

    res2 <- lapply(1:nc, function(i)
    {
        o <- summary(lm(m[,i] ~ g))
        tibble(cell=i, b=o$coef[2,1], se=o$coef[2,2], pval=o$coef[2,4], method="marginal")
    }) %>% bind_rows()

    betas <- tibble(cell=1:nc, b=betas, method="true")
    

    cellprop <- as_tibble(cellprop)
    m <- as_tibble(m)
    names(m) <- paste0("m", 1:nc)
    names(cellprop) <- paste0("cp", 1:nc)
    cellprop$M <- M
    cellprop$g <- g

    cellprop <- bind_cols(cellprop, m)
    return(cellprop)
}

ana <- function(d) {
    nc <- length(grep("cp", names(d)))

    # per cell estimates
    o1 <- lapply(1:nc, \(x) { 
        f <- paste0("M ~ g * cp", x)
        o <- summary(lm(f, data=d))$coef
        n <- rownames(o)
        o <- as_tibble(o)
        names(o) <- c("b", "se", "tval", "pval")
        o$term <- n
        o$cell <- x
        o$mod <- "interaction"
        o
    }) %>% bind_rows()

    # cell int estimates
    o2 <- lapply(1:nc, \(x) { 
        f <- paste0("m", x, " ~ g")
        o <- summary(lm(f, data=d))$coef
        n <- rownames(o)
        o <- as_tibble(o)
        names(o) <- c("b", "se", "tval", "pval")
        o$term <- n
        o$cell <- x
        o$mod <- "marginal"
        o
    }) %>% bind_rows()
    bind_rows(o1, o2)
}
d <- sim2(2, 1000)

Warning: The `x` argument of `as_tibble.matrix()` must have unique column names if
`.name_repair` is omitted as of tibble 2.0.0.
ℹ Using compatibility `.name_repair`.

ana(d) %>% as.data.frame

              b         se        tval          pval        term cell
1   0.465602983 0.06112157   7.6176547  5.994846e-14 (Intercept)    1
2  -0.685796058 0.05872337 -11.6784187  1.268203e-29           g    1
3   0.513953777 0.11109149   4.6264011  4.211167e-06         cp1    1
4  -0.491006953 0.10538418  -4.6592094  3.604804e-06       g:cp1    1
5   0.979556760 0.06188496  15.8286714  1.658308e-50 (Intercept)    2
6  -1.176803011 0.05829833 -20.1858778  3.135017e-76           g    2
7  -0.513953777 0.11109149  -4.6264011  4.211167e-06         cp2    2
8   0.491006953 0.10538418   4.6592094  3.604804e-06       g:cp2    2
9   0.021219262 0.04455143   0.4762869  6.339742e-01 (Intercept)    1
10 -1.814232687 0.04290938 -42.2805626 1.112888e-224           g    1
11  0.009644626 0.04697208   0.2053268  8.373587e-01 (Intercept)    2
12 -0.824782042 0.04524081 -18.2309294  2.550133e-64           g    2
           mod
1  interaction
2  interaction
3  interaction
4  interaction
5  interaction
6  interaction
7  interaction
8  interaction
9     marginal
10    marginal
11    marginal
12    marginal

cov(residuals(lm(M ~ g + cp1, d)), d$g*d$cp1) / var(d$g*d$cp1)
cov(d$M, d$g*d$cp1) / var(d$g*d$cp1)
summary(lm(M ~ g*cp1, d))
cov(d$M, d$g) / var(d$g)

summary(lm(M ~ g + cp1, d))
summary(lm(M ~ g + cp1 + g*cp1, d))
summary(lm(M ~ g + cp1 + g*cp1 + cp2 + g*cp2, d))


r <- residuals(lm(M ~ g + cp1, d))

summary(lm(r ~ g*cp1, d))
cov(d$cp1)

n <- 10000
a <- rnorm(n, m=0 sd=10)
b <- rnorm(n, sd=3)
y <- a + b - 4*a*b + rnorm(n)

cov(y, a*b) / var(a*b)
summary(lm(y ~ a*b))

sessionInfo()

R version 4.3.2 (2023-10-31)
Platform: aarch64-apple-darwin20 (64-bit)
Running under: macOS Ventura 13.6

Matrix products: default
BLAS:   /Library/Frameworks/R.framework/Versions/4.3-arm64/Resources/lib/libRblas.0.dylib 
LAPACK: /Library/Frameworks/R.framework/Versions/4.3-arm64/Resources/lib/libRlapack.dylib;  LAPACK version 3.11.0

locale:
[1] en_GB.UTF-8/en_GB.UTF-8/en_GB.UTF-8/C/en_GB.UTF-8/en_GB.UTF-8

time zone: Europe/London
tzcode source: internal

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
[1] dplyr_1.1.4   ggplot2_3.4.2

loaded via a namespace (and not attached):
 [1] vctrs_0.6.4       cli_3.6.1         knitr_1.45        rlang_1.1.2      
 [5] xfun_0.41         generics_0.1.3    jsonlite_1.8.7    labeling_0.4.2   
 [9] glue_1.6.2        colorspace_2.1-0  htmltools_0.5.7   scales_1.2.1     
[13] fansi_1.0.5       rmarkdown_2.25    grid_4.3.2        evaluate_0.23    
[17] munsell_0.5.0     tibble_3.2.1      fastmap_1.1.1     yaml_2.3.7       
[21] lifecycle_1.0.4   compiler_4.3.2    htmlwidgets_1.6.3 pkgconfig_2.0.3  
[25] farver_2.1.1      digest_0.6.33     R6_2.5.1          tidyselect_1.2.0 
[29] utf8_1.2.4        pillar_1.9.0      magrittr_2.0.3    withr_2.5.2      
[33] tools_4.3.2       gtable_0.3.3