residual-height notebook 2 – Gibran Hemani’s lab book

dat <- sim(b_xy = -0.5, b_uy = -0.2, b_ux = 1, b_prs=1, h2=0.7, r2_env_height=0.15, r2_prs=0.4, n=100000)[[2]]
str(dat)

tibble [100,000 × 9] (S3: tbl_df/tbl/data.frame)
 $ prs_explained  : num [1:100000] -0.4 0.114 0.213 -0.449 0.138 ...
 $ prs_unexplained: num [1:100000] -0.21 -0.692 -0.491 0.825 0.526 ...
 $ prs            : num [1:100000] -0.61 -0.577 -0.277 0.377 0.665 ...
 $ u              : num [1:100000] -0.106 0.406 0.151 -0.251 0.682 ...
 $ ex             : num [1:100000] 0.518 -0.422 -0.401 -0.494 -0.104 ...
 $ x              : num [1:100000] -0.199 -0.593 -0.527 -0.369 1.242 ...
 $ residual_x     : num [1:100000] 0.2013 -0.7078 -0.74 0.0798 1.104 ...
 $ ey             : num [1:100000] -0.955 -0.132 0.382 -1.381 -0.833 ...
 $ y              : num [1:100000] -0.8345 0.0837 0.6148 -1.1465 -1.5905 ...

gx_to_gp <- function(liability, varexp, prevalence) {
    x_prime <- qnorm(prevalence, 0, 1, lower.tail=FALSE)
    p <- pnorm(x_prime, mean=liability, sd = sqrt(1 - varexp), lower.tail=FALSE)
    return(p)
}

generate_binary_outcome <- function(liability, varexp, prevalence) {
    cc <- rbinom(length(liability), 1, gx_to_gp(liability, varexp, prevalence))
    return(cc)
}

binary_2sls <- function(x, y, prs) {
    mod <- summary(glm(y ~ prs, family="binomial"))
    bgy <- mod$coefficients[2, 1]
    se_bgy <- mod$coefficients[2, 2]

    mod <- summary(lm(x ~ prs))
    bgx <- mod$coefficients[2, 1]
    se_bgx <- mod$coefficients[2, 2]
    beta_xy <- bgy / bgx
    se_beta_xy <- sqrt((se_bgy / bgx)^2 + (bgy * se_bgx / bgx^2)^2)
    return(list(beta_xy = beta_xy, se_beta_xy = se_beta_xy))
}

continuous_2sls <- function(x, y, prs) {
    mod <- summary(lm(y ~ prs))
    bgy <- mod$coefficients[2, 1]
    se_bgy <- mod$coefficients[2, 2]

    mod <- summary(lm(x ~ prs))
    bgx <- mod$coefficients[2, 1]
    se_bgx <- mod$coefficients[2, 2]
    beta_xy <- bgy / bgx
    se_beta_xy <- sqrt((se_bgy / bgx)^2 + (bgy * se_bgx / bgx^2)^2)
    return(list(beta_xy = beta_xy, se_beta_xy = se_beta_xy))
}

dat$cc <- generate_binary_outcome(dat$y, varexp = 0.2, prevalence = 0.1)

continuous_2sls(dat$x, dat$y, dat$prs_explained)
binary_2sls(dat$x, dat$cc, dat$prs_explained)

$beta_xy: -0.497850997027835
$se_beta_xy: 0.00707376641206964

$beta_xy: -0.626409705538854
$se_beta_xy: 0.0160154339944029

param <- expand.grid(
    varexp = c(0.2, 0.5, 0.8),
    prevalence = c(0.01, 0.1, 0.5)
)

binary_res <- lapply(1:nrow(param), \(i) {
    varexp <- param[i,]$varexp
    prevalence <- param[i,]$prevalence
    dat$cc <- generate_binary_outcome(dat$y, varexp = varexp, prevalence = prevalence)
    res_continuous <- continuous_2sls(dat$x, dat$y, dat$prs_explained)
    res_binary <- binary_2sls(dat$x, dat$cc, dat$prs_explained)
    tibble(
        varexp = varexp,
        prevalence = prevalence,
        method = c("binary", "continuous"),
        beta_xy = c(res_binary$beta_xy, res_continuous$beta_xy),
        se_beta_xy = c(res_binary$se_beta_xy, res_continuous$se_beta_xy)
    )
}) %>% bind_rows()

ggplot(binary_res, aes(x = beta_xy, y = se_beta_xy, color = method)) +
    geom_point() +
    facet_grid(varexp ~ prevalence) +
    labs(x = "Estimated beta_xy", y = "Standard error of beta_xy", color = "Method")

estimate_beta_cy(
    height = dat$x,
    prs = dat$prs_explained,
    outcome = dat$cc,
    var_c = 0.15,
    h2 = 0.7,
    n_boot = 100
)

estimate_beta_cy(
    height = dat$x,
    prs = dat$prs_explained,
    outcome = dat$y,
    var_c = 0.15,
    h2 = 0.7,
    n_boot = 100
)

Binary outcome detected (prevalence = 0.189)
Liability scale transformation factor = 0.2705

$beta_xy

height: -0.353678285546259

$se_beta_xy

0.00836783982859172

$beta_cy_method1

height: -0.0461408201620962

$beta_cy_method2

height: -0.0461408201621222

$beta_cy_combined

height: -0.0461408201621133

$se_method1

0.0642545301989432

$se_method2

0.0466293324546653

$se_combined

0.0380479336137564

$cov_estimators

5.18850345151184e-05

$correlation_estimators

0.0173172592745444

$weight_method1

0.342370006827975

$weight_method2

0.657629993172025

$r2_prs

0.281387979315542

$is_binary

TRUE

$prevalence

0.18905

$outcome_scale

'liability'

$beta_cy_method1_observed

height: -0.012482458056538

$beta_cy_method2_observed

height: -0.0124824580565451

$boot_distribution_method1

-0.0611031904933605
-0.135169080428818
-0.139630417357704
-0.156230633575275
-0.119066010525505
0.0205816882683652
0.080569271731431
-0.0527727037713818
-0.0498117314459362
-0.0787439314490554
0.041691414202236
-0.0609742220878743
0.0397545020862654
-0.0954466012620649
-0.0591472885341207
-0.0585753434347414
-0.0162898277995962
-0.105405567851969
0.0198725815306079
-0.122971528519119
-0.0434949530082326
-0.0622390114312051
0.0757112913282528
-0.0151974663859412
-0.116032428498501
0.0806090973291542
-0.129793099597847
0.0371662748236907
-0.0484091990962174
-0.053298371618411
-0.00561247893316303
-0.0686899047729173
0.00749802630228404
-0.0598614575650598
-0.121132656930218
0.000982959868562029
-0.0806600994747147
-0.043797327737302
0.037344789315098
-0.101471647641016
-0.06088227629016
-0.0420804320225053
-0.0225762662441897
-0.073730119682587
0.0408548152575747
-0.0344591629950885
0.0382961762728246
-0.0211696677122442
-0.118105800628724
-0.0360727545944138
-0.0207528663455216
0.0259590111030823
-0.108834868225128
0.0209172890814239
-0.00841872670066864
-0.0886134145444476
-0.0121242066573223
-0.0120022724837197
-0.0863692445534315
-0.0105136817488118
-0.109834948624145
-0.0524489238679272
0.0421164689889682
-0.0692817925022548
-0.0750808129470261
-0.139994570485952
-0.10529482754501
-0.00906185948704938
-0.0424167002940804
-0.0278478007856253
-0.0682019069045269
-0.0427361827046255
-0.173127137533978
-0.0686733587952739
-0.056433295853779
-0.0770850539543246
-0.052261924502032
-0.0246860279526018
-0.0724053705073356
-0.0178154070971077
-0.00365709639212909
0.0439906158214736
9.3172723094532e-05
-0.0521812699715958
0.0383029205540669
0.102361982320747
-0.260909275675758
0.0672883814651478
-0.0941630129920866
0.102542943248134
-0.0455534770373631
0.00612279782910471
-0.0598015006251095
-0.148969568495035
-0.0245998959426125
-0.040854015979174
-0.146992108183965
-0.0655688569651259
0.0145760157573712
-0.0251416546763345

$boot_distribution_method2

-0.0652278245705572
-0.104650626868185
-0.113056574457559
-0.0945793262498817
-0.101569936702741
-0.0265829238892682
0.0292920175141744
-0.087682005918698
-0.0592853920776707
-0.0731726695504334
0.0257606459816746
-0.0685198125204218
0.015087702267375
-0.0722248795388675
-0.0396232631309289
-0.0771622496094649
-0.00965308231199817
-0.0884132588714599
-0.0241147025590098
-0.113912765098206
-0.0528612609615785
-0.0404603064127261
0.0382899789869132
-0.0167458503278753
-0.12514307401458
0.0405238362724186
-0.108546341675922
-0.00408163724004184
-0.0826122096248635
-0.0533845155778164
-0.0237342762112691
-0.064053665904024
-0.0208504732247801
-0.0711807293854162
-0.110322320963719
-0.0300852487189132
-0.0714529760448407
-0.0356023969553724
0.0133827003732791
-0.0784652912766696
-0.0532794933065255
-0.0625523855787123
-0.0216974410389273
-0.0619901822883103
0.0123211903912979
-0.0241138549295462
-0.00260650974968311
-0.0351973835872563
-0.0710054075846515
-0.0444341983998022
-0.0110788458400416
0.0181073797348081
-0.078523480124807
0.0220511992277951
-0.0285536538276524
-0.0762069611480715
-0.0265961304900171
-0.0178983001983176
-0.084151750407915
-0.016452289940594
-0.0983578674944817
-0.0447059399782941
0.0194687654654045
-0.0597526454723468
-0.0683541557797384
-0.125784181966446
-0.083293725878232
-0.034211427401645
-0.0412799394959916
0.000485459087808684
-0.0709076677296979
-0.0527319789048432
-0.140697611245831
-0.068678899573747
-0.0299791196313125
-0.0689337749031155
-0.063454270599156
-0.0567547738398126
-0.0653908346359646
-0.0470383796348957
-0.00823525867640807
0.0242222037493185
-0.0153889859976803
-0.0569068866363121
0.0241691026948184
0.0619171053039704
-0.191423449680033
0.0274568382979265
-0.0609507817970366
0.0736247469107616
-0.0668609025927943
-0.00893319266005886
-0.0513381878313581
-0.118940878537881
-0.0342167235115769
-0.0550974678645145
-0.119563117226794
-0.0471305447376192
-0.00327214786177437
-0.030755677306192

$boot_distribution_combined

-0.0638156735733845
-0.115099230022118
-0.122154661232727
-0.11568688475983
-0.107560067616903
-0.0104351753028205
0.0468478113906564
-0.0757301079041615
-0.0560418948224978
-0.075080102524705
0.0312148632061231
-0.0659364286725095
0.0235328746897928
-0.0801753005637959
-0.0463077038415292
-0.0707986504155139
-0.0119253049099026
-0.0942309158131398
-0.0090547758049033
-0.117014213992477
-0.0496545180436386
-0.0479166817986072
0.0511019139487295
-0.016215730107103
-0.122023862247033
0.0542478273741143
-0.115820594330724
0.010040410694858
-0.0709021246766336
-0.0533550224698466
-0.0175299163534287
-0.0656409750372231
-0.0111447972481363
-0.0673053502149931
-0.114023455762382
-0.0194484259326864
-0.0746052189563927
-0.0384080954631364
0.0215866009279022
-0.0863419776622178
-0.0558824581685441
-0.0555434026998917
-0.0219983244304536
-0.0660095847341488
0.022090247731592
-0.0276557781225834
0.0113973431431254
-0.0303947144073475
-0.0871311694727508
-0.0415714908270597
-0.0143909403065567
0.0207955428199749
-0.0889011902756792
0.0216629824032396
-0.0216600586897062
-0.0804545586820998
-0.0216413778286035
-0.0158796771494128
-0.0849109538936565
-0.0144190886134249
-0.10228727583921
-0.0473569054254567
0.027222659875405
-0.0630151396060412
-0.0706571614400322
-0.130649392780898
-0.0908262432061099
-0.0256009696630043
-0.041669132298195
-0.00921499928851772
-0.0699812963775092
-0.0493097180915236
-0.151800508382532
-0.0686770025773833
-0.0390362361252268
-0.0717245283675347
-0.0596223469894626
-0.0457753970914436
-0.067792401330141
-0.0370333103275994
-0.00666783322387989
0.0309903151254405
-0.0100883592107368
-0.0552889772265468
0.0290080980117947
0.0757642181243608
-0.215213312400637
0.0410939640040569
-0.0723216536180582
0.0835254699882406
-0.0595658791598948
-0.00377847309348199
-0.0542357722903421
-0.129221801323547
-0.0309242101911271
-0.0502209371452765
-0.128953981048085
-0.053443269820847
0.00283852803837933
-0.0288336043400753

Continuous outcome detected - standardized to mean=0, SD=1

$beta_xy

height: -0.441665535052701

$se_beta_xy

0.00528348139012448

$beta_cy_method1

height: -0.165954112166875

$beta_cy_method2

height: -0.165954112166932

$beta_cy_combined

height: -0.165954112166916

$se_method1

0.0401685981744717

$se_method2

0.0292640184437315

$se_combined

0.0269693133336915

$cov_estimators

0.000389186019560699

$correlation_estimators

0.331082780199051

$weight_method1

0.276198223019937

$weight_method2

0.723801776980063

$r2_prs

0.281387979315542

$is_binary

FALSE

$prevalence

<NA>

$outcome_scale

'standardized'

$beta_cy_method1_observed

height: -0.165954112166875

$beta_cy_method2_observed

height: -0.165954112166932

$boot_distribution_method1

-0.201080149534597
-0.141986241730799
-0.117992732362003
-0.234078495267115
-0.198139496172601
-0.169901487133364
-0.219311061001482
-0.202691888846666
-0.139387591249826
-0.143533783033519
-0.20139024933716
-0.131939542347391
-0.182179716321898
-0.246757857627421
-0.1007136685065
-0.208860558880945
-0.106211936450269
-0.143161972668909
-0.172665742473354
-0.14934277410912
-0.212169846032858
-0.1698829744339
-0.186058899620131
-0.15198439422287
-0.134086974194455
-0.142712698959012
-0.174861051623753
-0.231885334860841
-0.160361219837752
-0.158182310786196
-0.186017737163378
-0.10404919494268
-0.151216138236994
-0.137823530039221
-0.118465958980409
-0.246967996908424
-0.133031676581657
-0.157065628461535
-0.159713528843896
-0.1546551446749
-0.123983467660231
-0.151145393514394
-0.161621358629921
-0.245922446713711
-0.14578944366245
-0.184883500452221
-0.129485534936634
-0.174601407857576
-0.224856958729769
-0.159858524100665
-0.144174454129412
-0.154487056212054
-0.154419920955245
-0.19183105558998
-0.170427287908341
-0.130975792442076
-0.154918142569359
-0.166314263600151
-0.210058320097843
-0.254272844404001
-0.15922022015918
-0.220100981200499
-0.133058995852077
-0.145392329889137
-0.116319367183657
-0.129477440555102
-0.231506108093738
-0.128681873275744
-0.248938361317341
-0.177045115971261
-0.14912645048928
-0.135253323620739
-0.137059929184164
-0.204411929375568
-0.251610842808553
-0.173152409110393
-0.176604958309179
-0.167423909291602
-0.152941611521434
-0.170485976545322
-0.18239432664296
-0.0757607054596917
-0.104856076655347
-0.271247756902795
-0.170263412951742
-0.147889486564038
-0.201135270251341
-0.244233025931237
-0.189926407640508
-0.177175495219979
-0.164555823561874
-0.181345122288013
-0.186207406443152
-0.0828697596409811
-0.187953696453619
-0.189810480735222
-0.156560171137261
-0.216972190791485
-0.190278224842864
-0.176162472366338

$boot_distribution_method2

-0.189023247474652
-0.147645214443561
-0.123346812318564
-0.215228704147959
-0.188592171856754
-0.154094450621137
-0.198485458322053
-0.206278923158365
-0.135047129427976
-0.13669786837128
-0.205080464368643
-0.153665238017864
-0.16743611671424
-0.23333631572506
-0.122411146185772
-0.184052972893622
-0.137854615180077
-0.15296174714388
-0.178070993834158
-0.148522337295296
-0.200037332526188
-0.161872414269867
-0.167351033493232
-0.15775434771434
-0.146554375462847
-0.146537920558163
-0.165065807591714
-0.223106707810149
-0.158039951534927
-0.156631695223541
-0.165661724086163
-0.12413587920615
-0.166828375511943
-0.14495905001803
-0.1383541702391
-0.228396757415416
-0.155474542614563
-0.157372770924548
-0.162294393756779
-0.149731418027964
-0.136009617606024
-0.157876236996877
-0.132829212630696
-0.22164551174682
-0.16380754716289
-0.159011878387065
-0.140934085418359
-0.177168149266043
-0.218581250077076
-0.160850218139862
-0.164185721595368
-0.148508841603871
-0.142274482453375
-0.174475874867003
-0.164351211525489
-0.144585879019798
-0.168389965234683
-0.175599434430052
-0.199168812838767
-0.216916074136781
-0.162672343273242
-0.20692413808712
-0.142723545768491
-0.152482475337673
-0.127320708177366
-0.137756432611885
-0.205593481982909
-0.143316534658032
-0.227691623297184
-0.164525026605583
-0.165075349057447
-0.145049151486755
-0.127080926629979
-0.200930870781448
-0.222359238505474
-0.17406081960274
-0.173040342755499
-0.156542957655167
-0.157451822058959
-0.167301495854578
-0.175600392273362
-0.105617730486561
-0.132445986720648
-0.247316327825268
-0.169433738040939
-0.16038671529588
-0.180275153379003
-0.227313321123545
-0.179184563615351
-0.165641164533858
-0.157272321819679
-0.199273472152803
-0.199049366732507
-0.114367186839892
-0.179031093164698
-0.177781138115106
-0.158977300027737
-0.182812012803223
-0.172264826833272
-0.163921762531624

$boot_distribution_combined

-0.192353342398734
-0.146082216236178
-0.121868024948655
-0.220434982959367
-0.191229125867386
-0.158460326017025
-0.204237452775431
-0.205288190655562
-0.136245957270257
-0.138585935853706
-0.204061233534385
-0.147664639479807
-0.171508272726792
-0.23704332174868
-0.116418341406742
-0.190904784060735
-0.129114963543314
-0.150255066847897
-0.176578073013328
-0.148748940485374
-0.203388311197496
-0.164084916752567
-0.172518112873977
-0.156160696813088
-0.143110901386841
-0.14548140114982
-0.16777123658741
-0.225531349002105
-0.15868108171532
-0.157059972486533
-0.17128401872586
-0.118587972706217
-0.162516303319237
-0.14298823207956
-0.132861081630404
-0.233526100762663
-0.1492758628968
-0.15728793872205
-0.161581563453986
-0.151091342578484
-0.132688016361225
-0.15601718998759
-0.140781552192613
-0.228350758045046
-0.15883097899388
-0.166157574428104
-0.137772016119152
-0.176459219850073
-0.22031458965514
-0.160576314008456
-0.158658645080894
-0.150160013855483
-0.145629030985389
-0.179269344942879
-0.166029413025366
-0.140826797291883
-0.164669071753681
-0.173034886746397
-0.202176475393287
-0.227233947702351
-0.161718873003492
-0.210563558740048
-0.14005421425529
-0.150524189763834
-0.124282157344067
-0.135469789717406
-0.2127505032685
-0.139274467189746
-0.233559934583322
-0.167983053040434
-0.160670291613794
-0.142343561237152
-0.129837109402956
-0.201892332979372
-0.230438479634467
-0.173809918238981
-0.174024883237174
-0.159548257161917
-0.156206109923048
-0.168181043762603
-0.177476864873559
-0.0973712732294781
-0.124825702587332
-0.253926146010808
-0.169662892776987
-0.156935002927472
-0.186036680591131
-0.231986513525451
-0.182151441847057
-0.168826926173089
-0.159284012058236
-0.194321693778469
-0.195502440120494
-0.105667653417853
-0.18149550033781
-0.18110362117088
-0.158309693323377
-0.192246993261625
-0.177240095354072
-0.167302624836475

generate_binary_outcome2 <- function(liability, varexp, prevalence) {
    n <- length(liability)
    liability <- scale(liability) * sqrt(varexp)*2 + log(prevalence / (1 - prevalence))
    cc <- rbinom(length(liability), 1, plogis(liability))
    return(cc)
}
dat$cc2 <- generate_binary_outcome2(dat$y, varexp = 0.4, prevalence = 0.4)
dat$cc <- generate_binary_outcome(dat$y, varexp = 0.4, prevalence = 0.4)
print(mean(dat$cc))
cor(dat$y, dat$cc2)
cor(dat$y, dat$cc)

[1] 0.42636

0.480093929886205

0.655762667041965

param <- expand.grid(
    varexp = c(0.2, 0.5, 0.8),
    prevalence = c(0.01, 0.1, 0.5)
)

binary_res <- lapply(1:nrow(param), \(i) {
    varexp <- param[i,]$varexp
    prevalence <- param[i,]$prevalence
    dat$cc2 <- generate_binary_outcome2(dat$y, varexp = varexp, prevalence = prevalence)
    res_continuous <- continuous_2sls(dat$x, dat$y, dat$prs_explained)
    res_binary <- binary_2sls(dat$x, dat$cc2, dat$prs_explained)
    tibble(
        varexp = varexp,
        prevalence = prevalence,
        method = c("binary", "continuous"),
        beta_xy = c(res_binary$beta_xy, res_continuous$beta_xy),
        se_beta_xy = c(res_binary$se_beta_xy, res_continuous$se_beta_xy)
    )
}) %>% bind_rows()

tidyr::pivot_wider(binary_res, names_from = method, values_from = c(beta_xy, se_beta_xy)) %>%
ggplot(., aes(x=beta_xy_continuous, y = beta_xy_binary)) +
    geom_point(aes(colour=prevalence)) +
    facet_grid(. ~ varexp) +
    geom_abline(slope = 1, intercept = 0, linetype = "dashed")

dat <- sim(b_xy = -0.1, b_uy = -0.2, b_ux = 1, b_prs=1, h2=0.7, r2_env_height=0.15, r2_prs=0.4, n=100000)[[2]]

param <- expand.grid(
    varexp = c(0.2, 0.5, 0.8),
    prevalence = c(0.01, 0.1, 0.5),
    rep = 1:100
)
binary_res <- lapply(1:nrow(param), \(i) {
    varexp <- param[i,]$varexp
    prevalence <- param[i,]$prevalence
    dat$cc2 <- generate_binary_outcome2(dat$y, varexp = varexp, prevalence = prevalence)
    res_continuous <- continuous_2sls(dat$x, dat$y, dat$prs_explained)
    res_binary <- binary_2sls(dat$x, dat$cc2, dat$prs_explained)
    dat$cc <- generate_binary_outcome(dat$y, varexp = varexp, prevalence = prevalence)
    res_binary1 <- binary_2sls(dat$x, dat$cc, dat$prs_explained)
    tibble(
        
        varexp = varexp,
        prevalence = prevalence,
        method = c("binary2", "continuous", "binary1"),
        beta_xy = c(res_binary$beta_xy, res_continuous$beta_xy, res_binary1$beta_xy),
        se_beta_xy = c(res_binary$se_beta_xy, res_continuous$se_beta_xy, res_binary1$se_beta_xy)
    )
}) %>% bind_rows()

ggplot(binary_res, aes(x=method, y = beta_xy)) +
    geom_boxplot(aes(fill=as.factor(prevalence))) +
    facet_grid(. ~ varexp) +
    geom_abline(slope = 1, intercept = 0, linetype = "dashed")

simdat <- function(n, prevalence, beta_xl) {
    # 1. Simulate Exposure X (Standard Normal)
    x <- rnorm(n)

    # 2. Simulate Liability L
    # L = beta*X + epsilon, where epsilon is standard normal noise
    epsilon <- rnorm(n)
    l <- beta_xl * x + epsilon

    # 3. Define Threshold T based on prevalence
    threshold <- qnorm(1 - prevalence, mean = 0, sd = sqrt(var(l)))
    y <- as.numeric(l > threshold)

    return(tibble(x = x, l = l, y = y))
}

estimation <- function(dat) {
    # A. Estimate X -> L (Linear Regression - the "Hidden" Truth)
    model_linear <- lm(l ~ x, dat)
    hat_beta_xl <- coef(model_linear)["x"]

    # B. Estimate X -> Y (Logistic GLM - what we usually observe)
    model_logit <- glm(y ~ x, data = dat, family = binomial(link = "logit"))
    hat_beta_logit <- coef(model_logit)["x"]

    # C. Estimate X -> Y (Probit GLM - the LTM-appropriate GLM)
    model_probit <- glm(y ~ x, data = dat, family = binomial(link = "probit"))
    hat_beta_probit <- coef(model_probit)["x"]

    return(tibble(
        method = c("Linear", "Logistic", "Probit", "Logistic (Transformed)"),
        beta_hat = c(hat_beta_xl, hat_beta_logit, hat_beta_probit, hat_beta_logit * (sqrt(3) / pi))
    ))
}

set.seed(42)
simdat(100000, prevalence = 0.1, beta_xl = -0.5) %>% estimation()

A tibble: 4 × 2
method	beta_hat
<chr>	<dbl>
Linear	-0.5028558
Logistic	-0.9754070
Probit	-0.5108572
Logistic (Transformed)	-0.5377701

library(furrr)
plan("multicore", workers=4)
param <- expand.grid(
    n = 100000,
    prevalence = c(0.01, 0.1, 0.5),
    beta_xl = c(-0.5, -0.3, 0)
) %>% furrr::future_pmap(., \(n, prevalence, beta_xl) {
    dat <- simdat(n = n, prevalence = prevalence, beta_xl = beta_xl)
    est <- estimation(dat)
    bind_cols(tibble(n = n, prevalence = prevalence, beta_xl = beta_xl), est)
}) %>% bind_rows()

Warning message:
“UNRELIABLE VALUE: Future (<unnamed-1>) unexpectedly generated random numbers without specifying argument 'seed'. There is a risk that those random numbers are not statistically sound and the overall results might be invalid. To fix this, specify 'seed=TRUE'. This ensures that proper, parallel-safe random numbers are produced. To disable this check, use 'seed=NULL', or set option 'future.rng.onMisuse' to "ignore". [future <unnamed-1> (ed502c4b95166bb22746467ddf260d51-1); on ed502c4b95166bb22746467ddf260d51@DT7MTP4F2G<63874>]”
Warning message:
“UNRELIABLE VALUE: Future (<unnamed-2>) unexpectedly generated random numbers without specifying argument 'seed'. There is a risk that those random numbers are not statistically sound and the overall results might be invalid. To fix this, specify 'seed=TRUE'. This ensures that proper, parallel-safe random numbers are produced. To disable this check, use 'seed=NULL', or set option 'future.rng.onMisuse' to "ignore". [future <unnamed-2> (ed502c4b95166bb22746467ddf260d51-2); on ed502c4b95166bb22746467ddf260d51@DT7MTP4F2G<63874>]”
Warning message:
“UNRELIABLE VALUE: Future (<unnamed-3>) unexpectedly generated random numbers without specifying argument 'seed'. There is a risk that those random numbers are not statistically sound and the overall results might be invalid. To fix this, specify 'seed=TRUE'. This ensures that proper, parallel-safe random numbers are produced. To disable this check, use 'seed=NULL', or set option 'future.rng.onMisuse' to "ignore". [future <unnamed-3> (ed502c4b95166bb22746467ddf260d51-3); on ed502c4b95166bb22746467ddf260d51@DT7MTP4F2G<63874>]”
Warning message:
“UNRELIABLE VALUE: Future (<unnamed-4>) unexpectedly generated random numbers without specifying argument 'seed'. There is a risk that those random numbers are not statistically sound and the overall results might be invalid. To fix this, specify 'seed=TRUE'. This ensures that proper, parallel-safe random numbers are produced. To disable this check, use 'seed=NULL', or set option 'future.rng.onMisuse' to "ignore". [future <unnamed-4> (ed502c4b95166bb22746467ddf260d51-4); on ed502c4b95166bb22746467ddf260d51@DT7MTP4F2G<63874>]”

ggplot(param, aes(x=beta_xl, y=beta_hat, color=method)) +
geom_point() +
geom_abline(slope = 1, intercept = 0, linetype = "dashed")

generate_binary_outcome3 <- function(liability, varexp=1, prevalence) {
    n <- length(liability)
    threshold <- qnorm(1 - prevalence, mean = 0, sd = sqrt(var(liability)))
    cc <- as.numeric(liability > threshold)
    return(cc)
}

binary_2sls_probit <- function(x, y, prs) {
    mod <- summary(glm(y ~ prs, family=binomial(link = "probit")))
    bgy <- mod$coefficients[2, 1]
    se_bgy <- mod$coefficients[2, 2]

    mod <- summary(lm(x ~ prs))
    bgx <- mod$coefficients[2, 1]
    se_bgx <- mod$coefficients[2, 2]
    beta_xy <- bgy / bgx
    se_beta_xy <- sqrt((se_bgy / bgx)^2 + (bgy * se_bgx / bgx^2)^2)
    return(list(beta_xy = beta_xy, se_beta_xy = se_beta_xy))
}

param <- expand.grid(
    prevalence = c(0.01, 0.1, 0.5),
    b_xy = c(-0.5, -0.3, 0),
    rep = 1:50
)
plan("multicore", workers=8)
binary_res <- future_pmap(param, \(prevalence, rep, b_xy) {
    dat <- sim(b_xy = b_xy, b_uy = -0.2, b_ux = 1, b_prs=1, h2=0.7, r2_env_height=0.15, r2_prs=0.4, n=100000)[[2]]

    dat$cc <- generate_binary_outcome3(dat$y, prevalence = prevalence)
    res_continuous <- continuous_2sls(dat$x, dat$y, dat$prs_explained)
    res_binary <- binary_2sls_probit(dat$x, dat$cc, dat$prs_explained)
    tibble(
        prevalence = prevalence,
        method = c("binary", "continuous"),
        beta_xy = c(res_binary$beta_xy, res_continuous$beta_xy),
        se_beta_xy = c(res_binary$se_beta_xy, res_continuous$se_beta_xy),
        rep = rep,
        b_xy = b_xy
    )
}) %>% bind_rows()
str(binary_res)

Warning message:
“UNRELIABLE VALUE: Future (<unnamed-25>) unexpectedly generated random numbers without specifying argument 'seed'. There is a risk that those random numbers are not statistically sound and the overall results might be invalid. To fix this, specify 'seed=TRUE'. This ensures that proper, parallel-safe random numbers are produced. To disable this check, use 'seed=NULL', or set option 'future.rng.onMisuse' to "ignore". [future <unnamed-25> (ed502c4b95166bb22746467ddf260d51-25); on ed502c4b95166bb22746467ddf260d51@DT7MTP4F2G<63874>]”
Warning message:
“UNRELIABLE VALUE: Future (<unnamed-26>) unexpectedly generated random numbers without specifying argument 'seed'. There is a risk that those random numbers are not statistically sound and the overall results might be invalid. To fix this, specify 'seed=TRUE'. This ensures that proper, parallel-safe random numbers are produced. To disable this check, use 'seed=NULL', or set option 'future.rng.onMisuse' to "ignore". [future <unnamed-26> (ed502c4b95166bb22746467ddf260d51-26); on ed502c4b95166bb22746467ddf260d51@DT7MTP4F2G<63874>]”
Warning message:
“UNRELIABLE VALUE: Future (<unnamed-27>) unexpectedly generated random numbers without specifying argument 'seed'. There is a risk that those random numbers are not statistically sound and the overall results might be invalid. To fix this, specify 'seed=TRUE'. This ensures that proper, parallel-safe random numbers are produced. To disable this check, use 'seed=NULL', or set option 'future.rng.onMisuse' to "ignore". [future <unnamed-27> (ed502c4b95166bb22746467ddf260d51-27); on ed502c4b95166bb22746467ddf260d51@DT7MTP4F2G<63874>]”
Warning message:
“UNRELIABLE VALUE: Future (<unnamed-28>) unexpectedly generated random numbers without specifying argument 'seed'. There is a risk that those random numbers are not statistically sound and the overall results might be invalid. To fix this, specify 'seed=TRUE'. This ensures that proper, parallel-safe random numbers are produced. To disable this check, use 'seed=NULL', or set option 'future.rng.onMisuse' to "ignore". [future <unnamed-28> (ed502c4b95166bb22746467ddf260d51-28); on ed502c4b95166bb22746467ddf260d51@DT7MTP4F2G<63874>]”
Warning message:
“UNRELIABLE VALUE: Future (<unnamed-29>) unexpectedly generated random numbers without specifying argument 'seed'. There is a risk that those random numbers are not statistically sound and the overall results might be invalid. To fix this, specify 'seed=TRUE'. This ensures that proper, parallel-safe random numbers are produced. To disable this check, use 'seed=NULL', or set option 'future.rng.onMisuse' to "ignore". [future <unnamed-29> (ed502c4b95166bb22746467ddf260d51-29); on ed502c4b95166bb22746467ddf260d51@DT7MTP4F2G<63874>]”
Warning message:
“UNRELIABLE VALUE: Future (<unnamed-30>) unexpectedly generated random numbers without specifying argument 'seed'. There is a risk that those random numbers are not statistically sound and the overall results might be invalid. To fix this, specify 'seed=TRUE'. This ensures that proper, parallel-safe random numbers are produced. To disable this check, use 'seed=NULL', or set option 'future.rng.onMisuse' to "ignore". [future <unnamed-30> (ed502c4b95166bb22746467ddf260d51-30); on ed502c4b95166bb22746467ddf260d51@DT7MTP4F2G<63874>]”
Warning message:
“UNRELIABLE VALUE: Future (<unnamed-31>) unexpectedly generated random numbers without specifying argument 'seed'. There is a risk that those random numbers are not statistically sound and the overall results might be invalid. To fix this, specify 'seed=TRUE'. This ensures that proper, parallel-safe random numbers are produced. To disable this check, use 'seed=NULL', or set option 'future.rng.onMisuse' to "ignore". [future <unnamed-31> (ed502c4b95166bb22746467ddf260d51-31); on ed502c4b95166bb22746467ddf260d51@DT7MTP4F2G<63874>]”
Warning message:
“UNRELIABLE VALUE: Future (<unnamed-32>) unexpectedly generated random numbers without specifying argument 'seed'. There is a risk that those random numbers are not statistically sound and the overall results might be invalid. To fix this, specify 'seed=TRUE'. This ensures that proper, parallel-safe random numbers are produced. To disable this check, use 'seed=NULL', or set option 'future.rng.onMisuse' to "ignore". [future <unnamed-32> (ed502c4b95166bb22746467ddf260d51-32); on ed502c4b95166bb22746467ddf260d51@DT7MTP4F2G<63874>]”

tibble [900 × 6] (S3: tbl_df/tbl/data.frame)
 $ prevalence: num [1:900] 0.01 0.01 0.1 0.1 0.5 0.5 0.01 0.01 0.1 0.1 ...
 $ method    : chr [1:900] "binary" "continuous" "binary" "continuous" ...
 $ beta_xy   : num [1:900] -0.444 -0.497 -0.465 -0.51 -0.442 ...
 $ se_beta_xy: num [1:900] 0.0238 0.00702 0.01087 0.007 0.00797 ...
 $ rep       : int [1:900] 1 1 1 1 1 1 1 1 1 1 ...
 $ b_xy      : num [1:900] -0.5 -0.5 -0.5 -0.5 -0.5 -0.5 -0.3 -0.3 -0.3 -0.3 ...

ggplot(binary_res, aes(x=method, y = beta_xy-b_xy)) +
    geom_boxplot(aes(colour=as.factor(prevalence))) +
    facet_grid(method ~ b_xy)

# Function to estimate beta_cy from height, PRS, and outcome

binary_2sls <- function(x, y, prs, method=c("probit", "logistic", "absolute")[1]) {
    if(method == "probit") {
        mod <- summary(glm(y ~ prs, family=binomial(link = "probit")))
    } else if(method == "logistic") {
        mod <- summary(glm(y ~ prs, family=binomial(link = "logit")))
    } else if(method == "absolute") {
        mod <- summary(lm(y ~ prs))
    }
    bgy <- mod$coefficients[2, 1]
    se_bgy <- mod$coefficients[2, 2]

    mod <- summary(lm(x ~ prs))
    bgx <- mod$coefficients[2, 1]
    se_bgx <- mod$coefficients[2, 2]
    beta_xy <- bgy / bgx
    se_beta_xy <- sqrt((se_bgy / bgx)^2 + (bgy * se_bgx / bgx^2)^2)
    return(list(beta_xy = beta_xy, se_beta_xy = se_beta_xy))
}

estimate_beta_cy2 <- function(height, prs, outcome, var_c = 0.15, h2 = 0.7, n_boot = 1000, binary_method="probit") {
    stopifnot(binary_method %in% c("probit", "logistic", "absolute"))
    
    # Detect if outcome is binary or continuous
    unique_vals <- unique(outcome)
    is_binary <- length(unique_vals) == 2 && all(sort(unique_vals) == c(0, 1))
    
    # Store original outcome for prevalence calculation
    outcome_original <- outcome
    
    # Calculate prevalence if binary
    if (is_binary) {
        prevalence <- mean(outcome)
        # Calculate liability scale threshold transformation factor
        threshold <- qnorm(prevalence)
        liability_scale_factor <- dnorm(threshold)
        cat(sprintf("Binary outcome detected (prevalence = %.3f)\n", prevalence))
        cat(sprintf("Liability scale transformation factor = %.4f\n", liability_scale_factor))
    } else {
        # Standardize continuous outcome
        outcome <- (outcome - mean(outcome)) / sd(outcome)
        cat("Continuous outcome detected - standardized to mean=0, SD=1\n")
        prevalence <- NA
        liability_scale_factor <- 1
    }

    

    # Use instrumental variable regression to estimate causal effect
    if(is_binary) {
        iv_res <- binary_2sls(height, outcome, prs, method = binary_method)
    } else {
        iv_res <- binary_2sls(height, outcome, prs, method = "absolute")
    }
    beta_xy_prs <- iv_res$beta_xy
    se_beta_xy <- iv_res$se_beta_xy
    
    # Calculate R^2 for PRS
    model_prs <- lm(height ~ prs)
    prs_explained <- fitted.values(model_prs)
    r2_prs <- var(prs_explained) / var(height)
    
    # Estimate beta_cy using method 1 (from covariance of height and outcome)
    beta_cy_method1_observed <- (cov(height, outcome) - beta_xy_prs * var(height)) / var_c
    
    # Method 2: Using height residual
    # Get residual height (height not explained by PRS)
    residual_height <- residuals(model_prs)
    
    # Estimate beta_cy using method 2 (from residual height)
    beta_cy_method2_observed <- (cov(residual_height, outcome) - beta_xy_prs_observed * var(residual_height)) / var_c
    
    # Transform to liability scale if binary
    beta_cy_method1 <- beta_cy_method1_observed / liability_scale_factor
    beta_cy_method2 <- beta_cy_method2_observed / liability_scale_factor
    
    # Parametric bootstrap using 2SLS SE for beta_xy
    n <- length(height)
    boot_results <- matrix(NA, n_boot, 2)
    
    for(i in 1:n_boot) {
        # Sample with replacement
        idx <- sample(1:n, n, replace = TRUE)
        h_boot <- height[idx]
        p_boot <- prs[idx]
        y_boot <- outcome[idx]
        
        # Parametrically resample beta_xy from its sampling distribution
        beta_xy_boot <- rnorm(1, mean = beta_xy_prs, sd = se_beta_xy)
        beta_xy_boot <- beta_xy_boot / liability_scale_factor
        
        # Method 1 bootstrap
        boot_observed_1 <- (cov(h_boot, y_boot) - beta_xy_boot_observed * var(h_boot)) / var_c
        boot_results[i, 1] <- boot_observed_1 / liability_scale_factor
        
        # Method 2 bootstrap
        model_prs_boot <- lm(h_boot ~ p_boot)
        res_boot <- residuals(model_prs_boot)
        boot_observed_2 <- (cov(res_boot, y_boot) - beta_xy_boot_observed * var(res_boot)) / var_c
        boot_results[i, 2] <- boot_observed_2 / liability_scale_factor
    }
    
    # Calculate standard errors
    se_method1 <- sd(boot_results[, 1])
    se_method2 <- sd(boot_results[, 2])
    
    # Calculate covariance between the two estimators using GLS
    # cov(β̂_cy,1, β̂_cy,2) ≈ 1/(N·(σ_C²)²) * [var(Y)var(X,X_res) + var(X,Y)var(X_res,Y)]
    cov_x_xres <- cov(height, residual_height)
    cov_x_y <- cov(height, outcome)
    cov_xres_y <- cov(residual_height, outcome)
    var_y <- var(outcome)
    
    # Covariance of the two beta_cy estimates
    cov_estimators <- (1 / (n * var_c^2)) * (var_y * cov_x_xres + cov_x_y * cov_xres_y)
    
    # Build variance-covariance matrix
    var_method1 <- se_method1^2
    var_method2 <- se_method2^2
    Sigma <- matrix(c(var_method1, cov_estimators,
                      cov_estimators, var_method2), nrow = 2)
    
    # GLS weights (inverse of variance-covariance matrix)
    Sigma_inv <- solve(Sigma)
    weights <- Sigma_inv %*% c(1, 1)
    weight_sum <- sum(weights)
    
    # GLS combined estimate
    beta_cy_combined <- (weights[1] * beta_cy_method1 + weights[2] * beta_cy_method2) / weight_sum
    
    # Variance of combined estimate: var(β̂_combined) = 1 / (1' Σ^(-1) 1)
    var_combined <- as.numeric(1 / (t(c(1, 1)) %*% Sigma_inv %*% c(1, 1)))
    se_combined <- sqrt(var_combined)
    
    # Apply GLS weights to bootstrap distributions
    boot_combined <- (weights[1] * boot_results[, 1] + weights[2] * boot_results[, 2]) / weight_sum
    
    # Return results
    return(list(
        beta_xy = beta_xy_prs,
        se_beta_xy = se_beta_xy,
        beta_cy_method1 = beta_cy_method1,
        beta_cy_method2 = beta_cy_method2,
        beta_cy_combined = beta_cy_combined,
        se_method1 = se_method1,
        se_method2 = se_method2,
        se_combined = se_combined,
        cov_estimators = cov_estimators,
        correlation_estimators = cov_estimators / (se_method1 * se_method2),
        weight_method1 = weights[1] / weight_sum,
        weight_method2 = weights[2] / weight_sum,
        r2_prs = r2_prs,
        is_binary = is_binary,
        prevalence = if(is_binary) prevalence else NA,
        outcome_scale = if(is_binary) "liability" else "standardized",
        beta_cy_method1_observed = beta_cy_method1_observed,
        beta_cy_method2_observed = beta_cy_method2_observed,
        boot_distribution_method1 = boot_results[, 1],
        boot_distribution_method2 = boot_results[, 2],
        boot_distribution_combined = boot_combined
    ))
}

Expected betas for

library(dplyr)
sim_dan <- function(n=100000, bgx1=0.1, bgx2=0.2, bx1y=0.3, bex2=0.4, bey=0.5) {
    g <- rbinom(n, 2, 0.4)
    e <- rnorm(n)
    x1 <- bgx1 * g + rnorm(n)
    x2 <- bgx2 * g + bex2 * e + rnorm(n)
    y <- bx1y * x1 + bey * e + rnorm(n)
    return(list(
        dat = tibble(g = g, x1 = x1, x2 = x2, y = y, e = e),
        param = tibble(bgx1 = bgx1, bgx2 = bgx2, bx1y = bx1y, bex2 = bex2, bey = bey)
    ))
}

dat <- sim_dan()

cov(dat$dat$x2, dat$dat$y)

0.203145466403477

dat$param$bgx2 * dat$param$bgx1 * dat$param$bx1y * var(dat$dat$g) + dat$param$bex2 * dat$param$bey * var(dat$dat$e)

0.201365693134032