Bilateral cancer incidence simulation

Author

Gibran Hemani

Published

March 18, 2026

library(tidyverse)

simulate_time_to_event <- function(liability, baseline_rate = 0.1, shape = 1) {
  #' Simulate time-to-event data based on individual liability
  #' 
  #' @param liability Vector of individual liability values
  #' @param baseline_rate Baseline hazard rate (lambda parameter)
  #' @param shape Shape parameter for Weibull distribution (shape=1 gives exponential)
  #' @return Vector of simulated time to event for each individual
  
  n <- length(liability)
  
  # Convert liability to hazard rate (exponential relationship)
  # Higher liability = higher hazard = shorter time to event
  hazard_rate <- baseline_rate * exp(liability)
  
  # Scale parameter for Weibull (lambda = 1/scale)
  scale <- 1 / hazard_rate
  
  # Simulate time to event using Weibull distribution
  # When shape=1, this is exponential distribution
  time_to_event <- rweibull(n, shape = shape, scale = scale)
  
  return(time_to_event)
}

# Example simulation
set.seed(42)
n_individuals <- 1000

# Simulate liability (e.g., genetic risk, standardized)
liability <- rnorm(n_individuals, mean = 0, sd = 1)

# Simulate time to event
time_to_event <- simulate_time_to_event(liability, baseline_rate = 0.1, shape = 10)

# Create dataframe
df <- tibble(
  individual_id = 1:n_individuals,
  liability = liability,
  time_to_event = time_to_event
)
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──

 dplyr     1.1.4      readr     2.2.0

 forcats   1.0.1      stringr   1.6.0

 ggplot2   4.0.1      tibble    3.3.0

 lubridate 1.9.4      tidyr     1.3.1

 purrr     1.2.0     

── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──

 dplyr::filter() masks stats::filter()

 dplyr::lag()    masks stats::lag()

 Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
set.seed(123)
n <- 1000

# Simulate genetic/environmental liability (standardized)
liability <- rnorm(n, mean = 0, sd = 1)

# For cancer with median onset around age 65-70:
# baseline_rate = 0.015 gives expected time ~ 67 years for liability=0
cancer_data <- tibble(
  person_id = 1:n,
  liability = liability,
  age_at_cancer1 = simulate_time_to_event(liability, baseline_rate = 0.003, shape = 1),
  age_at_cancer2 = simulate_time_to_event(liability, baseline_rate = 0.003, shape = 1),
  diff = abs(age_at_cancer1 - age_at_cancer2),
  below_60 = age_at_cancer1 < 60 | age_at_cancer2 < 60
)
str(cancer_data)
tibble [1,000 × 6] (S3: tbl_df/tbl/data.frame)
 $ person_id     : int [1:1000] 1 2 3 4 5 6 7 8 9 10 ...
 $ liability     : num [1:1000] -0.5605 -0.2302 1.5587 0.0705 0.1293 ...
 $ age_at_cancer1: num [1:1000] 1071 812 133 206 207 ...
 $ age_at_cancer2: num [1:1000] 922.9 24.8 68 145.4 496.6 ...
 $ diff          : num [1:1000] 148.2 786.8 65.5 61.1 289.4 ...
 $ below_60      : logi [1:1000] FALSE TRUE FALSE FALSE FALSE TRUE ...
cancer_data %>%
    ggplot(aes(x = liability, y = pmin(age_at_cancer1, age_at_cancer2))) +
    geom_point() +
    scale_y_log10() +
    geom_hline(yintercept = 60, linetype = "dashed", color = "red")

table(cancer_data$below_60)

FALSE  TRUE 
  624   376 
cancer_data %>% group_by(below_60, diff <= 5) %>% summarise(n = n(), prop=n()/nrow(cancer_data))
`summarise()` has grouped output by 'below_60'. You can override using the

`.groups` argument.
A grouped_df: 4 × 4
below_60 diff <= 5 n prop
<lgl> <lgl> <int> <dbl>
FALSE FALSE 613 0.613
FALSE TRUE 11 0.011
TRUE FALSE 365 0.365
TRUE TRUE 11 0.011
set.seed(123)
n <- 1000

# Simulate genetic/environmental liability (standardized)
liability <- rnorm(n, mean = 0, sd = 3)

# For cancer with median onset around age 65-70:
# baseline_rate = 0.015 gives expected time ~ 67 years for liability=0
cancer_data <- tibble(
  person_id = 1:n,
  liability = liability,
  age_at_cancer1 = simulate_time_to_event(liability, baseline_rate = 0.003, shape = 1),
  age_at_cancer2 = simulate_time_to_event(liability, baseline_rate = 0.003, shape = 1),
  diff = abs(age_at_cancer1 - age_at_cancer2),
  below_60 = age_at_cancer1 < 60 | age_at_cancer2 < 60
)
cancer_data %>% group_by(below_60, diff <= 5) %>% summarise(n = n(), prop=n()/nrow(cancer_data))
`summarise()` has grouped output by 'below_60'. You can override using the

`.groups` argument.
A grouped_df: 4 × 4
below_60 diff <= 5 n prop
<lgl> <lgl> <int> <dbl>
FALSE FALSE 536 0.536
FALSE TRUE 5 0.005
TRUE FALSE 328 0.328
TRUE TRUE 131 0.131

When the liability has a low variance (i.e. not contributing much to overall risk, most risk comes from chance), then bilateral cancer incidence within 5 years is at 1%.

When liability has a high variance (i.e. smaller chance component) then bilateral cancer incidence within 5 years is at 13%.

One way to determine if bilateral cancer is occurring more than expected is to look at familial rates. Suppose siblings share ~50% genetics and X% environmental liability, we can get expected and observed rates for bilateral cancer and sibling cancer.

n <- 1000000
cd <- lapply(1:4, function(i) {
    liability <- rnorm(n, mean = 0, sd = i)
    cancer_data1 <- tibble(
        person_id = 1:n,
        liability = liability,
        age_at_cancer1 = simulate_time_to_event(liability, baseline_rate = 0.003, shape = 1),
        age_at_cancer2 = simulate_time_to_event(liability, baseline_rate = 0.003, shape = 1),
        diff = abs(age_at_cancer1 - age_at_cancer2),
        below_60 = age_at_cancer1 < 60 | age_at_cancer2 < 60
    ) %>% mutate(sd = i)
}) %>% bind_rows()

str(cd)
tibble [4,000,000 × 7] (S3: tbl_df/tbl/data.frame)
 $ person_id     : int [1:4000000] 1 2 3 4 5 6 7 8 9 10 ...
 $ liability     : num [1:4000000] -0.387 -0.607 0.16 -1.021 0.761 ...
 $ age_at_cancer1: num [1:4000000] 397.8 339.4 42.6 321.7 149.5 ...
 $ age_at_cancer2: num [1:4000000] 1703.9 883.2 17.9 1008.8 24.5 ...
 $ diff          : num [1:4000000] 1306.1 543.8 24.7 687.2 125 ...
 $ below_60      : logi [1:4000000] FALSE FALSE TRUE FALSE TRUE TRUE ...
 $ sd            : int [1:4000000] 1 1 1 1 1 1 1 1 1 1 ...
table(cd$below_60, cd$sd)
       
             1      2      3      4
  FALSE 640796 584875 559269 544393
  TRUE  359204 415125 440731 455607
ggplot(cd %>% filter(below_60) %>% mutate(age_first = pmin(age_at_cancer1, age_at_cancer2), age_first_cut = cut(age_first, breaks=3)), aes(x = diff, fill = factor(sd))) +
    geom_density(alpha = 0.4) +
    labs(x = "Difference in time", fill = "Liability SD") +
    xlim(0, 20) +
    facet_grid(age_first_cut ~ .) +
    theme_minimal()
Warning message:

“Removed 969971 rows containing non-finite outside the scale range

(`stat_density()`).”