Clustering traits

Author

Gibran Hemani

Published

September 22, 2024

Background

library(ieugwasr)
library(dplyr)
library(data.table)
library(lsa)

get_clusts <- function(cs, th) {
    l <- list()
    m <- apply(cs, 1, \(x) sum(x > th))
    o <- order(m[m > 1])
    if(length(o) == 0) return(tibble(index = 1:nrow(cs), matches = 1:nrow(cs)))
    for(i in o) {
        x <- which(cs[i,] > th)
        if(length(x) > 0) {
            l[[i]] <- tibble(index = i, matches = x)
            cs[x, x] <- 0
        }
        if(all(cs == 0)) break
    }
    l <- bind_rows(l)
    stopifnot(length(l$matches) == length(unique(l$matches)))
    return(l)
}

special_characters <- function(x) {
    sc <- c("\\[", "\\]", "+", "-", ":", "[0-9]")
    sapply(sc, \(y) {
        grepl(y, x)
    }) %>% {sum(.) > 2}
}

run_with_retry <- function(x, ntries=3) {
    result <- NULL
    url <- paste0("http://vectology-api.mrcieu.ac.uk/encodeText?qtext=", URLencode(x))
    for (i in 1:ntries) {
        result <- try(httr::content(httr::POST(url, encode="json")) %>% unlist() %>% {.[-1]} %>% as.numeric())
        if (!inherits(result, "try-error")) {
            break
        } else {
            result <- NULL
        }
    }
    return(result)
}

cluster_traits <- function(d) {
    if(nrow(d) == 1) {
        d$index_id <- d$id
        return(d)
    }

    tr <- d$trait

    # Identify technical trait names e.g. metabolites, genes etc
    sc <- sapply(tr, special_characters)
    dsc <- d[sc,]
    dsc$index_id <- dsc$id
    d <- subset(d, !sc)
    if(nrow(d) == 0) {
        NULL
    } else if(nrow(d) == 1) {
        d$index_id <- d$id
    } else {
        tr <- d$trait

        # Use vectology to try to cluster traits
        cs <- sapply(tr, run_with_retry) %>% lsa::cosine()
        csc <- get_clusts(cs, 0.95)

        # Create a new vector which is the index_id for the cluster (this can be updated later)
        d$idn <- 1:nrow(d)
        csc <- left_join(csc, d %>% select(idn, index_id=id), by=c("index"="idn")) %>% left_join(d %>% select(idn, match_id=id), by=c("matches"="idn"))

        d <- left_join(d, csc %>% select(match_id, index_id), by=c("id"="match_id"))
    }
    d <- bind_rows(d, dsc)
    d <- d %>% 
        ungroup() %>%
        group_by(index_id) %>% 
        arrange(desc(ncase), desc(sample_size)) %>%
        mutate(priority = as.numeric(row_number()==1), index_id2=first(id)) %>% 
        ungroup() %>%
        mutate(index_id=index_id2) %>% select(-index_id2)

    return(d)
}

a <- gwasinfo()

b <- subset(a, grepl("ebi-a", id)) %>% filter(population=="European")
b

gcat <- fread("~/Downloads/gwas_catalog_v1.0.2.1-studies_r2024-03-11.tsv")

gcat$sa2 <- paste0("ebi-a-", gcat$`STUDY ACCESSION`)

table(b$id %in% gcat$sa2)

b <- left_join(b, select(gcat, sa2, MAPPED_TRAIT, MAPPED_TRAIT_URI, LINK), by=c("id"="sa2"))
dim(b)

table(duplicated(b$MAPPED_TRAIT_URI))

table(table(b$MAPPED_TRAIT_URI))

b %>% group_by(MAPPED_TRAIT, LINK) %>% summarise(n=dplyr::n()) %>% arrange(desc(n)) %>% filter(n != 1) %>% as.data.frame %>% write.csv("ebi-a-duplicates.csv", row.names=FALSE)

# Cluster traits by
# - EFO term
# - Trait name (fuzzy matching)
# - Ancestry
# - Sex
# - Study type (note)



cl <- b %>% group_by(MAPPED_TRAIT, sex, population, note) %>%
    arrange(desc(ncase), desc(sample_size)) %>%
    mutate(priority = as.numeric(row_number()==1)) %>%
    select(priority, MAPPED_TRAIT, sex, population, note, trait, sample_size, ncase, ncontrol, everything())

cl2 <- cl %>% group_split() %>% lapply(., \(x) {
    print(nrow(x))
    cluster_traits(x)
})

d <- cl %>% group_split() %>% {.[[21]]}

d %>% cluster_traits




Which of the following traits are synonymous with each other? Provide the result in json format, with each element being a list of synonymous traits.

- hypertension 
- systolic blood pressure
- high blood pressure
- cigarettes per day 
- smoking heaviness 
- smoking initiation
- Triglyceride levels 
- Triglycerides 
- Triglyceride levels (UKB data field 30870) 
- Triglycerides 
- Triglycerides 
- Triglyceride levels 
[
  ["Triglyceride levels", "Triglycerides", "Triglyceride levels (UKB data field 30870)"]
]




cl <- b %>% group_by(MAPPED_TRAIT, sex, population, note) %>%
    arrange(desc(ncase), desc(sample_size)) %>%
    mutate(priority = as.numeric(row_number()==1)) %>%
    select(priority, MAPPED_TRAIT, sex, population, note, trait, sample_size, ncase, ncontrol, everything())


cl %>% write.csv("ebi-a-clustered.csv", row.names=FALSE)

hclust(cl)

cl %>% glimpse

 %>%
    arrange










# The largest sample size
# Most recent

sessionInfo()