GRM lookups

Author

Gibran Hemani

Published

August 30, 2024

Background

library(data.table)
library(dplyr)


#' Get just one row of the triangle GRM matrix
#' 
#' @param i ID to extract (location ID rather than character ID e.g. row 100)
#' @param ki List of all IDs to extract (location IDs again)
#' @param b GRM file rootname
#' 
#' @return vector of length i's position in ki
findone <- function(i, ki, b) {
  f <- file(b, "rb")
  seek(f, where = (i * (i+1) / 2 - i) * 4)
  a <- readBin(f, "double", n=i, size=4)
  close(f)
  x <- ki[ki <= i]
  a[x]
}


#' Get all rows of the triangle GRM matrix for required individuals
#' 
#' @param ki List of all IDs to extract (location IDs again)
#' @param b GRM file rootname
findall <- function(ki, b) {
  n <- length(ki)
  mat <- matrix(0, n, n)
  for(i in 1:n) {
    message(i)
    mat[i,1:i] <- mat[1:i, i] <- findone(ki[i], ki, b)
  }
  return(mat)
}

#' Get only diagonal elements from GRM
#' 
#' @param n sample size of GRM
#' @param b GRM file rootname
#' 
#' @return vector of length n, should all be close to 1
diags <- function(n, b) {
  f <- file(b, "rb")
  x <- sapply(1:n, \(i) {
    seek(f, where = (i-1)*4, origin="current")
    readBin(f, "double", n=1, size=4)
  })
  close(f)
  return(x)
}

# takes a few minutes
d <- diags(ids, b)
length(d)

# The first few look good
head(d)

# But most of them are way off
summary(d)

# Once we get to 10% of the sample the matrix is no longer following the expected pattern
which(d < 0.75)[1]
nrow(ids)

# The matrix was generated by concatenating chunks from plink. Perhaps it's not possible to do this


# Expected size of the GRM
n <- nrow(ids)
n * (n+1) / 2 * 4

# This matches the exact size of the GRM






mat <- "/local-scratch/data/ukb/genetic/variants/arrays/imputed/draft/grm/full-ukb-indep"
ids <- fread(paste0(mat, ".grm.id")) %>% as_tibble()

ids

b <- paste0(mat, ".grm.bin")
file.exists(b)

f <- file(b, "rb")
readBin(f, "double", n=1, size=4)
seek(f, where=1)
readBin(f, "double", n=1, size=4)
readBin(f, "double", n=1, size=4)
readBin(f, "double", n=1, size=4)
readBin(f, "double", n=1, size=4)
readBin(f, "double", n=1, size=4)
close(f)

ki <- sort(sample(1:nrow(ids), 100, replace=FALSE))



findall(ki, b)



findone(ki[15], ki, b)
findone(9, 1:10, b)


diags <- function(ids, b) {
  f <- file(b, "rb")
  x <- sapply(1:nrow(ids), \(i) {
    seek(f, where = (i-1)*4, origin="current")
    readBin(f, "double", n=1, size=4)
  })
  close(f)
  return(x)
}

d <- diags(ids, b)
length(d)
summary(d)

sessionInfo()