library(data.table)
library(dplyr)
#' Get just one row of the triangle GRM matrix
#'
#' @param i ID to extract (location ID rather than character ID e.g. row 100)
#' @param ki List of all IDs to extract (location IDs again)
#' @param b GRM file rootname
#'
#' @return vector of length i's position in ki
findone <- function(i, ki, b) {
f <- file(b, "rb")
seek(f, where = (i * (i+1) / 2 - i) * 4)
a <- readBin(f, "double", n=i, size=4)
close(f)
x <- ki[ki <= i]
a[x]
}
#' Get all rows of the triangle GRM matrix for required individuals
#'
#' @param ki List of all IDs to extract (location IDs again)
#' @param b GRM file rootname
findall <- function(ki, b) {
n <- length(ki)
mat <- matrix(0, n, n)
for(i in 1:n) {
message(i)
mat[i,1:i] <- mat[1:i, i] <- findone(ki[i], ki, b)
}
return(mat)
}
#' Get only diagonal elements from GRM
#'
#' @param n sample size of GRM
#' @param b GRM file rootname
#'
#' @return vector of length n, should all be close to 1
diags <- function(n, b) {
f <- file(b, "rb")
x <- sapply(1:n, \(i) {
seek(f, where = (i-1)*4, origin="current")
readBin(f, "double", n=1, size=4)
})
close(f)
return(x)
}
# takes a few minutes
d <- diags(ids, b)
length(d)
# The first few look good
head(d)
# But most of them are way off
summary(d)
# Once we get to 10% of the sample the matrix is no longer following the expected pattern
which(d < 0.75)[1]
nrow(ids)
# The matrix was generated by concatenating chunks from plink. Perhaps it's not possible to do this
# Expected size of the GRM
n <- nrow(ids)
n * (n+1) / 2 * 4
# This matches the exact size of the GRM
mat <- "/local-scratch/data/ukb/genetic/variants/arrays/imputed/draft/grm/full-ukb-indep"
ids <- fread(paste0(mat, ".grm.id")) %>% as_tibble()
ids
b <- paste0(mat, ".grm.bin")
file.exists(b)
f <- file(b, "rb")
readBin(f, "double", n=1, size=4)
seek(f, where=1)
readBin(f, "double", n=1, size=4)
readBin(f, "double", n=1, size=4)
readBin(f, "double", n=1, size=4)
readBin(f, "double", n=1, size=4)
readBin(f, "double", n=1, size=4)
close(f)
ki <- sort(sample(1:nrow(ids), 100, replace=FALSE))
findall(ki, b)
findone(ki[15], ki, b)
findone(9, 1:10, b)
diags <- function(ids, b) {
f <- file(b, "rb")
x <- sapply(1:nrow(ids), \(i) {
seek(f, where = (i-1)*4, origin="current")
readBin(f, "double", n=1, size=4)
})
close(f)
return(x)
}
d <- diags(ids, b)
length(d)
summary(d)