Load the packages BGData
, BGLR
, data.table
, and qqman
:
library(BGData)
library(BGLR)
library(data.table)
library(qqman)
Set paths:
genoPath
is the directory containing the .bed+.bim+.fam filesets (following the naming scheme chrom{1..22,X,Y,XY,MT}
)phenoPath
is the directory containing a preprepared phenotype file called phenotypes.csv
with two initial columns FID
and IID
sampleQcPath
is the directory containing the Sample-QC file (version 1)withdrawalsPath
is the directory containing the project-specific list of samples that have withdrawn from UKBLoad each chromosome as a BEDMatrix object and link them by columns to a ColumnLinkedMatrix object:
X <- as.ColumnLinkedMatrix(lapply(c(1:22, "X", "Y", "XY", "MT"), function(chrom) {
BEDMatrix(paste0(genoPath, "/chrom", chrom))
}))
rownames(X) <- sapply(strsplit(rownames(X), "_"), `[`, 1) # convert names from FID_IID to eid
Combine genotypes and phenotypes into a BGData object:
DATA <- as.BGData(X, alternatePhenotypeFile = paste0(phenoPath, "/phenotypes.csv"))
allSamples <- rownames(DATA@geno)
allVariants <- colnames(DATA@geno)
The original 26 .bed files (autosomal chromosomes, sex chromosomes, and mitochondrial DNA) contain 805426 variants for each of the 488377 samples.
Determine white samples using the in.white.British.ancestry.subset
column of the sample QC file:
sampleQC <- fread(paste0(sampleQcPath, "/ukb_sqc_v2.txt"), select = 24, col.names = "in.white.British.ancestry.subset", data.table = FALSE)
whiteSamples <- allSamples[sampleQC$in.white.British.ancestry.subset == "1"]
Load withdrawals and remove them from white British cohort:
withdrawals <- scan(paste0(withdrawalsPath, "/w15326.csv"), what = "")
whiteSamples <- setdiff(whiteSamples, withdrawals)
There are 409637 white Caucasian samples.
Compute summary statistics for each variant among all white British samples:
summaries <- summarize(DATA@geno, i = whiteSamples, j = allVariants, chunkSize = 2500, nCores = 4)
Remove variants with a minor allele frequency of <1% and a missing call rate of >5%:
maf <- ifelse(summaries$allele_freq > 0.5, 1 - summaries$allele_freq, summaries$allele_freq)
qcedVariants <- allVariants[maf >= 0.01 & summaries$freq_na <= 0.05]
There are 624523 variants after filtering.
Compute 5 PCs based on a singular-value decomposition and 5000 evenly spaced variants:
evenlySpacedVariants <- round(seq(from = 100, to = ncol(DATA@geno) - 100, length.out = 5000))
svdX <- DATA@geno[unrelatedWhiteSamples, evenlySpacedVariants]
svdX[is.na(svdX)] <- 0 # impute missing values
svdX <- svdX[, apply(svdX, 2, sd) > 0] # remove constant columns
svdX <- scale(svdX)
SVD <- svd(svdX, nu = 5, nv = 0)
PCs <- data.frame(IID = unrelatedWhiteSamples, pc1 = SVD$u[, 1], pc2 = SVD$u[, 2], pc3 = SVD$u[, 3], pc4 = SVD$u[, 4], pc5 = SVD$u[, 5], stringsAsFactors = FALSE)
Merge the PCs into DATA@pheno
:
DATA@pheno <- orderedMerge(DATA@pheno, PCs, by = "IID")
Remove outliers for height:
DATA@pheno$height[DATA@pheno$height < 147 & DATA@pheno$height > 210] <- NA
Adjust for sex, age, batch (named PHENOTYPE
in the .fam file), center, and the 5 PCs:
adjModel <- lm(height ~ factor(SEX) + age + factor(PHENOTYPE) + factor(center) + pc1 + pc2 + pc3 + pc4 + pc5, data = DATA@pheno, subset = unrelatedWhiteSamples, na.action = na.exclude)
adjustedPhenotypes <- data.frame(IID = unrelatedWhiteSamples, adjusted_height = residuals(adjModel), stringsAsFactors = FALSE)
Merge adjusted height into DATA@pheno
:
DATA@pheno <- orderedMerge(DATA@pheno, adjustedPhenotypes, by = "IID")
Remove samples with missing values:
phenotypedUnrelatedWhiteSamples <- unrelatedWhiteSamples[!is.na(DATA@pheno[unrelatedWhiteSamples, "adjusted_height"])]
Build test set containing 10,000 samples and put the remaining samples into the training set:
testSet <- sample(phenotypedUnrelatedWhiteSamples, size = 10000)
trainingSet <- setdiff(phenotypedUnrelatedWhiteSamples, testSet)
There are 222648 samples in the training set.
Perform a genome-wide association study (GWAS) on the training set, regressing adjusted height on one variant at a time using rayOLS
as regression method:
GWAS <- GWAS(adjusted_height ~ 1, data = DATA, method = "rayOLS", i = trainingSet, j = qcedVariants, chunkSize = 2500, nCores = 4)
pValues <- GWAS[, 4]
Produce a Manhattan plot:
manhattan <- data.frame(
SNP = qcedVariants,
CHR = DATA@map[qcedVariants, "chromosome"],
P = pValues,
BP = DATA@map[qcedVariants, "base_pair_position"],
stringsAsFactors = FALSE
)
manhattan(manhattan, suggestiveline = FALSE, genomewideline = FALSE, cex = 0.4, cex.axis = 0.7, cex.lab = 0.7)
Identify the top-p variants (i.e., the p variants with the smallest p-value) from the GWAS result and group them into count-based sets of 500, 1,000, 2,000, 5,000, 10,000, 20,000, 30,000, 40,000, and 50,000. Shown here the code for a variant set consisting of four groups: 500-1,000, 1,001-2,000, 2,001-5,000, and 5,001-10,000.
variantSet <- list(
1:500,
501:1000,
1001:2000,
2001:5000,
5001:10000
)
rankedVariants <- order(pValues)
Fit a Bayesian regression model with the training set and an increasing number of variant groups per set using the BGLR package using the BayesB
method:
yTRN <- DATA@pheno[trainingSet, "adjusted_height"]
ETA <- vector(mode = "list", length = length(variantSet))
for (group in seq_along(variantSet)) {
variantSelection <- qcedVariants[rankedVariants[variantSet[[group]]]]
xTRN <- DATA@geno[trainingSet, variantSelection]
sdTRN <- summaries[variantSelection, "sd"]
muTRN <- summaries[variantSelection, "allele_freq"] * 2
for (col in 1:ncol(xTRN)) {
x <- xTRN[, col]
x <- (x - muTRN[col]) / sdTRN[col]
x[is.na(x)] <- 0
xTRN[, col] <- x
}
ETA[[group]] <- list(X = xTRN, model = "BayesB")
}
# Fit model
fm <- BGLR(y = yTRN, ETA = ETA, nIter = 7000, burnIn = 2000)
Predict height for samples in the test set using the estimated effects from the previous top-p set and SNP genotypes:
yTST <- DATA@pheno[testSet, "adjusted_height"]
variantSelection <- qcedVariants[rankedVariants[unlist(variantSet)]]
xTST <- DATA@geno[testSet, variantSelection]
sdTST <- summaries[variantSelection, "sd"]
muTST <- summaries[variantSelection, "allele_freq"] * 2
for (col in 1:ncol(xTST)) {
x <- xTST[, col]
x <- (x - muTST[col]) / sdTST[col]
x[is.na(x)] <- 0
xTST[, col] <- x
}
bHat <- do.call("c", lapply(fm$ETA[seq_along(fm$ETA)], function(x) x$b))
yHatTST <- xTST %*% bHat + fm$mu
Correlate the predicted height with the sex-age adjusted height:
cor(yTST, yHatTST)
## [,1]
## [1,] 0.5273258
cor(yTST, yHatTST)^2
## [,1]
## [1,] 0.2780725