1 Convert full bod file into text

Here, the bod file contains vertex-wise values for the discovery and replication sample.


${bind}/qsubshcom " "${bind}"/osca --befile "${wd}"/BodFiles/AllVertices.fwhm0.fsaverage.UKB15K --make-efile --out "${wd}"/BodFiles/AllVertices.fwhm0.fsaverage.UKB15K.txt " 1 80G MakeEfile 4:00:00 "-acct=UQ-IMB-CNSG"
 
mkdir -p ${wd}/LassoRidgePred

2 Run lasso and ridge (when available) using different R packages

We have made several scripts (RR_9.0) that use the different R packages that have implemented penalised regression. Note that with the size of the current data, only the bigstatsr package managed to perform the analyses.

All R scripts require the following arguments:
1. /working/directory/ : setwd() in R
2. VertexFile.txt : created above
3. outputFolder
4. variable name
5. lasso or ridge
6. n cpus to use

They require two lists of IDs in order to separate discovery and replication (train / test samples)
Here: ID_discovery_UKBanalysis.txt and ID_replication_UKBanalysis.txt

3 Using bigstatsr package (as in manuscript)

To our knowledge, this package does not implement ridge regression


cd $wd/LassoRidgePred

for mod in lasso
do 
${bind}/qsubshcom " Rscript --no-save 'RR_9.0_PenalisedRegressionFunction_bigstatsr_noBin.R' /working/directory AllVertices.fwhm0.fsaverage.UKB15K.txt LassoRidgePred \$(sed -n \"\${TASK_ID}{p;q}\" "${wd}"/UKB_Phenotypes_signif_Bslne.txt) "${mod}" 1" 1 300G R_pen_bigstatsr_noBin_${mod} 48:00:00 "-array=1-58 -acct=UQ-IMB-CNSG"
done

4 Using glmnet and biglasso packages

For us, it only worked on a subset of the data but could not scale to the full sample. For biglasso, simply replace RR_9.0_PenalisedRegressionFunction_glmnet.R by the biglasso equivalent.


cd $wd/LassoRidgePred

for mod in lasso ridge
do 
${bind}/qsubshcom " Rscript --no-save 'RR_9.0_PenalisedRegressionFunction_glmnet.R' /working/directory AllVertices.fwhm0.fsaverage.UKB15K.txt LassoRidgePred \$(sed -n \"\${TASK_ID}{p;q}\" "${wd}"/UKB_Phenotypes_signif_Bslne.txt) "${mod}" 1" 1 250G R_penalised_glmnet_${mod} 48:00:00 "-array=1-2 -acct=UQ-IMB-CNSG"
done

On full sample, we got error: Error in elnet(x, is.sparse, ix, jx, y, weights, offset, type.gaussian, : long vectors (argument 5) are not supported in .Fortran Calls: cv.glmnet -> cv.glmnet.raw -> glmnet -> elnet Execution halted

Similarily using biglasso, we got the error: Error in SetMatrixElements(x@address, as.double(j), as.double(i), as.double(value)) : long vectors not supported yet: ../../src/include/Rinlinedfuns.h:522 Calls: as.big.matrix … [<- -> .local -> SetElements.bm -> SetMatrixElements Execution halted

5 Open phenotype files

# Open 15K - january2019 data
TT=read.csv("UKB_phenotypes_15K_allData_regVars_Jan2019Update.csv")
# Using read_csv makes a df and does not allow to call TT[,columnName]

# Phenotype list
phe=read.csv("UKB_allvars_categories_fancylabels_withOldNames_Jan19Update.csv", stringsAsFactors = F)

# Subset 
signi=read.table("UKB_Phenotypes_signif_replication15K.txt")
phe=phe[which(phe$V4 %in% signi$V1),]

6 Read Lasso predictions

BRSdat=as.data.frame(TT$eid)
colnames(BRSdat)="eid"

for (iii in 1:length(phe$V4)){

  varLoop=phe$V4[iii]
  BRSvarLoop=paste0("BRS_lasso_", phe$V4[iii])
  
  if (file.exists(paste0("LassoRidgePred/lassoPredictor_", varLoop , "_bigstatsr.txt") )){
  dat=read.table( paste0("LassoRidgePred/lassoPredictor_", varLoop , "_bigstatsr.txt") , header=T)
  dat$FID=NULL
    dat$PHENO=NULL
     dat$CNT=NULL
  colnames(dat)=c("eid", BRSvarLoop)
  BRSdat=merge(BRSdat, dat, by="eid", all.x=T)
  print(dim(BRSdat))
  }
}

BRSdat=BRSdat[-which(is.na(BRSdat$BRS_lasso_Age_MRI)),]

# Add scores to the table
TT2=merge(TT, BRSdat, by="eid")

# Check
aa=cbind(colSums(is.na(TT2[,c(paste0("BRS_lasso_", phe$V4))])),colSums(is.na(TT2[,c(paste0( phe$V4))])) )

7 Evaluate prediction performance in replication set

library(lmtest)
library(pROC)
res=NULL

# Loop on subset of phenotypes for which BRS was calculated 
for (varLoop in phe$V4){

BRSvarLoop=paste0("BRS_lasso_", varLoop) # Name of the BLUP score
newVarLoop=varLoop # Phenotype to predict in independent sample
    
  # Fixed effect R2
m1<-lm(formula = as.formula(paste0(newVarLoop," ~  Age_MRI +  sexuses_datacoding_9_f31_0_0 + ICV + LThickness + RThickness + LSurfArea + RSurfArea "))  , data = TT2[which(!is.na(TT2[,BRSvarLoop])),])
 
m2<-lm(formula = as.formula(paste0(newVarLoop," ~ ", BRSvarLoop, "+ Age_MRI +  sexuses_datacoding_9_f31_0_0 + ICV + LThickness + RThickness + LSurfArea + RSurfArea "))  , data = TT2)
 
mICV<-as.data.frame(summary(m2)$coefficients)
# Add values of interest
mICV$R2<-NA
mICV[BRSvarLoop, "R2_cov"]<-summary(m1)$adj.r.squared
mICV[BRSvarLoop, "R2"]<-summary(m2)$adj.r.squared
lmt=lrtest(m1,m2)
mICV[BRSvarLoop, "pval"]<-lmt$`Pr(>Chisq)`[2]
mICV[BRSvarLoop, "chisQ"]<-lmt$Chisq[2]
mICV[BRSvarLoop, "df"]<-lmt$Df[2]
mICV[BRSvarLoop, "R2_final"]<-mICV[BRSvarLoop, "R2"]-mICV[BRSvarLoop, "R2_cov"]

# Get AUC for discrete variables
if (length(table(TT2[,newVarLoop]))<3){
mICV[BRSvarLoop, "auc"]<-auc(TT2[,newVarLoop], TT2[,BRSvarLoop])
mICV[BRSvarLoop, "auc.se"]<-sqrt(var(roc(TT2[,newVarLoop]~ TT2[,BRSvarLoop])))
} else{
mICV[BRSvarLoop, "auc"]<-NA
mICV[BRSvarLoop, "auc.se"]<-NA
}

res<-rbind(res,cbind(BRSvarLoop,  newVarLoop, mICV[BRSvarLoop,]) )

}

# Write results
write.csv(res, "Lasso_prediction_UKB_replicationsample5K.csv")

8 Compare absolute error made by BLUP and LASSO

8.1 Open blup scores and merge

BRSdat=as.data.frame(TT$eid)
colnames(BRSdat)="eid"
varsWithBRS=NULL

for (iii in 1:length(phe$V4)){

  varLoop=phe$V4[iii]
  BRSvarLoop=paste0("BRS_", phe$V4[iii])
  
  if (file.exists(paste0("BLUP_Bslne/BLUP.", varLoop , ".profile") )){
  dat=read.table( paste0("BLUP_Bslne/BLUP.", varLoop , ".profile") , header=T)
  dat$FID=NULL
    dat$PHENO=NULL
     dat$CNT=NULL
  colnames(dat)=c("eid", BRSvarLoop)
  BRSdat=merge(BRSdat, dat, by="eid")
  varsWithBRS=c(varsWithBRS, phe$V4[iii])
  }
}

# Add scores to the table
TT3=merge(TT2, BRSdat, by="eid")

8.2 Normalise blup and lasso scores, calculate and test Absolute Error

# Extract Discovery sample
TTD=TT[which(TT$DiscoverySample==1 & TT$DiscoveryQced==0),]

res=NULL
for (pheno in phe$V4){
 
  # Scale BLUP and LASSO score using mean and sd from discovery sample 
  TT3[,paste0("BRS_", pheno, "_std")]=scale(TT3[,paste0("BRS_", pheno)])*sd(TTD[,pheno], na.rm = T)+mean(TTD[,pheno], na.rm = T)
  TT3[,paste0("BRS_lasso_", pheno, "_std")]=scale(TT3[,paste0("BRS_lasso_", pheno)])*sd(TTD[,pheno], na.rm = T)+mean(TTD[,pheno], na.rm = T)
  
  # Calculate Absolute Error
  TT3[,paste0("AE_BLP_", pheno)]=abs(TT3[,paste0("BRS_", pheno, "_std")] - TT3[,paste0( pheno)] )
  TT3[,paste0("AE_LASSO_", pheno)]=abs(TT3[,paste0("BRS_lasso_", pheno, "_std")] - TT3[,paste0( pheno)] )
   
  # Perform paired t-test
tt=t.test(TT3[,paste0("AE_BLP_", pheno)], TT3[,paste0("AE_LASSO_", pheno)], paired = T)

  # Wilcoxon test on median (less sensitive to extreme values)
ww=wilcox.test(TT3[,paste0("AE_BLP_", pheno)], TT3[,paste0("AE_LASSO_", pheno)], paired = T)

  # Store results
res=rbind(res,c(pheno, tt$p.value,ww$p.value, mean(TT3[,paste0("AE_BLP_", pheno)], na.rm = T), mean(TT3[,paste0("AE_LASSO_", pheno)], na.rm = T) ))

}

colnames(res)=c("pheno", "pval_ttest","pval_wilcox", "mae_blup", "mae_lasso")
signifT=0.05/58
res[which(res[,2]<signifT),]

9 Final plots - BLUP vs. LASSO

blup=read.csv("BLUP_prediction_UKBreplication_Bslne.csv")
lasso=read.csv("Lasso_prediction_UKB_replicationsample5K.csv")

all=merge(blup, lasso, by.x="HCPvarLoop", by.y="newVarLoop" )
#all=all[which(all$pval.x <0.1 & all$pval.x>0.01),]

# Calculate SE and CI of R2
# Using exponential transformation - see Algina et al., 2010
all$z=log( (1+sqrt(all$R2_final.x)) / (1-sqrt(all$R2_final.x)) )
all$z1=all$z-2*1.96/sqrt(all$N)
all$z2=all$z+2*1.96/sqrt(all$N)

all$lb.x=((exp(all$z1)-1)/(exp(all$z1)+1))**2
all$lb.x[which(all$z1<0)]=0
all$ub.x=((exp(all$z2)-1)/(exp(all$z2)+1))**2

# LASSO
all$z=log( (1+sqrt(all$R2_final.y)) / (1-sqrt(all$R2_final.y)) )
all$z1=all$z-2*1.96/sqrt(all$N)
all$z2=all$z+2*1.96/sqrt(all$N)

all$lb.y=((exp(all$z1)-1)/(exp(all$z1)+1))**2
all$lb.y[which(all$z1<0)]=0
all$ub.y=((exp(all$z2)-1)/(exp(all$z2)+1))**2

# Add labels for plotting
phe=read.csv("UKB_allvars_categories_fancylabels_Jan19Update.csv")
all=merge(all, phe, by.x="HCPvarLoop", by.y="V4")

png("Prediction_BLUP_LASSO_replicationUKB.png", width = 15, height = 15, units = "cm", res = 400)
par(mar=c(4,4,1,1))
plot(all$R2_final.x, all$R2_final.y, pch=20, cex=1.5,ylab="Prediction R2 from LASSO", xlab="Prediction R2 from BLUP", xlim = c(0,0.6), ylim = c(0,0.6) )
abline(a=0, b=1, col="red")

# Variable labels
text(all$ub.x[which(all$R2_final.x>0.05)], all$R2_final.y[which(all$R2_final.x>0.05)], labels =all$V3[which(all$R2_final.x>0.05)], pos = 4, cex=0.5 )

# Confidence intervals
arrows(y0 = all$lb.y , x0 = all$R2_final.x, y1 = all$ub.y, x1 =all$R2_final.x, code = 3, angle = 90,length = 0.05  )
arrows(x0 = all$lb.x , y0 = all$R2_final.y, x1 = all$ub.x, y1 =all$R2_final.y, code = 3, angle = 90,length = 0.05  )

dev.off()

A work by by [Baptiste Couvy-Duchesne] - 28 April 2020

Penalised regression in R to compare with BLUP scores

by [Baptiste Couvy-Duchesne] - 28 April 2020 -