Here, the bod file contains vertex-wise values for the discovery and replication sample.
We have made several scripts (RR_9.0) that use the different R packages that have implemented penalised regression. Note that with the size of the current data, only the bigstatsr package managed to perform the analyses.
All R scripts require the following arguments:
1. /working/directory/ : setwd() in R
2. VertexFile.txt : created above
3. outputFolder
4. variable name
5. lasso or ridge
6. n cpus to use
They require two lists of IDs in order to separate discovery and replication (train / test samples)
Here: ID_discovery_UKBanalysis.txt and ID_replication_UKBanalysis.txt
To our knowledge, this package does not implement ridge regression
cd $wd/LassoRidgePred
for mod in lasso
do
${bind}/qsubshcom " Rscript --no-save 'RR_9.0_PenalisedRegressionFunction_bigstatsr_noBin.R' /working/directory AllVertices.fwhm0.fsaverage.UKB15K.txt LassoRidgePred \$(sed -n \"\${TASK_ID}{p;q}\" "${wd}"/UKB_Phenotypes_signif_Bslne.txt) "${mod}" 1" 1 300G R_pen_bigstatsr_noBin_${mod} 48:00:00 "-array=1-58 -acct=UQ-IMB-CNSG"
done
For us, it only worked on a subset of the data but could not scale to the full sample. For biglasso, simply replace RR_9.0_PenalisedRegressionFunction_glmnet.R by the biglasso equivalent.
cd $wd/LassoRidgePred
for mod in lasso ridge
do
${bind}/qsubshcom " Rscript --no-save 'RR_9.0_PenalisedRegressionFunction_glmnet.R' /working/directory AllVertices.fwhm0.fsaverage.UKB15K.txt LassoRidgePred \$(sed -n \"\${TASK_ID}{p;q}\" "${wd}"/UKB_Phenotypes_signif_Bslne.txt) "${mod}" 1" 1 250G R_penalised_glmnet_${mod} 48:00:00 "-array=1-2 -acct=UQ-IMB-CNSG"
done
On full sample, we got error: Error in elnet(x, is.sparse, ix, jx, y, weights, offset, type.gaussian, : long vectors (argument 5) are not supported in .Fortran Calls: cv.glmnet -> cv.glmnet.raw -> glmnet -> elnet Execution halted
Similarily using biglasso, we got the error: Error in SetMatrixElements(x@address, as.double(j), as.double(i), as.double(value)) : long vectors not supported yet: ../../src/include/Rinlinedfuns.h:522 Calls: as.big.matrix … [<- -> .local -> SetElements.bm -> SetMatrixElements Execution halted
# Open 15K - january2019 data
TT=read.csv("UKB_phenotypes_15K_allData_regVars_Jan2019Update.csv")
# Using read_csv makes a df and does not allow to call TT[,columnName]
# Phenotype list
phe=read.csv("UKB_allvars_categories_fancylabels_withOldNames_Jan19Update.csv", stringsAsFactors = F)
# Subset
signi=read.table("UKB_Phenotypes_signif_replication15K.txt")
phe=phe[which(phe$V4 %in% signi$V1),]
BRSdat=as.data.frame(TT$eid)
colnames(BRSdat)="eid"
for (iii in 1:length(phe$V4)){
varLoop=phe$V4[iii]
BRSvarLoop=paste0("BRS_lasso_", phe$V4[iii])
if (file.exists(paste0("LassoRidgePred/lassoPredictor_", varLoop , "_bigstatsr.txt") )){
dat=read.table( paste0("LassoRidgePred/lassoPredictor_", varLoop , "_bigstatsr.txt") , header=T)
dat$FID=NULL
dat$PHENO=NULL
dat$CNT=NULL
colnames(dat)=c("eid", BRSvarLoop)
BRSdat=merge(BRSdat, dat, by="eid", all.x=T)
print(dim(BRSdat))
}
}
BRSdat=BRSdat[-which(is.na(BRSdat$BRS_lasso_Age_MRI)),]
# Add scores to the table
TT2=merge(TT, BRSdat, by="eid")
# Check
aa=cbind(colSums(is.na(TT2[,c(paste0("BRS_lasso_", phe$V4))])),colSums(is.na(TT2[,c(paste0( phe$V4))])) )
library(lmtest)
library(pROC)
res=NULL
# Loop on subset of phenotypes for which BRS was calculated
for (varLoop in phe$V4){
BRSvarLoop=paste0("BRS_lasso_", varLoop) # Name of the BLUP score
newVarLoop=varLoop # Phenotype to predict in independent sample
# Fixed effect R2
m1<-lm(formula = as.formula(paste0(newVarLoop," ~ Age_MRI + sexuses_datacoding_9_f31_0_0 + ICV + LThickness + RThickness + LSurfArea + RSurfArea ")) , data = TT2[which(!is.na(TT2[,BRSvarLoop])),])
m2<-lm(formula = as.formula(paste0(newVarLoop," ~ ", BRSvarLoop, "+ Age_MRI + sexuses_datacoding_9_f31_0_0 + ICV + LThickness + RThickness + LSurfArea + RSurfArea ")) , data = TT2)
mICV<-as.data.frame(summary(m2)$coefficients)
# Add values of interest
mICV$R2<-NA
mICV[BRSvarLoop, "R2_cov"]<-summary(m1)$adj.r.squared
mICV[BRSvarLoop, "R2"]<-summary(m2)$adj.r.squared
lmt=lrtest(m1,m2)
mICV[BRSvarLoop, "pval"]<-lmt$`Pr(>Chisq)`[2]
mICV[BRSvarLoop, "chisQ"]<-lmt$Chisq[2]
mICV[BRSvarLoop, "df"]<-lmt$Df[2]
mICV[BRSvarLoop, "R2_final"]<-mICV[BRSvarLoop, "R2"]-mICV[BRSvarLoop, "R2_cov"]
# Get AUC for discrete variables
if (length(table(TT2[,newVarLoop]))<3){
mICV[BRSvarLoop, "auc"]<-auc(TT2[,newVarLoop], TT2[,BRSvarLoop])
mICV[BRSvarLoop, "auc.se"]<-sqrt(var(roc(TT2[,newVarLoop]~ TT2[,BRSvarLoop])))
} else{
mICV[BRSvarLoop, "auc"]<-NA
mICV[BRSvarLoop, "auc.se"]<-NA
}
res<-rbind(res,cbind(BRSvarLoop, newVarLoop, mICV[BRSvarLoop,]) )
}
# Write results
write.csv(res, "Lasso_prediction_UKB_replicationsample5K.csv")
BRSdat=as.data.frame(TT$eid)
colnames(BRSdat)="eid"
varsWithBRS=NULL
for (iii in 1:length(phe$V4)){
varLoop=phe$V4[iii]
BRSvarLoop=paste0("BRS_", phe$V4[iii])
if (file.exists(paste0("BLUP_Bslne/BLUP.", varLoop , ".profile") )){
dat=read.table( paste0("BLUP_Bslne/BLUP.", varLoop , ".profile") , header=T)
dat$FID=NULL
dat$PHENO=NULL
dat$CNT=NULL
colnames(dat)=c("eid", BRSvarLoop)
BRSdat=merge(BRSdat, dat, by="eid")
varsWithBRS=c(varsWithBRS, phe$V4[iii])
}
}
# Add scores to the table
TT3=merge(TT2, BRSdat, by="eid")
# Extract Discovery sample
TTD=TT[which(TT$DiscoverySample==1 & TT$DiscoveryQced==0),]
res=NULL
for (pheno in phe$V4){
# Scale BLUP and LASSO score using mean and sd from discovery sample
TT3[,paste0("BRS_", pheno, "_std")]=scale(TT3[,paste0("BRS_", pheno)])*sd(TTD[,pheno], na.rm = T)+mean(TTD[,pheno], na.rm = T)
TT3[,paste0("BRS_lasso_", pheno, "_std")]=scale(TT3[,paste0("BRS_lasso_", pheno)])*sd(TTD[,pheno], na.rm = T)+mean(TTD[,pheno], na.rm = T)
# Calculate Absolute Error
TT3[,paste0("AE_BLP_", pheno)]=abs(TT3[,paste0("BRS_", pheno, "_std")] - TT3[,paste0( pheno)] )
TT3[,paste0("AE_LASSO_", pheno)]=abs(TT3[,paste0("BRS_lasso_", pheno, "_std")] - TT3[,paste0( pheno)] )
# Perform paired t-test
tt=t.test(TT3[,paste0("AE_BLP_", pheno)], TT3[,paste0("AE_LASSO_", pheno)], paired = T)
# Wilcoxon test on median (less sensitive to extreme values)
ww=wilcox.test(TT3[,paste0("AE_BLP_", pheno)], TT3[,paste0("AE_LASSO_", pheno)], paired = T)
# Store results
res=rbind(res,c(pheno, tt$p.value,ww$p.value, mean(TT3[,paste0("AE_BLP_", pheno)], na.rm = T), mean(TT3[,paste0("AE_LASSO_", pheno)], na.rm = T) ))
}
colnames(res)=c("pheno", "pval_ttest","pval_wilcox", "mae_blup", "mae_lasso")
signifT=0.05/58
res[which(res[,2]<signifT),]
blup=read.csv("BLUP_prediction_UKBreplication_Bslne.csv")
lasso=read.csv("Lasso_prediction_UKB_replicationsample5K.csv")
all=merge(blup, lasso, by.x="HCPvarLoop", by.y="newVarLoop" )
#all=all[which(all$pval.x <0.1 & all$pval.x>0.01),]
# Calculate SE and CI of R2
# Using exponential transformation - see Algina et al., 2010
all$z=log( (1+sqrt(all$R2_final.x)) / (1-sqrt(all$R2_final.x)) )
all$z1=all$z-2*1.96/sqrt(all$N)
all$z2=all$z+2*1.96/sqrt(all$N)
all$lb.x=((exp(all$z1)-1)/(exp(all$z1)+1))**2
all$lb.x[which(all$z1<0)]=0
all$ub.x=((exp(all$z2)-1)/(exp(all$z2)+1))**2
# LASSO
all$z=log( (1+sqrt(all$R2_final.y)) / (1-sqrt(all$R2_final.y)) )
all$z1=all$z-2*1.96/sqrt(all$N)
all$z2=all$z+2*1.96/sqrt(all$N)
all$lb.y=((exp(all$z1)-1)/(exp(all$z1)+1))**2
all$lb.y[which(all$z1<0)]=0
all$ub.y=((exp(all$z2)-1)/(exp(all$z2)+1))**2
# Add labels for plotting
phe=read.csv("UKB_allvars_categories_fancylabels_Jan19Update.csv")
all=merge(all, phe, by.x="HCPvarLoop", by.y="V4")
png("Prediction_BLUP_LASSO_replicationUKB.png", width = 15, height = 15, units = "cm", res = 400)
par(mar=c(4,4,1,1))
plot(all$R2_final.x, all$R2_final.y, pch=20, cex=1.5,ylab="Prediction R2 from LASSO", xlab="Prediction R2 from BLUP", xlim = c(0,0.6), ylim = c(0,0.6) )
abline(a=0, b=1, col="red")
# Variable labels
text(all$ub.x[which(all$R2_final.x>0.05)], all$R2_final.y[which(all$R2_final.x>0.05)], labels =all$V3[which(all$R2_final.x>0.05)], pos = 4, cex=0.5 )
# Confidence intervals
arrows(y0 = all$lb.y , x0 = all$R2_final.x, y1 = all$ub.y, x1 =all$R2_final.x, code = 3, angle = 90,length = 0.05 )
arrows(x0 = all$lb.x , y0 = all$R2_final.y, x1 = all$ub.x, y1 =all$R2_final.y, code = 3, angle = 90,length = 0.05 )
dev.off()
A work by by [Baptiste Couvy-Duchesne] - 28 April 2020