##NORMALIZATION #import raw data for milestones 1-6, sorted by Set, then by Dx1 set.seed(3) datFR<-read.delim("FinalRawData1_6.txt") #split into dataframe list by Set datFRL<-split(datFR, datFR$Set) #Create individual dataframes from list datS1<-as.data.frame(datFRL[1]) datS2<-as.data.frame(datFRL[2]) datS3_4<-as.data.frame(datFRL[3]) datS5<-as.data.frame(datFRL[4]) datS6<-as.data.frame(datFRL[5]) #identify healthy controls which(datS1$Set_1.Dx1 == "CON") #61:120 from output which(datS2$Set_2.Dx1 == "CON") #1:14 which(datS3_4$Set_3_4.Dx1 == "CON") #41:139 which(datS5$Set_5.Dx1 == "CON") #1:288 which(datS6$Set_6.Dx1 == "CON") #33:72 #create objects for log2 transform analyte1<-as.matrix(datS1[,10:40]) analyte2<-as.matrix(datS2[,10:40]) analyte3<-as.matrix(datS3_4[,10:40]) analyte4<-as.matrix(datS5[,10:40]) analyte5<-as.matrix(datS6[,10:40]) #log transform analyte1L<-log2(analyte1) analyte2L<-log2(analyte2) analyte3L<-log2(analyte3) analyte4L<-log2(analyte4) analyte5L<-log2(analyte5) #generate vector for scaling to SD of CON sdCON1<-apply(analyte1L[61:120,],2,sd) sdCON2<-apply(analyte2L[1:14,],2,sd) sdCON3<-apply(analyte3L[41:139,],2,sd) sdCON4<-apply(analyte4L[1:288,],2,sd) sdCON5<-apply(analyte5L[33:72,],2,sd) #center and divide by SD of CON dat1cs<-scale(analyte1L, center = TRUE, scale = sdCON1) dat2cs<-scale(analyte2L, center = TRUE, scale = sdCON2) dat3cs<-scale(analyte3L, center = TRUE, scale = sdCON3) dat4cs<-scale(analyte4L, center = TRUE, scale = sdCON4) dat5cs<-scale(analyte5L, center = TRUE, scale = sdCON5) #combine with decode for final data objects datS1.adj<-cbind(datS1[,1:9], dat1cs) datS2.adj<-cbind(datS2[,1:9], dat2cs) datS3_4.adj<-cbind(datS3_4[,1:9], dat3cs) datS5.adj<-cbind(datS5[,1:9], dat4cs) datS6.adj<-cbind(datS6[,1:9], dat5cs) #rename columns using original data file names(datS1.adj)<-names(datFR) names(datS2.adj)<-names(datFR) names(datS3_4.adj)<-names(datFR) names(datS5.adj)<-names(datFR) names(datS6.adj)<-names(datFR) #save individual data files write.csv(datS1.adj, file = "M1logcentscale.csv") write.csv(datS2.adj, file = "M2logcentscale.csv") write.csv(datS3_4.adj, file = "M3_4logcentscale.csv") write.csv(datS5.adj, file = "M5logcentscale.csv") write.csv(datS6.adj, file = "M6logcentscale.csv") #recombine for analysis dat1_6.adj<-rbind(datS1.adj, datS2.adj, datS3_4.adj, datS5.adj, datS6.adj) write.csv(dat1_6.adj, file = "dat1_6.adj.csv") ##MODELING #develop GBM model using log2/center/scale and CON norm data library(caret) library(caretEnsemble) library(doParallel) registerDoParallel(cores = 10) library(gbm) library(glmnet) library(nnet) library(rpart) library(kknn) #PARTITION SETS #use sample split from FIXED algorithm, identified in column 1 trainSplit<-dat1_6.adj[dat1_6.adj$Split=="Train",] testSplit<-dat1_6.adj[dat1_6.adj$Split == "Test" ,] valSplit<-dat1_6.adj[dat1_6.adj$Split == "Val" ,] #save split data write.csv(trainSplit,file = "SplitTrain.csv" ) write.csv(testSplit,file = "SplitTest.csv" ) write.csv(valSplit,file = "SplitTest6.csv" ) #CREATE DATA OBJECTS #truncate TRAIN dataframe to include only predictors and factor (Dx) datTrtemp<-trainSplit[,6:40] datTr<-datTrtemp[,-2] #remove Stage #truncate TEST dataframe to include only predictors and factor (Dx) datTetemp<-testSplit[,6:40] datte<-datTetemp[,-2] #remove Stage #truncate VALIDATION dataframe to include only predictors datTe6<-valSplit[,8:40] #Milestone 6 data #create empty dataframes for probability collection trprob<-data.frame(matrix(nrow = 669, ncol = 0)) teprob<-data.frame(matrix(nrow = 168, ncol = 0)) te6prob<-data.frame(matrix(nrow = 186, ncol = 0)) #SET UP #control with index, need all models to use the same folds control <- trainControl(method="repeatedcv", number=10, index = createFolds(datTr$Dx, 10), repeats=10, savePredictions="all", search = "random", classProbs=TRUE, summaryFunction=twoClassSummary) #control for Ensemble stackControl <- trainControl(method="repeatedcv", number=10, repeats=10, savePredictions="all", classProbs=TRUE, summaryFunction=twoClassSummary) #create submodel lists algorithmList <- c('glmnet', 'svmRadial', 'rf', 'nnet', 'knn') #TRAIN/TEST BLOCKS #create submodels RR set.seed(1618) models.R <- caretList(Dx~., data=datTr, preProc = c("center", "scale"), metric = "ROC", trControl=control, methodList=algorithmList) # stack using gbm set.seed(7) stack.gbm.RR <- caretStack(models.R, method="gbm", metric = "ROC", #verbose=FALSE, tuneLength=10, trControl=stackControl) #resample train RR.tr.prob <- predict(stack.gbm.RR, newdata = datTr, type = "prob") trprob<-cbind(trprob, RR.tr.prob) #predict test sets RR.te6.prob <- predict(stack.gbm.RR, newdata = datTe6, type = "prob") te6prob<-cbind(te6prob, RR.te6.prob) RR.te.prob <- predict(stack.gbm.RR, newdata = datte, type = "prob") teprob<-cbind(teprob, RR.te.prob) #create submodels RSe #use previous models.R # stack using gbm set.seed(7) stack.gbm.RSe <- caretStack(models.R, method="gbm", metric = "Sens", #verbose=FALSE, tuneLength=10, trControl=stackControl) #resample train RSe.tr.prob <- predict(stack.gbm.RSe, newdata = datTr, type = "prob") trprob<-cbind(trprob, RSe.tr.prob) #predict test sets RSe.te6.prob <- predict(stack.gbm.RSe, newdata = datTe6, type = "prob") te6prob<-cbind(te6prob, RSe.te6.prob) RSe.te.prob <- predict(stack.gbm.RSe, newdata = datte, type = "prob") teprob<-cbind(teprob, RSe.te.prob) #create submodels RSp #use previous models.R # stack using gbm set.seed(7) stack.gbm.RSp <- caretStack(models.R, method="gbm", metric = "Spec", #verbose=FALSE, tuneLength=10, trControl=stackControl) #resample train RSp.tr.prob <- predict(stack.gbm.RSp, newdata = datTr, type = "prob") trprob<-cbind(trprob, RSp.tr.prob) #predict test sets RSp.te6.prob <- predict(stack.gbm.RSp, newdata = datTe6, type = "prob") te6prob<-cbind(te6prob, RSp.te6.prob) RSp.te.prob <- predict(stack.gbm.RSp, newdata = datte, type = "prob") teprob<-cbind(teprob, RSp.te.prob) #create submodels SeR set.seed(1618) models.Se <- caretList(Dx~., data=datTr, preProc = c("center", "scale"), metric = "Sens", trControl=control, methodList=algorithmList) # stack using gbm set.seed(7) stack.gbm.SeR <- caretStack(models.Se, method="gbm", metric = "ROC", #verbose=FALSE, tuneLength=10, trControl=stackControl) #resample train SeR.tr.prob <- predict(stack.gbm.SeR, newdata = datTr, type = "prob") trprob<-cbind(trprob, SeR.tr.prob) #predict test SeR.te6.prob <- predict(stack.gbm.SeR, newdata = datTe6, type = "prob") te6prob<-cbind(te6prob, SeR.te6.prob) SeR.te.prob <- predict(stack.gbm.SeR, newdata = datte, type = "prob") teprob<-cbind(teprob, SeR.te.prob) #create submodels SeSp #use previous models.Se # stack using gbm set.seed(7) stack.gbm.SeSp <- caretStack(models.Se, method="gbm", metric = "Spec", #verbose=FALSE, tuneLength=10, trControl=stackControl) #resample train SeSp.tr.prob <- predict(stack.gbm.SeSp, newdata = datTr, type = "prob") trprob<-cbind(trprob, SeSp.tr.prob) #predict test SeSp.te6.prob <- predict(stack.gbm.SeSp, newdata = datTe6, type = "prob") te6prob<-cbind(te6prob, SeSp.te6.prob) SeSp.te.prob <- predict(stack.gbm.SeSp, newdata = datte, type = "prob") teprob<-cbind(teprob, SeSp.te.prob) #create submodels SeSe #use previous models.Se # stack using gbm set.seed(7) stack.gbm.SeSe <- caretStack(models.Se, method="gbm", metric = "Sens", #verbose=FALSE, tuneLength=10, trControl=stackControl) #resample train SeSe.tr.prob <- predict(stack.gbm.SeSe, newdata = datTr, type = "prob") trprob<-cbind(trprob, SeSe.tr.prob) #predict test SeSe.te6.prob <- predict(stack.gbm.SeSe, newdata = datTe6, type = "prob") te6prob<-cbind(te6prob, SeSe.te6.prob) SeSe.te.prob <- predict(stack.gbm.SeSe, newdata = datte, type = "prob") teprob<-cbind(teprob, SeSe.te.prob) #create submodels SpR set.seed(1618) models.Sp <- caretList(Dx~., data=datTr, preProc = c("center", "scale"), metric = "Spec", trControl=control, methodList=algorithmList) # stack using gbm set.seed(7) stack.gbm.SpR <- caretStack(models.Sp, method="gbm", metric = "ROC", #verbose=FALSE, tuneLength=10, trControl=stackControl) #resample train SpR.tr.prob <- predict(stack.gbm.SpR, newdata = datTr, type = "prob") trprob<-cbind(trprob, SpR.tr.prob) #predict test SpR.te6.prob <- predict(stack.gbm.SpR, newdata = datTe6, type = "prob") te6prob<-cbind(te6prob, SpR.te6.prob) SpR.te.prob <- predict(stack.gbm.SpR, newdata = datte, type = "prob") teprob<-cbind(teprob, SpR.te.prob) #create submodels SpSe #use previous models.Sp # stack using gbm set.seed(7) stack.gbm.SpSe <- caretStack(models.Sp, method="gbm", metric = "Sens", #verbose=FALSE, tuneLength=10, trControl=stackControl) #resample train SpSe.tr.prob <- predict(stack.gbm.SpSe, newdata = datTr, type = "prob") trprob<-cbind(trprob, SpSe.tr.prob) #predict test SpSe.te6.prob <- predict(stack.gbm.SpSe, newdata = datTe6, type = "prob") te6prob<-cbind(te6prob, SpSe.te6.prob) SpSe.te.prob <- predict(stack.gbm.SpSe, newdata = datte, type = "prob") teprob<-cbind(teprob, SpSe.te.prob) #create submodels SpSp #use previous models.Sp # stack using gbm set.seed(7) stack.gbm.SpSp <- caretStack(models.Sp, method="gbm", metric = "Spec", #verbose=FALSE, tuneLength=10, trControl=stackControl) #resample train SpSp.tr.prob <- predict(stack.gbm.SpSp, newdata = datTr, type = "prob") trprob<-cbind(trprob, SpSp.tr.prob) #predict test SpSp.te6.prob <- predict(stack.gbm.SpSp, newdata = datTe6, type = "prob") te6prob<-cbind(te6prob, SpSp.te6.prob) SpSp.te.prob <- predict(stack.gbm.SpSp, newdata = datte, type = "prob") teprob<-cbind(teprob, SpSp.te.prob) #add row means trprob<- cbind(trprob, rowMeans(trprob[1:669,])) teprob<-cbind(teprob, rowMeans(teprob[1:168,])) te6prob<-cbind(te6prob, rowMeans(te6prob[1:186,])) #add decode trprob<- cbind(trainSplit[,5:6],trprob) teprob<-cbind(testSplit[,5:6] ,teprob) te6prob<-cbind(valSplit[,5:6] ,te6prob) #add prediction column based on mean probability (PDAC if <.5) trprob<- cbind(trprob, trprob[,12] <0.5) colnames(trprob)[13] = "PredictPDAC" teprob<- cbind(teprob, teprob[,12] <0.5) colnames(teprob)[13] = "PredictPDAC" te6prob<- cbind(te6prob, te6prob[,12] <0.5) colnames(te6prob)[13] = "PredictPDAC" #add logical actual PDAC for confusion matrix trprob<- cbind(trprob, trprob[,2] == "PDAC") colnames(trprob)[14] = "ActualPDAC" teprob<- cbind(teprob, teprob[,2] == "PDAC") colnames(teprob)[14] = "ActualPDAC" te6prob<- cbind(te6prob, te6prob[,2] == "PDAC") colnames(te6prob)[14] = "ActualPDAC" #save results write.csv(trprob,file = "TrainProb.csv" ) write.csv(te6prob,file = "Test6Prob.csv" ) write.csv(teprob,file = "TestProb.csv" ) #Confusion Matrix from caret CM.tr<-confusionMatrix(as.factor(trprob$PredictPDAC), as.factor(trprob$ActualPDAC), positive = "TRUE") CM.te<-confusionMatrix(as.factor(teprob$PredictPDAC), as.factor(teprob$ActualPDAC), positive = "TRUE") CM.te6<-confusionMatrix(as.factor(te6prob$PredictPDAC), as.factor(te6prob$ActualPDAC), positive = "TRUE") CM.tr CM.te CM.te6