Boruta is a 'wrapper algorithm for all relevant feature selection'.
# Run Boruta 100 times and keep the features confirmed in more than 10 runs.

# One Boruta run on ml.Spear. Returns the row names of the attribute statistics
# whose decision is "Confirmed". `i` is only the run index supplied by
# mclapply(); it is not used inside the run.
multi <- function(i) {
  fit.boruta <- Boruta(factor(group) ~ ., data = ml.Spear, maxRuns = 300, pValue = 0.01)
  boruta.df <- data.frame(attStats(fit.boruta))
  # Return the names rather than assigning into a global list: an assignment
  # made inside the function only changes a copy local to that call.
  rownames(boruta.df[boruta.df$decision == "Confirmed", ])
}
multi.boruta <- parallel::mclapply(1:100, multi)
# Count, per feature, the number of runs in which it was confirmed. This gives
# the (feature, Freq) table that the filter below indexes; mclapply() itself
# returns a plain list, which has neither a Freq element nor columns.
multi.boruta <- as.data.frame(table(unlist(multi.boruta)))
multi.boruta.mirs <- as.character(multi.boruta[which(multi.boruta$Freq > 10), 1])
# Build the model formula from the Boruta-selected features and score the
# fitted model (fit.Boruta, defined elsewhere) on the setB samples.
ml.Spear$group <- as.factor(ml.Spear$group)
# Right-hand side of the formula: selected feature names joined with "+".
m <- paste(multi.boruta.mirs, collapse = "+")
RFm <- as.formula(paste0("group ~ ", m))
# NOTE(review): the original line fused this subset onto the formula
# assignment and the name on its left-hand side was lost. `train.Boruta` is a
# placeholder -- confirm the intended name against the code that fits
# fit.Boruta.
train.Boruta <- ml.Spear[, c("group", multi.boruta.mirs)]
tree.Boruta <- setB[, c("group", multi.boruta.mirs)]
RFBoruta.train <- predict(fit.Boruta, tree.Boruta)
# Confusion matrix of predicted vs. observed group, with "PH" as the positive class.
BorutaInterim <- confusionMatrix(as.factor(RFBoruta.train), as.factor(tree.Boruta$group), positive = "PH")
The rpart function from the rpart package can be used to grow a classification (or regression) tree. The tree is built by splitting the data on the single variable that best separates the samples into two groups. Once the data are separated, the same process is applied recursively to each sub-group until the sub-groups reach a minimum size (defined as 3 below) or no further improvement can be made.

Rpart uses a variable selection algorithm called recursive feature elimination (RFE, also known as backward selection).
- `minsplit`: the minimum number of observations that must exist in a node in order for a split to be attempted
- `minbucket`: the minimum number of observations in any terminal node
# Draw the final tree held in the caret fit, in two renderings.
prp(fit.caret.rpart$finalModel, main = "PH from HV Rpart model", extra = 2, varlen = 0)
# NOTE(review): the first argument of this call was truncated to
# `$finalModel)` in the original. It is reconstructed here as the same final
# model converted with partykit::as.party(), because `drop_terminal` is an
# argument of partykit's tree plot -- confirm against the original document.
plot(partykit::as.party(fit.caret.rpart$finalModel), main = "PH from HV Rpart model", drop_terminal = FALSE)
# Strip everything from the first "<" or ">" onwards from each entry of
# rpartmirs (defined elsewhere), then de-duplicate what is left.
rpartmirs <- gsub("<.*", "", rpartmirs)
rpartmirs <- unique(gsub(">.*", "", rpartmirs))
# Predict the group of the setB2 samples with the caret rpart fit.
predrpart2 <- predict(fit.caret.rpart, setB2)
# Per-sample table of prediction vs. observed group. The prediction is
# converted with as.character() like the observed group: cbind() on a bare
# factor would store its integer level codes instead of the labels.
fit.preds.table.Rpart <- cbind(rownames(setB2), as.character(predrpart2), as.character(setB2$group))
colnames(fit.preds.table.Rpart) <- c("sample", "fit.preds", "group")
# Confusion matrix of predicted vs. observed group, with "PH" as the positive class.
RpartInterim <- confusionMatrix(predrpart2, setB2$group, positive = "PH")
LASSO (Least Absolute Shrinkage and Selection Operator) is a feature selection method designed to reduce over-fitting. It automatically selects the significant variables by shrinking the coefficients of predictors deemed unimportant to zero.
glmnet (see its vignette) is a package that uses penalised maximum likelihood to fit generalised linear models.
# Two 10-fold cross-validated binomial glmnet fits with alpha = 1 on every
# column of ml.Spear except the first, with ml.Spear$group as the response.
# The first fit uses the default type.measure; the second uses "class".
fit.glmcv <- cv.glmnet(
  x = as.matrix(ml.Spear[-1]),
  y = as.factor(ml.Spear$group),
  family = "binomial",
  alpha = 1,
  nfolds = 10
)
other.glmcv <- cv.glmnet(
  x = as.matrix(ml.Spear[-1]),
  y = as.factor(ml.Spear$group),
  family = "binomial",
  alpha = 1,
  nfolds = 10,
  type.measure = "class"
)

# lambda.min as reported by the first cross-validated fit.
model.lambda <- fit.glmcv[["lambda.min"]]

# Cross-validation curves for both fits, with unscaled text sizes.
plot(fit.glmcv, cex.main = 1, cex.lab = 1, cex.axis = 1)
plot(other.glmcv, cex.main = 1, cex.lab = 1, cex.axis = 1)
# Fit a binomial glmnet model through caret, using the resampling settings in
# `control` and the tuning grid in `tuneLASSO` (both defined elsewhere).
LASSO.min <- caret::train(group ~ ., data = ml.Spear, method = "glmnet", trControl = control, family = "binomial", tuneGrid = tuneLASSO)
# Coefficients of the final model at the selected lambda, keeping only the
# non-zero ones. The matrix is converted to a data frame before
# rownames_to_column(), which operates on data frames; the original had an
# empty pipe step at this position.
# NOTE(review): the coefficient column is assumed to be named "1"; some glmnet
# versions may name it differently (e.g. "s1") -- confirm.
lasso.model <- coef(LASSO.min$finalModel, LASSO.min$bestTune$lambda) %>%
  as.matrix() %>%
  as.data.frame() %>%
  rownames_to_column() %>%
  filter(abs(`1`) > 0)
# Training data restricted to the selected columns. drop = FALSE keeps a data
# frame (and the column name) when only one column is selected.
ml.Spear.LASSO <- cbind("group" = ml.Spear$group, ml.Spear[, colnames(ml.Spear) %in% lasso.model$rowname, drop = FALSE])
# setB restricted to the columns that also occur in ml.Spear.
val.LASSO <- setB[, colnames(setB) %in% colnames(ml.Spear)]
# Predictions for val.LASSO without its first column.
LASSO.pred.min <- predict(LASSO.min, newdata = val.LASSO[-1])
# NOTE(review): fit.preds.table.LASSO is not created in this part of the
# file -- confirm it is built before this line.
colnames(fit.preds.table.LASSO) <- c("sample", "LASSO", "group")
# Confusion matrix of predicted vs. observed group, with "PH" as the positive class.
LASSOInterim <- confusionMatrix(LASSO.pred.min, val.LASSO$group, positive = "PH")
XGBoost (Extreme Gradient Boosting) is an optimised, distributed gradient boosting library that performs better than the gradient boosting (GBM) framework alone.
# Show the existing plot `gg` without a legend and with larger text.
gg +
  ggplot2::theme(
    legend.position = "none",
    text = element_text(size = 22),
    axis.text = element_text(size = 18)
  )

# Predictions of xgb_tune5 on `validation`: first the default output, then
# the "prob" output.
xgbpredictfinal <- predict(xgb_tune5, validation)
xgbpredictfinal_probs <- predict(
  xgb_tune5,
  validation,
  type = "prob"
)
# Confusion matrix against the observed setB group, with "PH" as the positive class.
confusionMatrix(
  xgbpredictfinal,
  as.factor(setB$group),
  positive = "PH"
)

# Reduced data sets: the group column plus the features listed in
# xgb_mirs$Feature, taken from ml.Spear and from setB.
setAshort <- ml.Spear[, c("group", xgb_mirs$Feature)]
setBshort <- setB[, c("group", xgb_mirs$Feature)]

# Numeric matrices of the reduced sets without their first (group) column.
train.short <- data.matrix(setAshort[-1])
validation.short <- data.matrix(setBshort[-1])

# Group of the reduced ml.Spear set, and the setB group copied onto setBshort.
labels <- setAshort$group
setBshort$group <- setB$group
ROC curves for all microRNAs selected by either LASSO, Rpart, Boruta or XGBoost.
# Render the `cutoffs` table with a caption, styled to the full page width.
cutoffs %>%
  kable(caption = "Cutpoints for miRNAs selected by the above methods, calculated on the training set") %>%
  kable_styling(full_width = TRUE)
Summary statistics for PH group and healthy volunteers
# ROC inputs for each model: class probabilities, a ROCR prediction object
# built from the "PH" probability, and tpr/fpr and sens/spec performance
# objects. prediction() is namespace-qualified like performance() so both
# always resolve to ROCR.

# Boruta-selected model on tree.Boruta (first column dropped for prediction).
Boruta.perf <- predict(fit.Boruta, tree.Boruta[-1], type = "prob")
Boruta.pred <- ROCR::prediction(Boruta.perf$PH, tree.Boruta$group)
Boruta.perfs <- ROCR::performance(Boruta.pred, "tpr", "fpr")
Boruta.sens <- ROCR::performance(Boruta.pred, "sens", "spec")
# Select the PH probability by name, as every other model in this section
# does, instead of by column position.
Boruta.auc <- pROC::auc(tree.Boruta$group, Boruta.perf$PH)

# rpart model on setB2.
rpart.perf <- predict(fit.caret.rpart, setB2[-1], type = "prob")
rpart.pred <- ROCR::prediction(rpart.perf$PH, setB2$group)
rpart.perfs <- ROCR::performance(rpart.pred, "tpr", "fpr")
rpart.sens <- ROCR::performance(rpart.pred, "sens", "spec")

# LASSO model on val.LASSO.
LASSO.probs.min <- predict(LASSO.min, newdata = val.LASSO[-1], type = "prob")
# NOTE: this reuses the name LASSO.pred.min, replacing the class predictions
# stored under it earlier in the file with a ROCR prediction object.
LASSO.pred.min <- ROCR::prediction(LASSO.probs.min$PH, val.LASSO$group)
LASSO.perfs.min <- ROCR::performance(LASSO.pred.min, "tpr", "fpr")
LASSO.sens.min <- ROCR::performance(LASSO.pred.min, "sens", "spec")

# Reduced XGBoost model on validation.short.
xgbpreds <- predict(xgb_tune_final_short, validation.short, type = "prob")
xgb.pred <- ROCR::prediction(xgbpreds$PH, as.factor(setBshort$group))
xgb.perfs <- ROCR::performance(xgb.pred, "tpr", "fpr")
xgb.sens <- ROCR::performance(xgb.pred, "sens", "spec")

# Logistic regression of group on NTproBNP alone, fitted on withBNPa.
proBNP <- glm(group ~ NTproBNP, data = withBNPa, family = "binomial")
