Training Ensemble Models with Full CPU Cores

The randomForest package does not support training on multiple cores out of the box. What should I do?

R supports parallel computation through the core parallel package, and the doParallel package provides a foreach backend on top of it. The caret package is used for developing and testing machine learning models in R, and it (along with other packages such as plyr) supports multi-core CPU speedups as long as a parallel backend is registered before the supported functions are called.
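Before registering anything, you can check how many cores your machine exposes and confirm that a backend is actually registered. A quick sketch (the core counts will of course depend on your hardware):

# check available cores and the registered backend
library(parallel)
library(doParallel) # also attaches foreach

detectCores(logical = FALSE) # physical cores
detectCores(logical = TRUE)  # logical cores (including hyper-threads)

cluster <- makeCluster(detectCores(logical = TRUE) - 1)
registerDoParallel(cluster)
getDoParWorkers() # number of workers the registered backend exposes
stopCluster(cluster)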

Before showing you how to train with multiple cores, here is the ensemble-model code I used:

setwd("~/Boruta-RFE-R-main/Final model")

# Load data 
v3data.train <- read.csv("full_train_setcsv.csv")
v3data.test <-  read.csv("full_test_setcsv.csv")

# Load caret package
library(caret)
names(getModelInfo()) # list the model names available in caret

# Model training 
set.seed(1234)
fitControl <- trainControl(
  method = "repeatedcv",
  number = 10,
  repeats = 2, allowParallel = TRUE)


## Parallel Training
library(doParallel)
cluster <- makeCluster(detectCores(logical = TRUE) - 1) # leave one logical core free for the OS
registerDoParallel(cluster)
clusterEvalQ(cluster, {
  library(caret)
})


# Linear Regression 
set.seed(1234)
model_LR<-train(new_cases ~ . ,data=v3data.train, method='lm', trControl=fitControl,tuneLength=10)
print(model_LR)
plot(model_LR)
saveRDS(model_LR, "linear_reg_model.rds")



# CART Decision Tree
set.seed(1234)
model_CART<-train(new_cases ~ . ,data=v3data.train, method='rpart2', trControl=fitControl,tuneLength=10)
print(model_CART)
plot(model_CART)
rfImp_CART <- varImp(model_CART) 
plot(rfImp_CART, top = 53)
saveRDS(model_CART, "CART_model.rds")



# M5 Model tree
set.seed(1234)
model_M5<-train(new_cases ~ . ,data=v3data.train, method='M5', trControl=fitControl,tuneLength=10)
print(model_M5)
plot(model_M5)
rfImp_M5 <- varImp(model_M5) 
plot(rfImp_M5, top = 53)
saveRDS(model_M5, "M5_model.rds")



# Neural Network
set.seed(1234)
model_NN<-train(new_cases ~ .,data=v3data.train, method='neuralnet', trControl=fitControl,tuneLength=10)
print(model_NN)
plot(model_NN)
rfImp_NN <- varImp(model_NN) 
plot(rfImp_NN, top = 53)

# Random Forest
set.seed(1234)
model_rf<-train(new_cases ~ . ,data=v3data.train, method='rf', trControl=fitControl,tuneLength=10, importance = T)
print(model_rf)
plot(model_rf)
rfImp_rf <- varImp(model_rf) 
plot(rfImp_rf, top = 53)


# Gradient Boosted Tree model
set.seed(1234)
model_gbm<-train(new_cases ~ . ,data=v3data.train, method='gbm', trControl=fitControl,tuneLength=10, importance = T)
print(model_gbm)
plot(model_gbm)
rfImp_gbm <- varImp(model_gbm) 
plot(rfImp_gbm, top = 53)
saveRDS(model_gbm, "GBT_model.rds")


### Stacking model
# collect resamples
results <- resamples(list(CART=model_CART, 
                          RF=model_rf, 
                          LR=model_LR))


# summarize differences between models
summary(results)

# box and whisker plots to compare models
scales <- list(x=list(relation="free"), y=list(relation="free"))
bwplot(results, scales=scales)


# dot plots of the resampled regression metrics (RMSE, MAE, R-squared)
scales <- list(x=list(relation="free"), y=list(relation="free"))
dotplot(results, scales=scales)
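Note that the resamples comparison above only lines the models up side by side; an actual stacked model combines their predictions with a meta-learner. A minimal sketch using the caretEnsemble package (not part of the original script), with a glm meta-model on top of three of the base learners:

library(caretEnsemble)

# stacking needs the resampling predictions saved in trainControl
stack_control <- trainControl(method = "repeatedcv", number = 10, repeats = 2,
                              savePredictions = "final", allowParallel = TRUE)

set.seed(1234)
model_list <- caretList(new_cases ~ ., data = v3data.train,
                        trControl = stack_control,
                        methodList = c("rpart2", "rf", "lm"))

# combine the base learners with a linear meta-model
stack_glm <- caretStack(model_list, method = "glm")
print(stack_glm)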


### Plotting Variable Importance for each model
plot(varImp(object=model_gbm),top = 11, main="GBM - Variable Importance")
plot(varImp(object=model_rf),top = 50,main="RF - Variable Importance")
plot(varImp(object=model_NN),top = 11, main="neuralnet - Variable Importance")
plot(varImp(object=model_M5),top = 11, main="M5 - Variable Importance")
plot(varImp(object=model_CART),top = 11,main="CART (Boruta) - Variable Importance")

names(v3data.test)
### Predictions on the test data (if model_rf has the best performance)

v3data.train$Predicted_new_cases <-predict.train(object=model_rf,v3data.train)

v3data.test$Predicted_new_cases <-predict.train(object=model_rf,v3data.test)


### Save Trained Model
saveRDS(model_rf, "full_var_model.rds")
model_rf <- readRDS("full_var_model.rds") # reload the model later without retraining

### Error measurement

library("Metrics")

# Training error

mse(v3data.train$Predicted_new_cases , v3data.train$new_cases)
rmse(v3data.train$Predicted_new_cases , v3data.train$new_cases)
postResample(pred= v3data.train$Predicted_new_cases ,obs =v3data.train$new_cases)
cor(v3data.train$Predicted_new_cases , v3data.train$new_cases)  

# Testing error
mse(v3data.test$Predicted_new_cases , v3data.test$new_cases)
rmse(v3data.test$Predicted_new_cases , v3data.test$new_cases)
postResample(pred= v3data.test$Predicted_new_cases ,obs =v3data.test$new_cases)
cor(v3data.test$Predicted_new_cases , v3data.test$new_cases)  


# Export data

write.csv(v3data.train, ".....csv", row.names = FALSE)

write.csv(v3data.test, "......csv", row.names = FALSE)




# H-statistic
library(dplyr)         # basic data transformation
library(iml)           # ML interpretation
library(ggplot2)       # iml plots are ggplot objects
library(randomForest)
# 1. create a data frame with just the features
features <- as.data.frame(v3data.train) %>% select(-new_cases)

# 2. Create a vector with the actual responses
response <- as.numeric(as.vector(v3data.train$new_cases))


## Parallel Training
library(doParallel)
cluster <- makeCluster(detectCores(logical = TRUE) - 1) # leave one logical core free for the OS
registerDoParallel(cluster)
clusterEvalQ(cluster, {
  library(iml)
})

# 3. Create a custom predict function that returns the predicted values as a
#    numeric vector (model_rf is a caret model, so predict() works on it
#    directly; no h2o session is needed here)
pred <- function(model, newdata) {
  as.numeric(predict(model, newdata = newdata))
}

# example of prediction output
pred(model_rf, features) %>% head()

predictor.rf <- Predictor$new(
  model = model_rf, 
  data = features, 
  y = response, 
  predict.fun = pred
)

interact.rf  <- Interaction$new(predictor.rf) %>% plot() + ggtitle("RF")

interact.rf

## Parallel
# iml parallelizes through the future framework, so register a future backend
# (the foreach/doParallel backend used by caret does not apply here)
library("future")
library("future.callr")
plan("callr", workers = 6)

## Two way Interaction
names(v3data.test)
two_way_interact.rf <- Interaction$new(predictor.rf, feature = "c7_2_action") %>% plot()

two_way_interact.rf


### Partial Dependence Plots
## Parallel Training
library(doParallel)
cluster <- makeCluster(detectCores(logical = TRUE) - 1) # leave one logical core free for the OS
registerDoParallel(cluster)
clusterEvalQ(cluster, {
  library(pdp)
})


library(pdp)
## Compute pdp for a single variable (parallel = TRUE uses the registered backend)
pd_1st_order <- partial(model_rf, pred.var = c("c7_2_action"), rug = TRUE, plot = TRUE, parallel = TRUE)

pd_1st_order

## Compute contour pdp for 2 variables 
pd <- partial(model_rf, pred.var = c("c7_1_action", "c7_2_action"), chull = FALSE, parallel = TRUE)
# Add contour lines and use a different color palette
rwb <- colorRampPalette(c("blue", "white", "red" ))
pdp_2nd_order <- plotPartial(pd, contour = TRUE, col.regions = rwb)

pdp_2nd_order


The train function of the caret package has built-in support for parallel backends, but you have to set one up and register it yourself. If you don't register a backend, train falls back to single-core computation. Once a parallel backend is registered, any caret model training will use multiple CPU cores, since trainControl already defaults to allowParallel = TRUE.

You can also parallelize instructions that don't support it out of the box, but that requires extra code: for example, you can run loops in parallel with the foreach instruction after registering a backend (see the short foreach sketch at the end of this post). In our case with caret, we don't have to. Whether you are on Windows or a Unix-based system (macOS, Linux), you can install the doParallel package (some similar packages, such as doMC, are OS-dependent). Add the following lines to your program and leave everything else the same:

# load doParallel package
library(doParallel)
#automatically detect all cores and register
cluster <- makeCluster(detectCores(logical = TRUE) - 1) # leave one logical core free for the OS
registerDoParallel(cluster)
clusterEvalQ(cluster, {
  library(caret)
})
## you can load other packages on the workers the same way

# machine learning code goes in here
# for example
set.seed(1234)
model_rf<-train(new_cases ~ . ,data=v3data.train, method='rf', trControl=fitControl,tuneLength=10, importance = T)
print(model_rf)
plot(model_rf)
rfImp_rf <- varImp(model_rf) 
plot(rfImp_rf, top = 53)

# stop the cluster and restore sequential execution
stopCluster(cluster)
registerDoSEQ()

Let's check the CPU status:

Yay! We did it!
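As mentioned earlier, the same kind of registered backend also lets you parallelize your own loops with foreach, even for packages without built-in support. A minimal sketch, not part of the original script; the mtry grid is just an illustrative placeholder:

library(doParallel) # also attaches foreach

cluster <- makeCluster(detectCores(logical = TRUE) - 1)
registerDoParallel(cluster)

# hypothetical example: fit one randomForest per mtry value, one model per worker
library(randomForest)
mtry_grid <- c(2, 4, 8)
fits <- foreach(m = mtry_grid, .packages = "randomForest") %dopar% {
  randomForest(new_cases ~ ., data = v3data.train, mtry = m, ntree = 500)
}

stopCluster(cluster)
registerDoSEQ()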
