Plotting decision boundary from Random Forest model for multiclass MNIST dataset
I am using the MNIST dataset with 10 classes (the digits 0 to 9), in a compressed version with 49 predictor variables (x1, x2, ..., x49). I have trained a Random Forest model and created a Test data set, which is a grid, on which I have used the trained model to generate predictions, both as class probabilities and as predicted classes. I am trying to generalise the code that generates a decision boundary when there are only two outcome classes, from here: https://stackoverflow.com/questions/31234621/variation-on-how-to-plot-decision-boundary-of-a-k-nearest-neighbor-classifier-f
and here: https://stackoverflow.com/questions/55936315/decision-boundary-plots-in-ggplot2
I have tried to visualise the boundary using the first 2 predictors (x1 and x2), though predictions have been made with all 49.
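For reference, my understanding of how a 2-D slice of a high-dimensional classifier is usually built (the variable names below are my own, not from the linked answers): only x1 and x2 vary over a grid, and the other 47 predictors are pinned at fixed values such as their training medians.
## Hypothetical sketch of a 2-D slice: vary x1 and x2, pin the rest
preds <- traindat[, 2:50]  ## the 49 predictor columns
grid <- expand.grid(
  x1 = seq(min(preds$x1), max(preds$x1), length.out = 50),
  x2 = seq(min(preds$x2), max(preds$x2), length.out = 50))
## Hold the remaining 47 predictors at their training medians
fixed <- lapply(preds[, -(1:2)], median, na.rm = TRUE)
slice <- data.frame(grid, fixed)  ## 2500 rows, all 49 predictors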
Here is my code:
## Create a grid of data which is the Test data...
## traindat is the dataset that the model was trained on
library(dplyr)  ## for %>% and relocate()
data <- traindat
resolution <- 50  ## there will be 50 rows
## Extract the 49 predictor variables and drop the outcome variable
data <- data[, 2:50]
head(data)
## Get the variable names in a list
ln <- vector(mode = "list", length = 49)
ln <- as.list(names(data))
## One row per grid step, one column per predictor
data_mat <- matrix(0, 50, 49)
r <- sapply(data, range, na.rm = TRUE)  ## 2 x 49: min and max of each predictor
for (i in 1:49) {
  data_mat[, i] <- seq(r[1, i], r[2, i], length.out = resolution)
}
data_mat
mat <- as.matrix(data_mat)
m <- as.data.frame(mat)
## Create test data grid
fn <- function(x) seq(min(x) + 1, max(x) + 1, length.out = 50)
test2 <- apply(m, 2, fn)
test2 <- as.data.frame(test2)
colnames(test2) <- unlist(ln)
test2 <- as.data.frame(test2)
## label is a column that should contain the Predicted class labels
test2$label <- "-1"
test2 <- test2 %>%
  relocate(label, .before = x1)
## finalModel is the model obtained from training the Random Forest on traindat
prob <- predict(rf_gridsearch$finalModel, test2, type = "prob")
prob2 <- predict(rf_gridsearch$finalModel, test2, type = "response")
prob2 <- as.data.frame(prob)  ## overwrites the class predictions with the probability matrix
head(prob2)
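My expectation for these two objects (assuming a randomForest fit whose outcome factor has levels 0 to 9): type = "prob" gives one probability column per digit, type = "response" gives the predicted digit itself. A quick check of those assumptions:
## Sanity checks (expectations, not verified output)
stopifnot(ncol(prob2) == 10)           ## one probability column per digit
stopifnot(nrow(prob2) == nrow(test2))  ## one row per grid point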
## Create predicted classes 0 to 9 and probabilities for the Test data
fn <- function(x) which.max(x) - 1  ## column position 1..10 -> digit 0..9
outCls <- apply(prob2, 1, fn)
outCls
fn <- function(x) max(x)  ## winning class probability per row
outProb <- apply(prob2, 1, fn)
outProb
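As an aside, I believe the two apply() loops can be vectorised, assuming the columns of prob2 are ordered "0" through "9":
## Vectorised equivalent of the two apply() calls above
outCls <- max.col(as.matrix(prob2), ties.method = "first") - 1  ## digit with the highest probability
outProb <- do.call(pmax, prob2)                                 ## that highest probability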
## Data structure for plotting
require(dplyr)
dataf2 <- bind_rows(mutate(test2,
prob=outProb,
cls=0,
prob_cls=ifelse(outCls==cls,
1, 0)),
mutate(test2,
prob=outProb,
cls=1,
prob_cls=ifelse(outCls==cls,
1, 0)),
mutate(test2,
prob=outProb,
cls=2,
prob_cls=ifelse(outCls==cls,
1, 0)),
mutate(test2,
prob=outProb,
cls=3,
prob_cls=ifelse(outCls==cls,
1, 0)),
mutate(test2,
prob=outProb,
cls=4,
prob_cls=ifelse(outCls==cls,
1, 0)),
mutate(test2,
prob=outProb,
cls=5,
prob_cls=ifelse(outCls==cls,
1, 0)),
mutate(test2,
prob=outProb,
cls=6,
prob_cls=ifelse(outCls==cls,
1, 0)),
mutate(test2,
prob=outProb,
cls=7,
prob_cls=ifelse(outCls==cls,
1, 0)),
mutate(test2,
prob=outProb,
cls=8,
prob_cls=ifelse(outCls==cls,
1, 0)),
mutate(test2,
prob=outProb,
cls=9,
prob_cls=ifelse(outCls==cls,
1, 0))
)
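Since the ten mutate() calls differ only in the class label, the same data frame can also be built in a loop; a compact equivalent:
## Equivalent, more compact construction over the ten classes
dataf2 <- bind_rows(lapply(0:9, function(k)
  mutate(test2, prob = outProb, cls = k,
         prob_cls = ifelse(outCls == k, 1, 0))))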
## Solution from Stack Exchange based on only two outcome classes
library(ggplot2)
library(ggthemes)  ## for theme_few()
ggplot() +
  geom_raster(data = dataf2, aes(x = x1, y = x2, fill = cls), interpolate = TRUE) +
  geom_contour(data = dataf2, aes(x = x1, y = x2, z = prob),
               breaks = c(1.5), colour = "black", size = 1) +
  theme_few() +
  scale_colour_manual(values = cols) +  ## cols: a predefined colour vector
  labs(colour = "", fill = "") +
  scale_fill_gradient2(low = "#338cea", mid = "white", high = "#dd7e7e",
                       midpoint = 0.5, limits = range(dataf2$prob)) +
  theme(legend.position = "none")
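For comparison, this is my reading of the two-class recipe I am trying to generalise (a sketch; df is a made-up data frame with grid coordinates x1, x2 and prob = P(class A)): with two classes the decision boundary is simply the curve where the probability of one class crosses 0.5.
## Two-class sketch: the 0.5 contour of P(class A) traces the boundary
ggplot(df, aes(x = x1, y = x2)) +
  geom_raster(aes(fill = prob), interpolate = TRUE) +
  geom_contour(aes(z = prob), breaks = 0.5, colour = "black")
With ten classes there is no single probability surface with a 0.5 level, which is why I am unsure what the contour should be drawn on.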
My output doesn't look right. What does it mean? Also, why does the contour plot have to be based on the predicted probability? What is the idea behind the code to generate a decision boundary for any classifier? What am I doing wrong?
Topic: plotting, random-forest, predictive-modeling
Category: Data Science