XGBoost multiclass evaluation metrics

I'm training an XGBoost multiclass model, but I'm having doubts about my evaluation metrics.

Here's my code plus output:

import matplotlib.pylab as plt
import numpy as np
import pandas as pd
from sklearn import metrics
from matplotlib import pyplot
from sklearn.model_selection import GridSearchCV
import xgboost as xgb
from statistics import mean
%matplotlib inline
from sklearn.preprocessing import label_binarize
from itertools import cycle
from sklearn.metrics import roc_curve, auc
def plot_roc_curve(y_test, y_pred):
    
    n_classes = len(np.unique(y_test))
    y_test = label_binarize(y_test, classes=np.arange(n_classes))
    #y_pred = label_binarize(y_pred, classes=np.arange(n_classes))

  # Compute ROC curve and ROC area for each class
    fpr = dict()
    tpr = dict()
    roc_auc = dict()
    for i in range(n_classes):
        fpr[i], tpr[i], _ = metrics.roc_curve(y_test[:, i], y_pred[:, i])
        roc_auc[i] = auc(fpr[i], tpr[i])
  
  # Compute micro-average ROC curve and ROC area
    fpr["micro"], tpr["micro"], _ = roc_curve(y_test.ravel(), y_pred.ravel())
    roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])

  # First aggregate all false positive rates
    all_fpr = np.unique(np.concatenate([fpr[i] for i in range(n_classes)]))

    # Then interpolate all ROC curves at these points
    mean_tpr = np.zeros_like(all_fpr)
    for i in range(n_classes):
        mean_tpr += np.interp(all_fpr, fpr[i], tpr[i])

  # Finally average it and compute AUC
    mean_tpr /= n_classes

    fpr["macro"] = all_fpr
    tpr["macro"] = mean_tpr
    roc_auc["macro"] = auc(fpr["macro"], tpr["macro"])

  # Plot all ROC curves
    plt.figure(figsize=(10,8))
#    plt.figure(dpi=300)
    lw = 2
    plt.plot(fpr["micro"], tpr["micro"],
             label="micro-average ROC curve (area = {0:0.2f})".format(roc_auc["micro"]),
             color="pink", linestyle="-.", linewidth=4,)

    plt.plot(fpr["macro"], tpr["macro"],
             label="macro-average ROC curve (area = {0:0.2f})".format(roc_auc["macro"]),
             color="purple", linestyle="-.", linewidth=4,)

    colors = cycle(['gray', 'green', 'blue', 'yellow', 'red', 'black', 'brown', 'goldenrod', 'gold',
                    'aqua', 'violet', 'darkslategray', 'mistyrose', 'darkorange', 'tan'])
    for i, color in zip(range(n_classes), colors):
        plt.plot(fpr[i], tpr[i], color=color, lw=lw, linestyle="--",
                 label="ROC curve of class {0} (area = {1:0.2f})".format(i, roc_auc[i]),)

    plt.plot([0, 1], [0, 1], "k--", lw=lw)
    plt.xlim([-0.05, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate")
    plt.title("Receiver Operating Characteristic (ROC) curve")
    plt.legend()



def evaluate_model(alg, train, target, predictors,test,target2, early_stopping_rounds=10,n_jobs=-1,useTrainCV=False, cv_folds=5):
    plt.rcParams['figure.figsize'] = [100, 50]
    plt.tick_params(axis='both', which='major', labelsize=50)
    plt.tick_params(axis='both', which='minor', labelsize=50)
    
    if useTrainCV:
        xgb_param = alg.get_xgb_params()
        xgtrain = xgb.DMatrix(train[predictors].values, target['CLASS_TARGET'].values)
        cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=alg.get_params()['n_estimators'], nfold=cv_folds,
            metrics='auc', early_stopping_rounds=early_stopping_rounds, verbose_eval=True)
        print("cvresult---", cvresult.shape[0])
        print (cvresult)
        alg.set_params(n_estimators=cvresult.shape[0])        

    # Fit the algorithm to the data
    evaluation = [(train, target), (test, target2)]
    alg.fit(train[predictors], target['CLASS_TARGET'], eval_metric=["merror", "mlogloss", "auc"], eval_set=evaluation)
        
    # Predict on the training set:
    dtrain_predictions = alg.predict(train[predictors])
    dtrain_predprob = alg.predict_proba(train[predictors])#[:,1]
    feat_imp = pd.Series(alg.get_booster().get_fscore()).sort_values(ascending=False) 
    feat_imp.plot(kind='bar', title='Importance', color='b')
    plt.ylabel('Importance score')

    # Print model report:

    print("\nModel report")
    print(feat_imp)
    print("No. of vars : %.4g" % feat_imp.count())
    print("Accuracy : %.4g" % metrics.accuracy_score(target['CLASS_TARGET'].values, dtrain_predictions))
    print("AUC Score (Balanced): %f" % metrics.roc_auc_score(target['CLASS_TARGET'], dtrain_predprob, multi_class='ovr', average='weighted'))
    
    # evaluate predictions
    pred = dtrain_predprob
    accuracy = metrics.accuracy_score(target['CLASS_TARGET'].values, dtrain_predictions)
    accuracy_AUC = metrics.roc_auc_score(target['CLASS_TARGET'], dtrain_predprob, multi_class='ovr', average='weighted')
    predictions = [np.round(value) for value in pred]
    #fpr, tpr, thresholds = metrics.roc_curve(y, scores, pos_label=2)
    
    #print("Accuracy: %.2f%%" % (accuracy * 100.0))
    #  performance metrics
    results = alg.evals_result()
    epochs = len(results['validation_0']['merror'])
    x_axis = range(0, epochs)
    plt.style.use('ggplot')
    # plot log loss
    fig, ax = pyplot.subplots(figsize=(12,12))
    ax.plot(x_axis, results['validation_0']['mlogloss'], label='Train')
    ax.plot(x_axis, results['validation_1']['mlogloss'], label='Test')
    #ax.plot(x_axis, results['validation_2']['logloss'], label='Val')
    ax.legend()
    pyplot.ylabel('Log Loss')
    pyplot.title('XGBoost Log Loss')
    pyplot.show()
    # plot classification error
    fig, ax = pyplot.subplots(figsize=(12,12))
    ax.plot(x_axis, results['validation_0']['merror'], label='Train')
    ax.plot(x_axis, results['validation_1']['merror'], label='Test')
    #ax.plot(x_axis, results['validation_2']['error'], label='Val')
    ax.legend()
    
    pyplot.ylabel('Classification Error')
    pyplot.title('XGBoost Classification Error')
    pyplot.show()

xgb0 = xgb.XGBClassifier(max_depth=3, 
                         learning_rate=0.1, 
                         n_estimators=40, 
                         objective='multi:softprob',
                         gamma=1, 
                         min_child_weight=1, 
                         max_delta_step=0, 
                         subsample=1, 
                         colsample_bytree=1, 
                         colsample_bylevel=1, 
                         colsample_bynode=1 , 
                         reg_alpha=1, 
                         reg_lambda=1, 
                         scale_pos_weight=1, 
                         base_score=0.5, 
                         random_state=0, 
                         seed=None,
                         silent=None,
                         missing=999999,
                         verbosity=1,
                         use_label_encoder =False,
                         n_jobs=-1)
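For completeness, the helper above is then called on my data. A rough sketch of the call (X_train and y_train here are just placeholder names for my training split, with y_train holding the CLASS_TARGET column; they are not shown in this post):

evaluate_model(xgb0, X_train, y_train, features, X_val, y_val)
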

Output:

Now let's evaluate on the validation set:

df = pd.DataFrame(data=y_val)
yv=df.iloc[:,0]

df_va2 = xgb0.predict(X_val[features])
df_va= xgb0.predict_proba(X_val[features])#[:,1];
#df_val['Y_FAIL'] = np.where(((df_va = .53)), 0, 1)
#Print model report:

print("Accuracy : %.4g" % metrics.accuracy_score(yv, df_va2))
print("One VS Rest")
print("AUC Score (Val) Macro: %f" % metrics.roc_auc_score(yv, df_va, multi_class='ovr', average='macro'))
print("AUC Score (Val) Weighted: %f" % metrics.roc_auc_score(yv, df_va, multi_class='ovr', average='weighted'))
print("One VS One")
print("AUC Score (Val) Macro: %f" % metrics.roc_auc_score(yv, df_va, multi_class='ovo', average='macro'))
print("AUC Score (Val) Weighted: %f" % metrics.roc_auc_score(yv, df_va, multi_class='ovo', average='weighted'))
plot_roc_curve(yv, df_va)

Output:

Accuracy : 0.8749

One VS Rest
AUC Score (Val) Macro: 0.990113
AUC Score (Val) Weighted: 0.964739

One VS One
AUC Score (Val) Macro: 0.994858
AUC Score (Val) Weighted: 0.983933

This looks great. The thing is, when I try to calculate the AUC for the individual classes, I get this.

Code:

df = pd.DataFrame(data=y_val)
yv=df.iloc[:,0]

#df_va2 = xgb0.predict(X_val[features])
#df_va= xgb0.predict_proba(X_val[features])

d = yv.unique()
class_name = list(d.flatten())

for p in class_name:
    fpr, tpr, thresholds = metrics.roc_curve(yv, xgb0.predict_proba(X_val[features])[:,1], pos_label = p) 
    auroc = round(metrics.auc(fpr, tpr),2)
    print('Xgb',p,'--AUC---',auroc)

Output:

Xgb 10 --AUC--- 0.36
Xgb 8 --AUC--- 0.15
Xgb 4 --AUC--- 0.45
Xgb 1 --AUC--- 0.97
Xgb 12 --AUC--- 0.34
Xgb 7 --AUC--- 0.08
Xgb 2 --AUC--- 0.58
Xgb 5 --AUC--- 0.44
Xgb 14 --AUC--- 0.37
Xgb 11 --AUC--- 0.0
Xgb 6 --AUC--- 0.41
Xgb 0 --AUC--- 0.0
Xgb 3 --AUC--- 0.5
Xgb 9 --AUC--- 0.12
Xgb 13 --AUC--- 0.28

So which metric is right? Am I doing something wrong? Why is it so different?
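
For reference, here is a variant of that loop that uses the probability column corresponding to each class rather than always column 1 (just a sketch; I'm assuming the columns returned by predict_proba follow the order of xgb0.classes_):

# Compute the predicted probabilities once for the validation set
proba = xgb0.predict_proba(X_val[features])

for idx, p in enumerate(xgb0.classes_):
    # Use the probability column belonging to class p when treating p as the positive label
    fpr, tpr, thresholds = metrics.roc_curve(yv, proba[:, idx], pos_label=p)
    print('Xgb', p, '--AUC---', round(metrics.auc(fpr, tpr), 2))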

Thanks for your help, and sorry for my bad English.

Topic auc xgboost multiclass-classification python machine-learning

Category Data Science
