Logistic Regression optimal threshold is a negative value
I run the code below:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import linear_model
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from numpy import sqrt
from numpy import argmax
from sklearn.metrics import roc_curve
from sklearn.preprocessing import StandardScaler
def standardize(variable):
return (variable - np.mean(variable)) / np.std(variable)
def normalize(x):
return (x-x.min()/(x.max()- x.min()))
data.columns = np.arange(len(data.columns))
trainX, testX, trainy, testy=train_test_split(X,y,test_size=0.5,random_state=2, stratify=y)
# fit a model
model = LogisticRegression(solver='lbfgs')
model.fit(trainX, trainy)
#yhat = model.predict_proba(testX)
yhat = normalize(testX.values)
yhat = yhat[:, 0]
print(yhat)
# calculate roc curves
fpr, tpr, thresholds = roc_curve(testy, yhat)
#print(thresholds)
# calculate the g-mean for each threshold
gmeans = sqrt(tpr * (1-fpr))
# locate the index of the largest g-mean
ix = argmax(gmeans)
print('Best Threshold=%f, G-mean=%.3f' % (thresholds[ix], gmeans[ix]))
# plot the roc curve for the model
pyplot.plot([0,1], [0,1], linestyle='--', label='No Skill')
pyplot.plot(fpr, tpr, marker='.', label='Logistic')
pyplot.scatter(fpr[ix], tpr[ix], marker='o', color='black', label='Best')
# axis labels
pyplot.xlabel('False Positive Rate')
pyplot.ylabel('True Positive Rate')
pyplot.legend()
# show the plot
pyplot.show()
The optimal threshold score is: Best Threshold= -0.049752, G-mean=0.889
Why is the optimal threshold a negative number? What does it mean? And why am I getting a negative number?