Drastic increase in accuracy while using pickle file with sklearn
I trained an XGBoost classifier and it gave an accuracy of 49.99%, and I saved that model to a pickle file. When I ran the same data through the model loaded from the pickle file (.pkl), it gave an accuracy of 88.99%. I don't know why this is happening. Please help me understand what is going on.
# Training script: load the bank dataset, impute and label-encode it, inspect
# correlations with 'Outcome', fit an XGBoost classifier on a fixed 80/20
# split, report accuracy on the held-out 20%, and pickle the fitted model.

# NOTE(review): the path literal had lost its quotes; restored here as a raw
# string -- confirm the actual file name/path.
bank_dataset = pd.read_csv(r"dataset.csv")

# Per-column count of missing values (kept for inspection).
missing_val = pd.DataFrame(bank_dataset.isnull().sum())

# Replace missing ' Balance' entries with the column mean.
bank_dataset[' Balance'] = bank_dataset[' Balance'].fillna(bank_dataset[' Balance'].mean())

from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
# Every object-typed column is converted to integer codes; the encoder is
# re-fitted for each column, so the per-column mappings are not retained.
objList = bank_dataset.select_dtypes(include = object).columns
for feat in objList:
    bank_dataset[feat] = le.fit_transform(bank_dataset[feat].astype(str))

# Correlation of every column with the target, strongest first.
correlation = bank_dataset.corr()
print(correlation['Outcome'].sort_values(ascending = False), '\n')

# Heatmap of the k columns most correlated with 'Outcome'.
k = 10
cols = correlation.nlargest(k, 'Outcome')['Outcome'].index
cm = np.corrcoef(bank_dataset[cols].values.T)
f, ax = plt.subplots(figsize=(14,14))
sns.heatmap(cm, vmax = .8, linewidths = 0.01, square = True, annot = True,
            cmap = 'coolwarm', linecolor = 'white',
            annot_kws = {'size':12}, xticklabels = cols.values, yticklabels = cols.values)

# Features are selected by column position; the target is the 4th column
# from the end.
X = bank_dataset.iloc[:, [7,8,12,24,11,16,4,18,20]].values
y = bank_dataset.iloc[:, -4].values

from sklearn.model_selection import train_test_split
# Fixed random_state so the same split can be reproduced later.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 1)

from xgboost import XGBClassifier
xg = XGBClassifier()
xg.fit(X_train, y_train)
y_pred = xg.predict(X_test)

from sklearn.metrics import confusion_matrix
from sklearn import metrics
# Accuracy here is measured ONLY on the 20% of rows the model never saw.
print(metrics.accuracy_score(y_test, y_pred))
cm = confusion_matrix(y_test, y_pred)

import pickle
filename = 'xg.pkl'
# Context manager guarantees the file is flushed and closed after dumping.
with open(filename, 'wb') as model_file:
    pickle.dump(xg, model_file)
The code above trains the model and saves it to a .pkl file.
# Inference script: rebuild the feature matrix the same way as in training,
# load the pickled XGBoost model, and predict.
import pickle

# NOTE(review): the path literal had lost its quotes; restored here as a raw
# string -- confirm the actual file name/path.
bank_dataset = pd.read_csv(r"dataset.csv")

# Apply the same ' Balance' mean-imputation used when the model was trained,
# so the features fed to the model match the training features.
bank_dataset[' Balance'] = bank_dataset[' Balance'].fillna(bank_dataset[' Balance'].mean())

from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
# NOTE(review): the encoder is re-fitted here rather than reused from
# training; the codes only match if this file has the same categories as the
# training file -- confirm when scoring different data.
objList = bank_dataset.select_dtypes(include = object).columns
for feat in objList:
    bank_dataset[feat] = le.fit_transform(bank_dataset[feat].astype(str))

X = bank_dataset.iloc[:, [7,8,12,24,11,16,4,18,20]].values
y = bank_dataset.iloc[:, -4].values

# Only unpickle files you created yourself: pickle can execute arbitrary code.
with open('xg.pkl', 'rb') as file:
    data = pickle.load(file)

# Predictions for every row. With the training file this includes the 80% of
# rows the model was fitted on, so accuracy computed from X_ is NOT comparable
# to the held-out accuracy printed by the training script.
X_ = data.predict(X)

# For a like-for-like comparison, evaluate on the same held-out split that
# training used (same test_size and random_state).
from sklearn.model_selection import train_test_split
from sklearn import metrics
_, X_test, _, y_test = train_test_split(X, y, test_size = 0.2, random_state = 1)
print(metrics.accuracy_score(y_test, data.predict(X_test)))
The second snippet loads the .pkl file and runs the model on the same data that was used for training.
