Naive Bayes always predicting the same label
I have been trying to write a Naive Bayes classifier from scratch that is supposed to predict the class label of the nominal car.arff dataset. However, the classifier always predicts the most common label. I have tried log probabilities and Laplace correction, both to no avail. I have also noticed that the conditional probabilities for any attribute are always greatest for the most common label. Is this because of my dataset? What can be done about it?
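For clarity, this is roughly what I mean by Laplace correction and log probabilities (a minimal sketch, not my exact code; the function names and arguments are placeholders):

import math

def smoothed_likelihood(count, class_total, n_values):
    # add-one (Laplace) correction: no conditional probability can become exactly zero
    return (count + 1) / (class_total + n_values)

def log_score(prior, likelihoods):
    # sum logs instead of multiplying many small probabilities
    return math.log(prior) + sum(math.log(p) for p in likelihoods)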
Here is my code:
import numpy as np
import pandas as pd
from scipy.io import arff

def parser(path):
    """
    Function which parses the data from an arff file.
    @param path: string containing the path to the file
    @return: array containing the data
    @raise FileNotFoundError: in case the path does not point to a valid file
    """
    start = 0  # check if @data has really occurred yet
    # Declaratives as constants to avoid misspelling in code
    RELATION = 'relation'
    ATTRIBUTE = 'attribute'
    DATA = 'data'
    # Create dictionary holding the arff information
    data = {RELATION: [],
            ATTRIBUTE: [],
            DATA: []}
    # Read the file and analyse the data
    with open(path) as file:
        for line in file.readlines():
            # Check if line is empty
            if line.strip() == '':
                continue
            # Check if line contains the relation
            elif '@' + RELATION in line:
                data[RELATION].append(line.replace('@' + RELATION, '').strip())
            # Check if line contains an attribute
            elif line.startswith('@attribute'):
                tmp = line.replace('{', '').replace('}', '').replace('\n', '').replace("'", '')
                # check if whitespace occurs between the commas in the attribute values
                if len(tmp.split(' ')) > 3:
                    values = tmp.replace(',', '').split(' ')[2:]
                else:
                    values = tmp.split(' ')[2].split(',')
                data[ATTRIBUTE].append({'name': tmp.split(' ')[1], 'values': values})
            # check if @data exists
            elif '@' + DATA in line:
                start = 1
            # If the line is not one of the others, it has to be data
            elif '@' + DATA not in line and start:
                line = line.split(',')
                # strip each element of the line
                for i in range(len(line)):
                    line[i] = line[i].strip()
                # Add data to dictionary
                data[DATA].append(line)
    # Turn each row into a dict keyed by attribute name
    attributes = np.array(data[ATTRIBUTE])
    out = []
    for i in range(len(data[DATA])):
        data_dict = {}
        for j in range(len(attributes)):
            data_dict.update({attributes[j]['name']: data[DATA][i][j]})
        out.append(data_dict)
    out = np.array(out)
    return out, data[ATTRIBUTE]
class NaiveBayes():
    def __init__(self, data, atts, class_label):
        self.data = data
        self.atts = atts
        self.class_label = class_label

    def prior(self):
        prior_probabilities = [0, 0, 0, 0]
        for i in range(len(self.data)):
            if self.data[i]['class'] == 'unacc': prior_probabilities[0] += 1
            if self.data[i]['class'] == 'acc': prior_probabilities[1] += 1
            if self.data[i]['class'] == 'good': prior_probabilities[2] += 1
            if self.data[i]['class'] == 'vgood': prior_probabilities[3] += 1
        prior_probabilities = [x / len(self.data) for x in prior_probabilities]
        return prior_probabilities

    def conditionalProbability(self, key, value, length):
        # returns (in our case) a 4-vector for one attribute with probabilities for each outcome
        conditional_probabilities = [0] * length
        # definitely not the most efficient way
        for i in range(len(self.data)):
            if self.data[i][key] == value:
                if self.data[i]['class'] == 'unacc': conditional_probabilities[0] += 1
                if self.data[i]['class'] == 'acc': conditional_probabilities[1] += 1
                if self.data[i]['class'] == 'good': conditional_probabilities[2] += 1
                if self.data[i]['class'] == 'vgood': conditional_probabilities[3] += 1
        s = np.sum(conditional_probabilities)
        conditional_probabilities = [x / s for x in conditional_probabilities]
        return conditional_probabilities

    def classification(self, instance):
        cprobs = []
        probs = self.prior()
        for key in instance.keys():
            cprobs.append(self.conditionalProbability(key, instance[key], 4))
        print(cprobs)
        # get probabilities
        predicted_class = 'unacc'
        for i in range(len(cprobs) - 1):
            for j in range(4):
                probs[j] *= cprobs[i][j]
        # print(instance)
        print(probs)
        return probs.index(max(probs))
raw, atts = parser('car.arff')
class_attribute = 'class'
classifier = NaiveBayes(raw,atts,class_attribute)
print(classifier.data[1])
print(classifier.prior())
print(classifier.conditionalProbability('buying','vhigh',4))
print(classifier.classification(classifier.data[0]))
'''
results = [0,0,0,0]
for i in range(len(classifier.data)):
results[classifier.classification(classifier.data[i])]+=1
print(results)
'''
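For reference, this is how I have been inspecting the raw per-class counts that conditionalProbability is built on (a quick sketch using pandas on the raw array returned by parser above):

import pandas as pd

df = pd.DataFrame(list(raw))  # raw is the array of dicts returned by parser()
# rows are the values of 'buying', columns are class labels, cells are joint counts
print(pd.crosstab(df['buying'], df['class']))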
This is the class distribution and some more information:
% 5. Number of Instances: 1728
% (instances completely cover the attribute space)
%
% 6. Number of Attributes: 6
%
% 7. Attribute Values:
%
% buying v-high, high, med, low
% maint v-high, high, med, low
% doors 2, 3, 4, 5-more
% persons 2, 4, more
% lug_boot small, med, big
% safety low, med, high
%
% 8. Missing Attribute Values: none
%
% 9. Class Distribution (number of instances per class)
%
% class N N[%]
% -----------------------------
% unacc 1210 (70.023 %)
% acc 384 (22.222 %)
% good 69 ( 3.993 %)
% v-good 65 ( 3.762 %)
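Note that 'unacc' alone accounts for 1210/1728 ≈ 0.70 of the prior mass, while 'good' and 'vgood' each sit below 0.04, so the evidence terms have to overcome a prior ratio of roughly 18:1 before either rare class can win.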
Here is some sample data:
low,low,5more,more,small,low,unacc
low,low,5more,more,small,med,acc
low,low,5more,more,small,high,good
low,low,5more,more,med,low,unacc
low,low,5more,more,med,med,good
low,low,5more,more,med,high,vgood
low,low,5more,more,big,low,unacc
low,low,5more,more,big,med,good
low,low,5more,more,big,high,vgood
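If it is useful for comparison, this is how I would cross-check against an off-the-shelf implementation (a sketch assuming scikit-learn is installed; it integer-codes the nominal attributes, as CategoricalNB expects):

import pandas as pd
from sklearn.naive_bayes import CategoricalNB
from sklearn.preprocessing import OrdinalEncoder

df = pd.DataFrame(list(raw))
X = OrdinalEncoder().fit_transform(df.drop(columns=['class']))  # encode nominal values as integers
y = df['class']
model = CategoricalNB().fit(X, y)
print(model.predict(X[:9]))  # predicted labels for the first few rows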
The complete dataset can be found here.
Topic implementation naive-bayes-classifier python machine-learning
Category Data Science