How to apply entropy discretization to a dataset
I have a simple dataset that I'd like to apply entropy discretization to. The program needs to discretize an attribute based on the following criteria:
When either condition "a" or condition "b" is true for a partition, that partition stops splitting (a minimal sketch of this check is shown after the list):
a- The number of distinct classes within a partition is 1.
b- The ratio of the minimum to the maximum frequency among the distinct values of the attribute Class in the partition is less than 0.5, and the number of distinct values of Class in the partition is floor(n/2), where n is the number of distinct values in the original dataset.
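In code form, the stopping test might look like the following minimal sketch. This is only my reading of the criteria; should_stop, freqs, and n are hypothetical names and are not part of the implementation further below:

from math import floor

def should_stop(freqs, n):
    """freqs: frequencies of the distinct values in the partition;
    n: number of distinct values in the original dataset (assumption)."""
    # Condition a: only one distinct class remains in the partition.
    if len(freqs) == 1:
        return True
    # Condition b: min/max frequency ratio below 0.5 and the partition
    # holds floor(n/2) distinct values.
    return min(freqs) / max(freqs) < 0.5 and len(freqs) == floor(n / 2)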
As an example, suppose the original dataset has 100 records, and these are its distinct values with their frequencies:

v1 10
v2 30
v3 15
v4 20
v5 25
--- n = 5

After one split:

s1:
v1 10
v2 30
v3 15
--- n1 = 3, minf(s1)/maxf(s1) = 10/30 ≈ 0.33

s2:
v4 20
v5 25
--- n2 = 2, minf(s2)/maxf(s2) = 20/25 = 0.8
If minf(s1)/maxf(s1) < 0.5, the first part of condition b is met, and if floor(n/2) == n1, the second part of condition b is met; then the split stops.
In this case v1 has the minimum frequency in s1 and v2 the maximum, so the algorithm should stop splitting.
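Plugging the example frequencies into the sketch above (the lists below are just the example's counts, not real data):

s1_freqs = [10, 30, 15]  # frequencies of v1, v2, v3
s2_freqs = [20, 25]      # frequencies of v4, v5
print(min(s1_freqs) / max(s1_freqs))  # 10/30 ≈ 0.33, below 0.5
print(min(s2_freqs) / max(s2_freqs))  # 20/25 = 0.80, not below 0.5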
Expected: the program returns the best partition based on the maximum information gain.
Actual: an empty dataset is returned.
Here is my implementation:
import pandas as pd
from math import floor, log2


def print_full(x):
    pd.set_option('display.max_rows', len(x))
    print(x)
    pd.reset_option('display.max_rows')


def main():
    s = pd.read_csv('A1-dm.csv')
    print(s)
    print('******************************************************')
    print('Entropy Discretization STARTED')
    s = entropy_discretization(s)
    print('Entropy Discretization COMPLETED')
    print(s)
    print('******************************************************')


def entropy_discretization(s):
    I = {}
    i = 0
    n = s.nunique()['A1']
    s_temp = s
    s1 = pd.DataFrame()
    s2 = pd.DataFrame()
    while uniqueValue(s_temp):
        # Step 1: pick a threshold
        threshold = s_temp['A1'].iloc[0]

        # Step 2: partition the data set into two partitions
        s1 = s[s['A1'] < threshold]
        print('s1 after splitting')
        print(s1)
        print('******************')

        s2 = s[s['A1'] >= threshold]
        print('s2 after splitting')
        print(s2)
        print('******************')

        print('******************')
        print('calculating maxf')
        print(f"maxf {maxf(s['A1'])}")
        print('******************')

        print('******************')
        print('calculating minf')
        print(f"minf {minf(s['A1'])}")
        print('******************')

        # stopping criterion b: frequency ratio and partition size
        if (maxf(s1['A1'])/minf(s1['A1']) < 0.5) and (s_temp.nunique()['A1'] == floor(n/2)):
            print(f"Condition b is met: {maxf(s1['A1'])}/{minf(s1['A1'])} < 0.5 and {s_temp.nunique()['A1']} == {floor(n/2)}")
            break

        # Step 3: calculate the information gain
        informationGain = information_gain(s1, s2, s_temp)
        I.update({f'informationGain_{i}': informationGain, f'threshold_{i}': threshold})
        print(f'added informationGain_{i}: {informationGain}, threshold_{i}: {threshold}')

        # Step 4: drop the current threshold value and repeat
        s_temp = s_temp[s_temp['A1'] != threshold]
        i += 1

    # Step 5: find the threshold with the maximum information gain
    n = int((len(I)/2) - 1)
    print('Calculating maximum threshold')
    print('*****************************')
    maxInformationGain = 0
    maxThreshold = 0
    for i in range(0, n):
        if I[f'informationGain_{i}'] > maxInformationGain:
            maxInformationGain = I[f'informationGain_{i}']
            maxThreshold = I[f'threshold_{i}']

    print(f'maxThreshold: {maxThreshold}, maxInformationGain: {maxInformationGain}')
    s = pd.merge(s1, s2)
    # Step 6: keep the partitions of S based on the value of threshold_i
    return s  # maxPartition(maxInformationGain, maxThreshold, s, s1, s2)


def maxf(s):
    return s.max()


def minf(s):
    return s.min()


def uniqueValue(s):
    # False if all records in s share a single value of A1, True otherwise
    if s.nunique()['A1'] == 1:
        return False
    else:
        return True


def maxPartition(maxInformationGain, maxThreshold, s, s1, s2):
    print(f'informationGain: {maxInformationGain}, threshold: {maxThreshold}')
    merged_partitions = pd.merge(s1, s2)
    merged_partitions = pd.merge(merged_partitions, s)
    print('Best Partition')
    print('***************')
    print(merged_partitions)
    print('***************')
    return merged_partitions


def information_gain(s1, s2, s):
    # calculate cardinality for s1
    cardinalityS1 = len(pd.Index(s1['A1']).value_counts())
    print(f'The Cardinality of s1 is: {cardinalityS1}')
    # calculate cardinality for s2
    cardinalityS2 = len(pd.Index(s2['A1']).value_counts())
    print(f'The Cardinality of s2 is: {cardinalityS2}')
    # calculate cardinality of s
    cardinalityS = len(pd.Index(s['A1']).value_counts())
    print(f'The Cardinality of s is: {cardinalityS}')
    # weighted average of the partition entropies
    informationGain = (cardinalityS1/cardinalityS) * entropy(s1) + (cardinalityS2/cardinalityS) * entropy(s2)
    print(f'The total informationGain is: {informationGain}')
    return informationGain


def entropy(s):
    print('calculating the entropy for s')
    print('*****************************')
    print(s)
    print('*****************************')
    # initialize ent
    ent = 0
    # calculate the number of classes in s
    numberOfClasses = s['Class'].nunique()
    print(f'Number of classes for dataset: {numberOfClasses}')
    value_counts = s['Class'].value_counts()
    p = []
    for i in range(0, numberOfClasses):
        n = s['Class'].count()
        # calculate the frequency of class_i in s
        print(f'p{i} {value_counts.iloc[i]}/{n}')
        f = value_counts.iloc[i]
        pi = f/n
        p.append(pi)
    print(p)
    # Shannon entropy: -sum(p_i * log2(p_i))
    for pi in p:
        ent += -pi*log2(pi)
    return ent


main()
A sample of the data looks like this:
A1,A2,A3,Class
2,0.4631338,1.5,3
8,0.7460648,3.0,3
6,0.264391038,2.5,2
5,0.4406713,2.3,1
2,0.410438159,1.5,3
2,0.302901816,1.5,2
6,0.275869396,2.5,3
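For debugging, the sample can be loaded without a CSV file, and the entropy helper can be sanity-checked by hand; the ≈ 1.379 below is simply -sum(p_i * log2(p_i)) over the Class counts 4, 2 and 1 (io.StringIO is used only to keep the snippet self-contained):

import io
from math import log2
import pandas as pd

csv = """A1,A2,A3,Class
2,0.4631338,1.5,3
8,0.7460648,3.0,3
6,0.264391038,2.5,2
5,0.4406713,2.3,1
2,0.410438159,1.5,3
2,0.302901816,1.5,2
6,0.275869396,2.5,3"""

s = pd.read_csv(io.StringIO(csv))
counts = s['Class'].value_counts()  # Class 3 -> 4, Class 2 -> 2, Class 1 -> 1
h = -sum((c / len(s)) * log2(c / len(s)) for c in counts)
print(h)  # ≈ 1.379; entropy(s) above should print the same value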
Any help understanding why the method returns an empty dataset would be greatly appreciated.