How to apply entropy discretization to a dataset
I have a simple dataset that I'd like to apply entropy discretization to. The program needs to discretize an attribute based on the following criteria:
When either condition "a" or condition "b" is true for a partition, that partition stops splitting (a minimal sketch of this check is shown after the list):
a- The number of distinct classes within a partition is 1.
b- The ratio of the minimum to the maximum frequency among the distinct values of the attribute Class in the partition is less than 0.5, and the number of distinct values of Class in the partition is floor(n/2), where n is the number of distinct values in the original dataset.
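In code form, the stopping test might look like the following minimal sketch. This is only my reading of the criteria; should_stop, freqs, and n are hypothetical names and are not part of the implementation further below:

from math import floor

def should_stop(freqs, n):
    """freqs: frequencies of the distinct values in the partition;
    n: number of distinct values in the original dataset (assumption)."""
    # Condition a: only one distinct class remains in the partition.
    if len(freqs) == 1:
        return True
    # Condition b: min/max frequency ratio below 0.5 and the partition
    # holds floor(n/2) distinct values.
    return min(freqs) / max(freqs) < 0.5 and len(freqs) == floor(n / 2)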
As an example, suppose the original dataset has 100 records, and these are its distinct values with their frequencies:

v1 10
v2 30
v3 15
v4 20
v5 25
--- n = 5

After one split:

s1:
v1 10
v2 30
v3 15
--- n1 = 3, minf(s1)/maxf(s1) = 10/30 ≈ 0.33

s2:
v4 20
v5 25
--- n2 = 2, minf(s2)/maxf(s2) = 20/25 = 0.8
If minf(s1)/maxf(s1) < 0.5, the first part of condition b is met, and if floor(n/2) == n1, the second part of condition b is met; then the split stops.
In this case v1 has the minimum frequency in s1 and v2 the maximum, so the algorithm should stop splitting.
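Plugging the example frequencies into the sketch above (the lists below are just the example's counts, not real data):

s1_freqs = [10, 30, 15]  # frequencies of v1, v2, v3
s2_freqs = [20, 25]      # frequencies of v4, v5
print(min(s1_freqs) / max(s1_freqs))  # 10/30 ≈ 0.33, below 0.5
print(min(s2_freqs) / max(s2_freqs))  # 20/25 = 0.80, not below 0.5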
Expected: the program returns the best partition based on the maximum information gain.
Actual: an empty dataset is returned.
Here is my implementation:
import pandas as pd
from math import floor, log2


def print_full(x):
    pd.set_option('display.max_rows', len(x))
    print(x)
    pd.reset_option('display.max_rows')


def main():
    s = pd.read_csv('A1-dm.csv')
    print(s)
    print('******************************************************')
    print('Entropy Discretization STARTED')
    s = entropy_discretization(s)
    print('Entropy Discretization COMPLETED')
    print(s)
    print('******************************************************')


def entropy_discretization(s):
    I = {}
    i = 0
    n = s.nunique()['A1']
    s_temp = s
    s1 = pd.DataFrame()
    s2 = pd.DataFrame()
    while uniqueValue(s_temp):
        # Step 1: pick a threshold
        threshold = s_temp['A1'].iloc[0]

        # Step 2: partition the data set into two partitions
        s1 = s[s['A1'] < threshold]
        print('s1 after splitting')
        print(s1)
        print('******************')

        s2 = s[s['A1'] >= threshold]
        print('s2 after splitting')
        print(s2)
        print('******************')

        print('******************')
        print('calculating maxf')
        print(f"maxf {maxf(s['A1'])}")
        print('******************')

        print('******************')
        print('calculating minf')
        print(f"minf {minf(s['A1'])}")
        print('******************')

        # stopping criterion b: frequency ratio and partition size
        if (maxf(s1['A1'])/minf(s1['A1']) < 0.5) and (s_temp.nunique()['A1'] == floor(n/2)):
            print(f"Condition b is met: {maxf(s1['A1'])}/{minf(s1['A1'])} < 0.5 and {s_temp.nunique()['A1']} == {floor(n/2)}")
            break

        # Step 3: calculate the information gain
        informationGain = information_gain(s1, s2, s_temp)
        I.update({f'informationGain_{i}': informationGain, f'threshold_{i}': threshold})
        print(f'added informationGain_{i}: {informationGain}, threshold_{i}: {threshold}')

        # Step 4: drop the current threshold value and repeat
        s_temp = s_temp[s_temp['A1'] != threshold]
        i += 1

    # Step 5: find the threshold with the maximum information gain
    n = int((len(I)/2) - 1)
    print('Calculating maximum threshold')
    print('*****************************')
    maxInformationGain = 0
    maxThreshold = 0
    for i in range(0, n):
        if I[f'informationGain_{i}'] > maxInformationGain:
            maxInformationGain = I[f'informationGain_{i}']
            maxThreshold = I[f'threshold_{i}']

    print(f'maxThreshold: {maxThreshold}, maxInformationGain: {maxInformationGain}')
    s = pd.merge(s1, s2)
    # Step 6: keep the partitions of S based on the value of threshold_i
    return s  # maxPartition(maxInformationGain, maxThreshold, s, s1, s2)


def maxf(s):
    return s.max()


def minf(s):
    return s.min()


def uniqueValue(s):
    # False if all records in s share a single value of A1, True otherwise
    if s.nunique()['A1'] == 1:
        return False
    else:
        return True


def maxPartition(maxInformationGain, maxThreshold, s, s1, s2):
    print(f'informationGain: {maxInformationGain}, threshold: {maxThreshold}')
    merged_partitions = pd.merge(s1, s2)
    merged_partitions = pd.merge(merged_partitions, s)
    print('Best Partition')
    print('***************')
    print(merged_partitions)
    print('***************')
    return merged_partitions


def information_gain(s1, s2, s):
    # calculate cardinality for s1
    cardinalityS1 = len(pd.Index(s1['A1']).value_counts())
    print(f'The Cardinality of s1 is: {cardinalityS1}')
    # calculate cardinality for s2
    cardinalityS2 = len(pd.Index(s2['A1']).value_counts())
    print(f'The Cardinality of s2 is: {cardinalityS2}')
    # calculate cardinality of s
    cardinalityS = len(pd.Index(s['A1']).value_counts())
    print(f'The Cardinality of s is: {cardinalityS}')
    # weighted average of the partition entropies
    informationGain = (cardinalityS1/cardinalityS) * entropy(s1) + (cardinalityS2/cardinalityS) * entropy(s2)
    print(f'The total informationGain is: {informationGain}')
    return informationGain


def entropy(s):
    print('calculating the entropy for s')
    print('*****************************')
    print(s)
    print('*****************************')
    # initialize ent
    ent = 0
    # calculate the number of classes in s
    numberOfClasses = s['Class'].nunique()
    print(f'Number of classes for dataset: {numberOfClasses}')
    value_counts = s['Class'].value_counts()
    p = []
    for i in range(0, numberOfClasses):
        n = s['Class'].count()
        # calculate the frequency of class_i in s
        print(f'p{i} {value_counts.iloc[i]}/{n}')
        f = value_counts.iloc[i]
        pi = f/n
        p.append(pi)
    print(p)
    # Shannon entropy: -sum(p_i * log2(p_i))
    for pi in p:
        ent += -pi*log2(pi)
    return ent


main()
A sample of the data looks like this:
A1,A2,A3,Class
2,0.4631338,1.5,3
8,0.7460648,3.0,3
6,0.264391038,2.5,2
5,0.4406713,2.3,1
2,0.410438159,1.5,3
2,0.302901816,1.5,2
6,0.275869396,2.5,3
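For debugging, the sample can be loaded without a CSV file, and the entropy helper can be sanity-checked by hand; the ≈ 1.379 below is simply -sum(p_i * log2(p_i)) over the Class counts 4, 2 and 1 (io.StringIO is used only to keep the snippet self-contained):

import io
from math import log2
import pandas as pd

csv = """A1,A2,A3,Class
2,0.4631338,1.5,3
8,0.7460648,3.0,3
6,0.264391038,2.5,2
5,0.4406713,2.3,1
2,0.410438159,1.5,3
2,0.302901816,1.5,2
6,0.275869396,2.5,3"""

s = pd.read_csv(io.StringIO(csv))
counts = s['Class'].value_counts()  # Class 3 -> 4, Class 2 -> 2, Class 1 -> 1
h = -sum((c / len(s)) * log2(c / len(s)) for c in counts)
print(h)  # ≈ 1.379; entropy(s) above should print the same value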
Any help understanding why the method returns an empty dataset would be greatly appreciated.