How to calculate the information conveyed in a message for a given dataset

Given the following data sets:

Test Set


Training Set


I'd like to calculate the message conveyed (information gained) via $MC = -p_1\log_2(p_1) - p_2\log_2(p_2)$, where $p_1$ and $p_2$ are the probabilities of assigning class 1 or class 2. Ideally, I'd like to do this for $n$ classes: $MC = -p_1\log_2(p_1) - p_2\log_2(p_2) - \dots - p_n\log_2(p_n)$.

This calculation belongs at Step 1 in the code below.

from numpy.core.defchararray import count
import pandas as pd
import numpy as np
import numpy as np
from math import ceil, floor, log2
from sklearn.decomposition import PCA
from numpy import linalg as LA
from sklearn.tree import DecisionTreeClassifier

def calculate_metrics(tp, tn, fn, p, n, fp):
    """Compute and display the metrics of a classifier on its test set.

    Calculates accuracy, error rate, sensitivity, precision and specificity
    from the confusion-matrix counts and passes them to ``display_metrics``.

    Args:
        tp: Number of true positives.
        tn: Number of true negatives.
        fn: Number of false negatives.
        p: Number of positive records.
        n: Number of negative records.
        fp: Number of false positives.

    Returns:
        The tuple ``(accuracy, error_rate, sensitivity, precision,
        specificity)``. A metric whose denominator is zero is reported as
        ``nan`` rather than raising ``ZeroDivisionError``.
    """
    def ratio(numerator, denominator):
        # With no records in the denominator the metric is undefined.
        return numerator / denominator if denominator else float("nan")

    total = p + n
    # The sums need their own parentheses: `tp + tn / total` would divide
    # only `tn` by the total and then add the raw `tp` count.
    accuracy = ratio(tp + tn, total)
    error_rate = ratio(fp + fn, total)
    sensitivity = ratio(tp, p)
    precision = ratio(tp, tp + fp)
    specificity = ratio(tn, n)

    display_metrics(accuracy, error_rate, sensitivity, precision, specificity)
    return accuracy, error_rate, sensitivity, precision, specificity

def display_metrics(accuracy, error_rate, sensitivity, precision, specificity):
    """Print the five classifier metrics on one comma-separated line."""
    report = (
        f'Accuracy: {accuracy}, Error_rate:{error_rate}, '
        f'Sensitivity:{sensitivity}, Precision:{precision}, '
        f'specificity:{specificity}'
    )
    print(report)

def ID3(threshold,g):
    """Outline of an ID3 run with pre- and post-pruning on the Assignment 2 data.

    Loads the ID3 training and test sets and walks through the pruning
    checks. Tree construction and rule extraction are not implemented yet,
    so every count below is still a zero placeholder.

    Args:
        threshold: Pre-pruning cut-off compared against alpha. A number or a
            numeric string (as returned by ``input``).
        g: Post-pruning parameter; must satisfy 0 <= g <= 0.15. A number or
            a numeric string.

    Raises:
        ValueError: If ``g`` is outside [0, 0.15] or either argument is not
            numeric.
    """
    # The prompts hand these over as strings; convert before comparing.
    threshold = float(threshold)
    g = float(g)

    # the parameter g must satisfy 0 <= g <= 0.15
    if g < 0 or g > 0.15:
        raise ValueError(f"g must satisfy 0 <= g <= 0.15, got {g}")

    # use the training set to predict the test set.
    # use the Assignment 2--Training set to extract rules and test the quality
    # of the extracted rules against the Assignment 2--Test set for ID3.
    test_set = pd.read_csv("Assignment 2--Test set for ID3.csv")
    training_set = pd.read_csv("Assignment 2--Training set for ID3.csv")

    print(f'test_set: {test_set}')
    print(f'training_set: {training_set}')
    # Step 1 - Calculate the MC (Message Conveyed) for the given data set in
    # reference to the class attribute
    # MC = -p1*log2(p1) - p2*log2(p2)
    # For n classes MC = -p1*log2(p1) - p2*log2(p2) - ... - pn*log2(pn)

    # leaf generated from the decision tree.
    # TODO: replace with the real leaf size once the tree is built.
    F1 = 0

    # c1 is the count of records with the dominant class in F1.
    # TODO: determine the number of records with the dominant class in F1.
    c1 = 0

    # alpha = c1 / |F1|
    # F1 is one of the unique values of a given attribute.
    # An empty leaf has no dominant-class fraction, so avoid dividing by zero.
    alpha = c1 / abs(F1) if F1 else 0.0

    # N: the number of records in the test set that are correctly classified
    # by the rules extracted from the tree before removal.
    # TODO: compute from the extracted rules.
    N = 0

    # M: the number of records in the test set that are correctly classified
    # by the rules extracted from the tree (after removal).
    # TODO: compute from the extracted rules.
    M = 0

    # NOTE(review): Q was never defined in the original; presumably it is the
    # total number of records in the test set -- confirm.
    Q = len(test_set)

    # k is the total number of branches in the subtree
    # TODO: determine the total number of branches in the subtree.
    k = 0

    # Pre-pruning.
    # NOTE(review): the comparison operator was lost in the original text;
    # ">=" is assumed here -- confirm against the assignment.
    if alpha >= threshold:
        # TODO: stop splitting tree
        pass

    # Post-pruning criterion.
    # NOTE(review): the comparison operator was lost in the original text;
    # "<" is assumed here -- confirm against the assignment.
    if Q and (N - M) / Q < g * k:
        # TODO: remove subtree
        pass

    # true positive
    tp = 0
    # true negative
    tn = 0
    # false negative
    fn = 0
    # positive
    p = 0
    # negative
    n = 0
    # false positive
    fp = 0

    # The metrics divide by p, n and tp + fp, so skip them while those counts
    # are still zero placeholders.
    if p and n and (tp + fp):
        calculate_metrics(tp, tn, fn, p, n, fp)

def BayesClassifier():
    """Load the Assignment 2 training and test sets for the Bayes classifier.

    Only the data loading is present; the classification step is still TODO.
    """
    # use the Assignment 2--Training set for Bayes as the training set to
    # classify the records of the Assignment 2--Test set for Bayes.
    # The file names must be string literals; the bare text was a syntax error.
    test_set = pd.read_csv("Assignment 2--Test set for Bayes.csv")
    training_set = pd.read_csv("Assignment 2--Training set for Bayes.csv")

# prompt user to select either ID3 or Bayes classifier.
selection = input("Please enter your selection for either ID3 or Bayes classification: ")
# input() returns text; the pruning parameters are compared numerically.
threshold = float(input("Please enter a threshold: "))
g         = float(input("Please enter a value for g: "))

# Dispatch to the classifier the user asked for.
if selection == "ID3":
    ID3(threshold, g)
elif selection == "Bayes":
    BayesClassifier()

Any help with this would be greatly appreciated.

Topic information-theory data-mining

Category Data Science

I was able to implement an outline for processing the tree:

from numpy.core.defchararray import count
import pandas as pd
import numpy as np
import numpy as np
import math
from math import ceil, floor, log2
from sklearn.decomposition import PCA
from numpy import linalg as LA
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB

# Step 1- Calculate MC (Message Conveyed) for the given dataset (let us call it file TF) in reference to  the class attribute 
# MC(TF) = -p1*log2(p1) - p2*log2(p2) 
def mc(classAttribute,attribute,training_set):
    """Return the message conveyed (Shannon entropy, in bits).

    Args:
        classAttribute: Name of the column to examine.
        attribute: ``None`` to measure the whole column, or a value of that
            column to restrict the data to the rows holding it.
        training_set: DataFrame containing the column.

    Returns:
        ``-sum(p * log2(p))`` over the relative frequencies ``p``. With
        ``attribute=None`` the frequencies are those of the column's values.
        Otherwise they are the frequencies of the distinct rows among the
        rows where the column equals ``attribute``.
    """
    # Compare against None explicitly: a plain truthiness test would skip the
    # filter for legitimate values such as 0, False or "".
    if attribute is None:
        observations = training_set[classAttribute]
    else:
        # NOTE(review): this keeps every column of the matching rows, so the
        # entropy is taken over whole rows rather than a single class column
        # -- confirm that is the intended conditional measure.
        observations = training_set[training_set[classAttribute] == attribute]

    probs = observations.value_counts(normalize=True)
    messageConveyed = -1*np.sum(np.log2(probs)*probs)
    return messageConveyed

def wmc(classAttribute,attribute,training_set):
    """Return the fraction of rows whose `classAttribute` column equals `attribute`.

    This is the weight applied to a value's message conveyed when the
    weighted sum is formed.
    """
    column = training_set[classAttribute]
    # int() keeps the division in plain Python numbers.
    matching = int((column == attribute).sum())
    return matching / len(column)

def ID3(root,training_set,test_set):
    """Outline of one ID3 step: choose the best column and inspect its values.

    For every column the gain is the column's message conveyed minus the
    weighted sum over its values; the column with the highest gain becomes
    the root. Each distinct root value (a "leaf") is then marked as
    "split" or "no split". Progress is printed; nothing is returned.

    Args:
        root: Ignored on entry; rebound to the winning column name.
        training_set: DataFrame whose columns are the candidate attributes.
        test_set: Not used yet.
    """
    highestGainAttribute = ""
    highestGainValue     = -math.inf
    # Iterate the column labels directly: DataFrame.iteritems() was removed in
    # pandas 2.0 and the column values were never used here anyway.
    for classAttribute in training_set.columns:
        # Keep the column-level MC in its own name so the per-value MC
        # computed in the inner loop cannot overwrite it.
        columnMC = mc(classAttribute, attribute=None, training_set=training_set)
        print(f"{classAttribute} mc: {columnMC}")

        weightedMessageConveyed = 0
        for attribute in training_set[classAttribute].unique():
            weight = wmc(classAttribute, attribute, training_set)
            valueMC = mc(classAttribute, attribute, training_set)
            print(f"wmc({attribute}) = {weight}")
            weightedMessageConveyed += weight*valueMC

        print(f'wmc({classAttribute}) = {weightedMessageConveyed}')
        gain = columnMC - weightedMessageConveyed
        print(f'MC - wmc({classAttribute}) = {columnMC} - {weightedMessageConveyed} = {gain}')
        if gain > highestGainValue:
            highestGainAttribute = classAttribute
            highestGainValue     = gain
    print(f'winner is {highestGainAttribute} with gain of {highestGainValue}')

    root = highestGainAttribute
    splits = {}
    for leaf in training_set[root].unique():
        print(f'leaf: {leaf} of root: {root}')
        # NOTE(review): is_unique is True only when no value repeats in the
        # selected root column; since every selected row holds `leaf`, that
        # means at most one row. Confirm whether the intended test is
        # "all records share the same class" instead.
        if training_set[training_set[root] == leaf][root].is_unique:
            print(f'all of the records for leaf: {leaf} are the same. NO SPLIT')
            splits[leaf] = "no split"
        else:
            # This branch was missing, so "SPLIT" printed for every leaf and
            # no leaf was ever recorded as needing a split.
            print(f'all of the records for leaf: {leaf} are NOT the same. SPLIT')
            splits[leaf] = "split"

    for leaf,split in splits.items():
        if split == "split":
            print(f"setting {leaf} as the new dataset")
            # After the first reduction the root column is gone, so later
            # leaves leave the data untouched.
            # TODO: process each leaf's subset separately.
            if root in training_set:
                training_set = training_set[training_set[root] == leaf].drop(columns=root)

# Load the ID3 data: rules are extracted from the training set and their
# quality is judged against the test set.
test_set_ID3 = pd.read_csv("Assignment 2--Test set for ID3.csv")
training_set_ID3 = pd.read_csv("Assignment 2--Training set for ID3.csv")

# Fixed run configuration; the interactive prompts for the classifier choice,
# the threshold and g are disabled for now.
selection, threshold, g = "ID3", 0.9, 0.05

root = ""
if selection == "ID3":
    print("TRAINING SET", "TEST SET", sep="\n")


Geeks Mental is a community that publishes articles and tutorials about Web, Android, Data Science, new techniques and Linux security.