Issues with self-implemented logistic regression

I am implementing a logistic regression algorithm from scratch as a self-learning exercise, but I am having trouble achieving accuracy similar to sklearn's logistic regression.

Here is the code I am using (the dataset is the Titanic 'train.csv' dataset from Kaggle, which you can download here if you want to test this out yourself).

import numpy as np
import random
import matplotlib.pyplot as plt
#%matplotlib inline

def cost(X, Y, W):
    """
    X = matrix of features
    Y = vector of ground truth labels
    W = vector of weight parameters
    """
    m = len(Y)
    if isinstance(Y, list):
        Y = np.array(Y)
    return -(1/m) * np.sum([Y*np.log(sigmoid(W, X)), (1-Y)*np.log(1-sigmoid(W, X))])

def sigmoid(w, x):
    """Hypothesis function of the logistic regression.

    w = column vector of weights
    x = column vector of features
    """
    z = np.dot(w.T, x)
    return 1 / (1 + np.exp(-z))

def grad_descent(A, w, y,
                 lr = 0.01,
                 stochastic = False,
                 max_iter = 1000,
                 mute = True,
                 plot = True):
    """
    A = design matrix
    w = weights column vector
    y = ground truth labels
    lr = learning rate
    stochastic = whether to use stochastic gradient descent or not
    max_iter = maximum number of epochs to train for
    mute = whether to print the current epoch to the screen
    plot = whether to plot the loss function after training ends
    """
    if not isinstance(A, np.ndarray):
        m = "A must be a numpy array, got %s"
        raise TypeError(m % type(A).__name__)
    
    if isinstance(y, list):
        y = np.array(y)
        y = y.T
        y = np.expand_dims(y, axis = 1)
    if isinstance(w, list):
        w = np.array(w)
        # Make w a column vector
        w.shape = (A.shape[1], 1)
    
    losses = []
    i = 0
    
    while i < max_iter:
        old_weights = w
        # create/update the alpha vector
        alpha = [sigmoid(w, A[i, :]) for i in range(A.shape[0])]
        if not mute:
            print("Epoch %d" % (i+1))
        
        if stochastic:
            # stochastic grad descent chooses a training point at random
            # so here we choose a random row from the matrix A
            rand = random.randint(0, A.shape[0]-1)
            # select random entries
            temp_A = A[rand].T
            temp_A = temp_A.reshape(A.shape[1], 1)
            temp_b = alpha[rand] - y[rand]
            
            # Calc gradient
            grad = np.dot(temp_A, (temp_b))
            # Update weights
            w = (w.T - (lr * grad)).T
            
        # perform batch gradient descent
        else:
            # number of samples
            m = len(y)
            # Calc gradient
            grad = (1/m) * np.dot(A.T, (alpha - y))
            # Update weights
            w = w - (lr * grad)
        
        if i != 0:
            # if loss starts increasing then stop
            if cost(A.T, y, w) > losses[-1]:
                print("Stopping at epoch %d" % i)
                if plot:
                    print('Losses')
                    plt.plot(losses)
                return old_weights
        
        # Track the loss function value
        losses.append(cost(A.T, y, w))
        
        # iterate epoch counter
        i += 1
        
    print("Stopping at epoch %d" % i)
    if plot:
        print('Losses')
        plt.plot(losses)
    
    return w


#############################################################################
#############################################################################
#############################################################################
if __name__ == "__main__":

    import pandas as pd

    train = pd.read_csv(r'C:\Users\LENOVO\Documents\Self_Study\titanic\train.csv')
    
    # convert the Sex column to zeros and ones
    train['Sex'] = train['Sex'].map({'female': 1, 'male': 0})
    
    # There are some zero values in the fare column, replace these with a more likely value
    rows = np.where(train.Fare == np.min(train.Fare))
    # assign the mean fare value for the given class these people were staying in
    class_ = train.iloc[rows[0], 2].values

    for clas, row in zip(class_, rows[0]):
        
        # get the mean
        Pclass = train.loc[(train['Pclass'] == clas)]
        c_mean = np.mean(Pclass['Fare'])
        # assign the value to the proper row
        train.iloc[row, 9] = c_mean

        
    train.head()
    
    
    # set a learning rate
    lr = 0.01
    
    sexes = train.Sex
    fares = train.Fare
    
    # scale the fare value by dividing by the highest value so we get a range of 0 - 1
    fares = fares/np.max(fares)
    
    # put into matrix format
    A = np.array([[1, s, f] for s, f in zip(sexes, fares)])
    # create initial weights
    w = [0, 0, 0]
    # get the ground truth
    y = list(train.Survived)
    
    # train the model
    weights = grad_descent(A, w, y, lr = 0.01,
                           stochastic = False)
    
    # Lets use these weights to make predictions on the training data and see how it looks
    def classification(weights, features):
        prob = sigmoid(weights, features)
        if prob > .5:
            return 1
        else:
            return 0
    
    correct = 0
    
    for i, row in train.iterrows():
        fare = row['Fare']
        sex = row['Sex']
        A = np.array([[1, sex, fare]])
        A.shape = (1, 3)
        pred = classification(weights, A[0,:])
            
        if row['Survived'] == pred:
            correct += 1
    
    print(correct/len(train.index))


In the end I get around 65% accuracy, while with sklearn I can get 78% accuracy. I understand that sklearn's implementation is likely much more sophisticated than mine, but I was hoping I could at least come close (maybe in the 70s). Any advice?

Topic implementation logistic-regression scikit-learn python machine-learning

Category Data Science


You are using (stochastic) gradient descent. For that to work properly, the learning rate (step size) must be set correctly. I assume that the error lies there.
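
To check the step size empirically, you could run a small sweep and keep the rate with the lowest final cost. A rough, untested sketch that reuses the grad_descent/cost functions and the A and y you built in the question (the candidate values below are arbitrary):

# hypothetical learning-rate sweep; assumes grad_descent, cost, A and y from the question are in scope
best_lr, best_cost = None, float('inf')
for candidate_lr in [0.001, 0.01, 0.1, 0.5, 1.0]:
    w_trained = grad_descent(A, [0, 0, 0], y, lr=candidate_lr,
                             stochastic=False, plot=False)
    final_cost = cost(A.T, y, w_trained)   # cost on the full training set with the trained weights
    if final_cost < best_cost:
        best_lr, best_cost = candidate_lr, final_cost
print("best learning rate: %s (final cost %.4f)" % (best_lr, best_cost))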

Instead, you could try fitting the logistic regression via IRLS (iteratively reweighted least squares); see its definition, and compare also IRLS vs. gradient descent.
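
For illustration, here is a minimal IRLS/Newton sketch of the update w ← w + (XᵀSX)⁻¹Xᵀ(y − p). This is my own rough version, not tested on your data; the function name irls_logistic and the small ridge term are my additions for numerical stability:

import numpy as np

def irls_logistic(X, y, n_iter=20, ridge=1e-6):
    # X: (m, n) design matrix with a leading column of ones for the intercept
    # y: (m,) array of 0/1 labels
    w = np.zeros(X.shape[1])
    for _ in range(n_iter):
        p = 1.0 / (1.0 + np.exp(-X @ w))        # current predicted probabilities
        S = p * (1.0 - p)                        # diagonal of the IRLS weight matrix
        H = X.T @ (X * S[:, None]) + ridge * np.eye(X.shape[1])  # Hessian X^T S X (+ small ridge)
        g = X.T @ (y - p)                        # gradient of the log-likelihood
        w = w + np.linalg.solve(H, g)            # Newton / IRLS step
    return w

With the A and y from your question (y converted to a NumPy array of 0/1), something like irls_logistic(A, np.array(y)) should converge in a handful of iterations.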

Or, for the input you tested, you may simply have found a bad local optimum.
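
Finally, as a sanity check on the 78% figure, you could fit sklearn's LogisticRegression on exactly the same two features and compare training accuracy. A minimal sketch, assuming the train DataFrame has been prepared as in your question:

from sklearn.linear_model import LogisticRegression

X_sk = train[['Sex', 'Fare']].values   # same two features used in the hand-rolled model
y_sk = train['Survived'].values
clf = LogisticRegression().fit(X_sk, y_sk)
print("sklearn training accuracy:", clf.score(X_sk, y_sk))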
