Sklearn Decision Tree as weak learner in Adaboost not working properly
I'm trying to implement the AdaBoost algorithm with an sklearn decision tree as the weak learner: at each step I want to choose one feature with one threshold to classify all samples.
I have 1400 long feature vectors and want to label them 1 or -1. The features are words from movie ratings, and the label indicates whether the rating is bad or good. In some iterations, the decision tree picks a feature with threshold 0.5 and classifies all samples as -1 (regardless of their value); on the next iteration it chooses the same feature, this time classifying the samples as it is supposed to.
Can anyone find a reason for this?
tree prints:
feat: 311
==================
|--- worst <= 0.50
|   |--- class: 1.0
|--- worst >  0.50
|   |--- class: -1.0
==================
alphas = 0.16872595425475514
feat: 27
==================
|--- bad <= 0.50
|   |--- class: 1.0
|--- bad >  0.50
|   |--- class: -1.0
==================
alphas = 0.21421414954211687
feat: 371
==================
|--- boring <= 0.50
|   |--- class: -1.0
|--- boring >  0.50
|   |--- class: -1.0
==================
alphas = 0.1881155411693614
feat: 371
==================
|--- boring <= 0.50
|   |--- class: 1.0
|--- boring >  0.50
|   |--- class: -1.0
==================
alphas = 0.12644785644997397
feat: 822
==================
|--- ridiculous <= 0.50
|   |--- class: -1.0
|--- ridiculous >  0.50
|   |--- class: -1.0
Code:
def run_adaboost(X_train, y_train, T):
    """Run T rounds of AdaBoost with depth-1 decision trees as weak learners.

    Reads the module-level ``vocabulary`` to label features when printing
    each fitted tree.

    Args:
        X_train: Training samples; indexed per sample and handed to the weak
            learner's ``fit``/``score``/``predict``.
        y_train: Labels for ``X_train``. The weight update multiplies label
            by prediction, so labels are presumably +1/-1 -- confirm against
            the data loader.
        T: Number of boosting rounds.

    Returns:
        A ``(hypotheses, alpha_vals)`` pair. ``hypotheses[t]`` is the tuple
        ``(prediction, feature_index, threshold)`` for round ``t``, where
        ``prediction`` is the stump's output for the first training sample
        with the split feature set to 0; ``alpha_vals[t]`` is that round's
        hypothesis weight.
    """
    # Keeps epsilon strictly inside (0, 1) so alpha stays finite even when a
    # stump classifies the weighted sample perfectly (or perfectly wrong).
    error_floor = 1e-10

    hypotheses = []
    alpha_vals = []
    num_of_samples = len(X_train)
    D = [1 / num_of_samples for _ in range(num_of_samples)]
    for _ in range(T):
        h = weak_learner(D, X_train, y_train)
        idx, threshold = h.tree_.feature[0], h.tree_.threshold[0]
        # NOTE(review): this tuple records only the prediction on one side of
        # the split. With weighted samples a stump may assign the same class
        # to both leaves, in which case (prediction, idx, threshold) cannot
        # reproduce h.predict -- verify how the tuples are consumed.
        tup = (get_prediction(h, X_train[0]), idx, threshold)
        feature_names = [vocabulary[col] for col in range(len(X_train[0]))]
        print_tree(h, feature_names)
        hypotheses.append(tup)
        epsilon = 1 - h.score(X_train, y_train, sample_weight=D)
        epsilon = min(max(epsilon, error_floor), 1 - error_floor)
        alpha = 0.5 * np.log((1 - epsilon) / epsilon)
        alpha_vals.append(alpha)
        D = new_distribution(D, X_train, y_train, alpha, h)

    return hypotheses, alpha_vals
##############################################
def weak_learner(D, X_train, y_train):
    """Fit a depth-1 decision tree (a stump) on the weighted training set.

    Args:
        D: Per-sample weights, passed to ``fit`` as ``sample_weight``.
        X_train: Training samples.
        y_train: Training labels.

    Returns:
        The fitted ``DecisionTreeClassifier``.
    """
    # The criterion must be given as a string; a bare name is a NameError.
    clf = tree.DecisionTreeClassifier(max_depth=1, criterion="entropy")
    clf.fit(X_train, y_train, sample_weight=D)
    return clf
def new_distribution(D, X_train, y_train, alpha, h):
    """Return the re-weighted, normalised sample distribution for the next round.

    Each weight is scaled by ``exp(-alpha * y_i * h(x_i))`` and the result is
    divided by its sum so the weights add up to 1. Also prints ``alpha``.

    Args:
        D: Current per-sample weights.
        X_train: Training samples, passed to ``h.predict``.
        y_train: Training labels, aligned with ``X_train``.
        alpha: Weight of the current hypothesis.
        h: Fitted classifier exposing ``predict``.

    Returns:
        A list of floats with the updated weights, in sample order.
    """
    print(f"alphas = {alpha}")
    pred = h.predict(X_train)
    # y * h(x) is positive when the prediction agrees with the label, so
    # correctly classified samples shrink and misclassified ones grow.
    margins = np.asarray(y_train) * np.asarray(pred)
    unnormalized = np.asarray(D, dtype=float) * np.exp(-alpha * margins)
    return (unnormalized / unnormalized.sum()).tolist()
def get_prediction(clf, vector):
    """Return the stump's prediction for ``vector`` with the split feature zeroed.

    Prints the index of the root split feature, then predicts on a copy of
    ``vector`` whose entry for that feature is set to 0. The caller's array
    is left untouched.

    Args:
        clf: Fitted tree classifier exposing ``tree_.feature`` and ``predict``.
        vector: 1-D sample supporting ``copy`` and ``reshape``.

    Returns:
        The predicted label as an ``int``.
    """
    feat = clf.tree_.feature[0]
    print(f"feat: {feat}")
    vec = vector.copy()
    # Presumably forces the sample onto the "<= threshold" side of the
    # split -- this only holds for a non-negative threshold; confirm.
    vec[feat] = 0
    return int(clf.predict(vec.reshape(1, -1))[0])
def print_tree(clf, feat_name):
    """Print a text rendering of a fitted tree followed by a separator line.

    Args:
        clf: Fitted decision tree to render.
        feat_name: Feature names, one per input feature, used in place of
            feature indices in the rendering.
    """
    # feature_names is passed by keyword: recent scikit-learn releases no
    # longer accept it positionally.
    r = tree.export_text(clf, feature_names=feat_name)
    print(r)
    print("==================")
##############################################
def main():
    """Load the data set, publish the shared globals, and run the boosting loop.

    Does nothing when ``parse_data`` returns a falsy value. Otherwise stores
    the test split and the vocabulary in module-level globals and runs
    80 rounds of ``run_adaboost`` on the training split.
    """
    global vocabulary, X_test_g, y_test_g

    loaded = parse_data()
    if loaded:
        # Unpack straight into the globals the other functions read.
        X_train, y_train, X_test_g, y_test_g, vocabulary = loaded
        rounds = 80
        run_adaboost(X_train, y_train, rounds)


# Run the experiment only when executed as a script, not on import.
if __name__ == '__main__':
    main()
Topic adaboost learning decision-trees scikit-learn machine-learning
Category Data Science