TF Keras Text Processing - Classification Model
I'm trying to put together a script that classifies comments into either adequate or inadequate. I put a question up here earlier with all my code, but I think I've isolated the problem down into the setup of the model, so I deleted that one, and hopefully this is more streamlined and easy to follow. The example i'm trying to follow is the classic IMDB comment, where the comments are either positive or negative, but again in my instance, adequate or not. My tokenizer and text cleanup and padding are working well, so i have my training dataset that returns my tokenized sequences properly. I think where i'm going wrong is as follows:
model = tf.keras.Sequential([
tf.keras.layers.Embedding(10000, 300),
tf.keras.layers.GlobalAveragePooling1D(),
tf.keras.layers.Dense(1, activation = 'sigmoid')])
model.summary()
model.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy'])
es = tf.keras.callbacks.EarlyStopping(monitor = 'val_accuracy', mode = 'max')
callbacks = [es]
history = model.fit(train_seqs, train_df['adq'].values,
batch_size = BATCH_SIZE,
epochs = EPOCHS,
verbose = 2,
validation_split = 0.2,
callbacks = callbacks)
model.evaluate(test_seqs, test_df['adq'].values)
My main problem with the output, is that when i run the model to predict comments with no classification, the model is returning the same output value for every single comment. I've done some research, and people have suggested normalizing the batch, and i tried adding a layer for batch normalization in my model, but that doesn't seem to help either. Can someone please have a look and show me where I'm going wrong? Thanks very much for your help!
Here's my entire script per my comment below:
import pandas as pd
import tensorflow as tf
import pickle
import string
import re
NUM_WORDS = 10000
SEQ_LEN = 512
EMBEDDING_SIZE = 300
BATCH_SIZE = 70
EPOCHS = 20
HIGHEST_PROTOCOL = 3
THRESHOLD = 0.60
train_df = pd.read_csv(r'C:\Users\peter\OneDrive\Documents\IMDBtrain.csv')
test_df = pd.read_csv(r'C:\Users\peter\OneDrive\Documents\IMDBtest.csv')
def clean_text(text, remove_stopwords=True):
'''Clean the text, with the option to remove stopwords'''
# Convert words to lower case and split them
text = text.lower().split()
# Optionally, remove stop words
if remove_stopwords:
stops = ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', 'your', 'yours',
'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', 'her', 'hers', 'herself',
'its', 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themslves', 'what', 'which',
'who', 'whom', 'this', 'that', 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be',
'been', 'be', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a',
'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at',
'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before',
'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over',
'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why',
'how', 'all', 'any', 'both', 'each', 'few', 'most', 'more', 'other', 'some', 'such', 'no',
'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will',
'just', 'don', 'should', 'now']
text = [w for w in text if not w in stops]
text = " ".join(text)
# Clean the text
text = re.sub(r"br /", " ", text)
text = re.sub(r"[^a-z]", " ", text)
text = re.sub(r" ", " ", text) # Remove any extra spaces
text = re.sub(r" ", " ", text)
# Return a list of words
return(text)
train_df["text"] = train_df["text"].apply(lambda x: clean_text(x))
test_df["text"] = test_df["text"].apply(lambda x: clean_text(x))
train_df = train_df.sample(frac = 1).reset_index(drop = True)
test_df = test_df.sample(frac = 1).reset_index(drop = True)
tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words = NUM_WORDS, oov_token = 'UNK')
tokenizer.fit_on_texts(train_df['text'])
train_seqs = tokenizer.texts_to_sequences(train_df['text'])
test_seqs = tokenizer.texts_to_sequences(test_df['text'])
train_seqs = tf.keras.preprocessing.sequence.pad_sequences(train_seqs, maxlen = SEQ_LEN, padding = 'post')
test_seqs = tf.keras.preprocessing.sequence.pad_sequences(test_seqs, maxlen = SEQ_LEN, padding = 'post')
model = tf.keras.Sequential([
tf.keras.layers.Embedding(NUM_WORDS, EMBEDDING_SIZE),
tf.keras.layers.GlobalAveragePooling1D(),
tf.keras.layers.Dense(1, activation = 'sigmoid')])
model.summary()
model.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy'])
es = tf.keras.callbacks.EarlyStopping(monitor = 'val_accuracy', mode = 'max')
callbacks = [es]
history = model.fit(train_seqs, train_df['adq'].values,
batch_size = BATCH_SIZE,
epochs = EPOCHS,
validation_split = 0.2,
callbacks = callbacks)
model.evaluate(test_seqs, test_df['adq'].values)
model.save('model.ps1')
with open('tokenizer.pickle', 'wb') as handle:
pickle.dump(tokenizer, handle, protocol = pickle.HIGHEST_PROTOCOL)
del model
del tokenizer
loaded_model = tf.keras.models.load_model('model.ps1')
with open('tokenizer.pickle', 'rb') as f:
loaded_tokenizer = pickle.load(f)
def prepare_predict_data(tokenizer, comments):
seqs = tokenizer.texts_to_sequences(comments)
seqs = tf.keras.preprocessing.sequence.pad_sequences(seqs, maxlen = SEQ_LEN, padding = 'post')
return seqs
comments_to_pred = pd.read_csv(r'C:\Users\peter\OneDrive\Documents\IMDBload.csv')
my_comments = comments_to_pred.to_numpy().tolist()
my_seqs = prepare_predict_data(loaded_tokenizer, my_comments)
preds = loaded_model.predict(my_seqs)
pred_df = pd.DataFrame(columns = ['text', 'adq'])
pred_df['text'] = my_comments
pred_df['adq'] = preds
print(pred_df.head(20))
pred_df['adq'] = pred_df['adq'].apply(lambda x: 'pos' if x THRESHOLD else 'neg')
#print(pred_df.head(40))
```
Topic text-classification keras tensorflow predictive-modeling
Category Data Science