How to build a vocabulary file for NLP embeddings efficiently?
I am currently building various word embeddings for my NLP project, ranging from Word2Vec and ELMo to LINE.
I am looking to train ELMo with AllenNLP, a Python package for NLP, following the tutorial here. To improve efficiency during training, the tutorial recommends supplying a vocab file covering the entire corpus, similar to this; a snippet is shown below. (FYI: the first 3 tokens represent the end-of-sentence, start-of-sentence and unknown tokens; tokens that appear more frequently are placed nearer the start of the file.)
</S>
<S>
<UNK>
the
,
.
to
of
and
a
in
's
that
...
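For context, a file in that shape is just the corpus tokens sorted by descending frequency with the three special tokens prepended. A minimal sketch of writing one from a token-count mapping (the counts, special-token strings and output path here are only illustrative):

from collections import Counter

# Illustrative counts; in practice these come from tokenising the whole corpus.
counts = Counter({"the": 120, ",": 95, ".": 90, "to": 60})

with open("vocab.txt", "w") as f:            # placeholder output path
    f.write("</S>\n<S>\n<UNK>\n")            # special tokens first
    for token, _ in counts.most_common():    # most frequent tokens first
        f.write(token + "\n")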
I have followed a few online tutorials to build a custom vocab builder as a Python class, shown below. However, the dataset I'm working with contains ~3 million clinical notes, and my current code is far too slow (it would take at least 30 days at the current speed). Is there a more efficient way to build this vocab file (e.g. a dedicated package, or multiprocessing/PySpark, ideally making use of my existing GPUs)? A rough sketch of the chunked, parallel counting I have in mind is included after my current code below.
import pickle

import pandas as pd
from csv import reader

from app.preprocessing.preprocessing import DataProcessor
from app.embeddings.constants import (ELMO_VOCAB_PATH, ELMO_VOCAB_DICT_PATH, MIMIC3_NOTEEVENTS_PATH)
class Vocabulary:
    UNK_token = 0  # Used for unknown tokens
    SOS_token = 1  # Start-of-sentence token
    EOS_token = 2  # End-of-sentence token

    def __init__(self, name):
        self.name = name
        self.word2index = {}
        self.word2count = {}
        self.index2word = {self.UNK_token: "@@UNKNOWN@@", self.SOS_token: "<S>", self.EOS_token: "</S>"}
        self.num_words = 3
        self.num_sentences = 0
        self.longest_sentence = 0
    def add_word(self, word):
        if word not in self.word2index:
            # First entry of word into vocabulary
            self.word2index[word] = self.num_words
            self.word2count[word] = 1
            self.index2word[self.num_words] = word
            self.num_words += 1
        else:
            # Word exists; increase word count
            self.word2count[word] += 1
    def add_sentence(self, sentence):
        sentence_len = len(sentence)
        # Add each token of the (already tokenised) sentence
        for word in sentence:
            self.add_word(word)
        if sentence_len > self.longest_sentence:
            # This is the longest sentence seen so far
            self.longest_sentence = sentence_len
        # Count the number of sentences
        self.num_sentences += 1

    def to_word(self, index):
        return self.index2word[index]

    def to_index(self, word):
        return self.word2index[word]
    def sort_word2count(self):
        # Sort the count dictionary by frequency, most frequent first
        self.word2count = {k: v for k, v in sorted(self.word2count.items(), key=lambda item: item[1], reverse=True)}

    def save_vocab_file(self, file_path):
        with open(file_path, 'w') as f:
            # Write the special tokens first
            f.write(str(self.index2word[self.EOS_token]) + "\n")
            f.write(str(self.index2word[self.SOS_token]) + "\n")
            f.write(str(self.index2word[self.UNK_token]) + "\n")
            # Write the words, already sorted by frequency
            for key, value in self.word2count.items():
                f.write(str(key) + "\n")

    def save_vocab_dict(self, file_path):
        with open(file_path, 'wb') as f:
            pickle.dump(self.word2count, f)
if __name__ == '__main__':
    voc = Vocabulary('elmo_vocab')
    # Initialise processor
    processor = DataProcessor()
    # Iterate through sentences
    print("Processing corpus")
    with open(MIMIC3_NOTEEVENTS_PATH, 'r') as read_obj:
        csv_reader = reader(read_obj)
        header = next(csv_reader)  # Read header
        text_index = header.index('TEXT')  # Get index of text column
        counter = 0
        if header is not None:
            for row in list(csv_reader):  # Iterate over each entry
                lines = row[text_index]  # Index into the note events text
                lines = lines.splitlines(True)  # Convert string to list, keeping delimiters
                lines = processor.processLines(lines)  # Process and tokenise
                # Add lines into the voc object
                for line in lines:
                    voc.add_sentence(line)
                if counter % 1000 == 0:
                    print(counter)
                counter += 1
    # Sort word2count
    print("Sorting vocab")
    voc.sort_word2count()
    # Save vocab into files
    print("Saving vocab to file")
    voc.save_vocab_file(ELMO_VOCAB_PATH)
    voc.save_vocab_dict(ELMO_VOCAB_DICT_PATH)
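In case it clarifies what I'm after, here is a rough, self-contained sketch of the chunked, multi-process counting I have in mind. It is only a sketch under assumptions: a plain whitespace split stands in for my DataProcessor tokenisation, and the CSV path, column name, chunk size and output path are placeholders.

from collections import Counter
from multiprocessing import Pool

import pandas as pd

CSV_PATH = "NOTEEVENTS.csv"  # placeholder path to the note events CSV
TEXT_COLUMN = "TEXT"         # placeholder name of the column holding the note text


def count_chunk(texts):
    # Tokenise and count one batch of notes; str.split stands in for DataProcessor.
    counts = Counter()
    for text in texts:
        counts.update(text.split())
    return counts


if __name__ == "__main__":
    chunks = pd.read_csv(CSV_PATH, usecols=[TEXT_COLUMN], chunksize=10_000)
    total = Counter()
    with Pool() as pool:
        # Each worker process counts one chunk; the partial Counters are merged here.
        batches = (chunk[TEXT_COLUMN].fillna("").tolist() for chunk in chunks)
        for partial in pool.imap_unordered(count_chunk, batches):
            total.update(partial)

    with open("elmo_vocab.txt", "w") as f:  # placeholder output path
        f.write("</S>\n<S>\n<UNK>\n")       # special tokens first
        for token, _ in total.most_common():
            f.write(token + "\n")

Would an approach along these lines scale to ~3 million notes, or is there a better-suited package or framework for this?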
Topic allennlp embeddings word-embeddings nlp python
Category Data Science