How to build a vocabulary file for NLP embeddings efficiently?
I am currently building various word embeddings for my NLP project, ranging from Word2Vec and ELMo to LINE.
I am looking to train ELMo with AllenNLP, a Python package for NLP, following the tutorial here. To improve efficiency during training, the tutorial recommends supplying a vocab file covering the entire corpus, similar to this; a snippet is shown below. (FYI: the first 3 tokens represent the end-of-sentence, start-of-sentence and unknown tokens; tokens that appear more frequently are placed nearer the start of the file.)
</S>
<S>
<UNK>
the
,
.
to
of
and
a
in
's
that
...
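For context, a file in that shape is just the corpus tokens sorted by descending frequency with the three special tokens prepended. A minimal sketch of writing one from a token-count mapping (the counts, special-token strings and output path here are only illustrative):

from collections import Counter

# Illustrative counts; in practice these come from tokenising the whole corpus.
counts = Counter({"the": 120, ",": 95, ".": 90, "to": 60})

with open("vocab.txt", "w") as f:            # placeholder output path
    f.write("</S>\n<S>\n<UNK>\n")            # special tokens first
    for token, _ in counts.most_common():    # most frequent tokens first
        f.write(token + "\n")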
I have followed a few online tutorials to build a custom vocab builder as a Python class, shown below. However, the dataset I'm working with contains ~3 million clinical notes, and my current code is far too slow (it would take at least 30 days at the current speed). Is there a more efficient way to build this vocab file (e.g. a dedicated package, or multiprocessing/PySpark, ideally making use of my existing GPUs)? A rough sketch of the chunked, parallel counting I have in mind is included after my current code below.
import pickle

import pandas as pd
from csv import reader

from app.preprocessing.preprocessing import DataProcessor
from app.embeddings.constants import (ELMO_VOCAB_PATH, ELMO_VOCAB_DICT_PATH, MIMIC3_NOTEEVENTS_PATH)
class Vocabulary:
    UNK_token = 0  # Used for unknown tokens
    SOS_token = 1  # Start-of-sentence token
    EOS_token = 2  # End-of-sentence token

    def __init__(self, name):
        self.name = name
        self.word2index = {}
        self.word2count = {}
        self.index2word = {self.UNK_token: "@@UNKNOWN@@", self.SOS_token: "<S>", self.EOS_token: "</S>"}
        self.num_words = 3
        self.num_sentences = 0
        self.longest_sentence = 0
    def add_word(self, word):
        if word not in self.word2index:
            # First entry of word into vocabulary
            self.word2index[word] = self.num_words
            self.word2count[word] = 1
            self.index2word[self.num_words] = word
            self.num_words += 1
        else:
            # Word exists; increase word count
            self.word2count[word] += 1
    def add_sentence(self, sentence):
        sentence_len = len(sentence)
        # Add each token of the (already tokenised) sentence
        for word in sentence:
            self.add_word(word)
        if sentence_len > self.longest_sentence:
            # This is the longest sentence seen so far
            self.longest_sentence = sentence_len
        # Count the number of sentences
        self.num_sentences += 1

    def to_word(self, index):
        return self.index2word[index]

    def to_index(self, word):
        return self.word2index[word]
    def sort_word2count(self):
        # Sort the count dictionary by frequency, most frequent first
        self.word2count = {k: v for k, v in sorted(self.word2count.items(), key=lambda item: item[1], reverse=True)}

    def save_vocab_file(self, file_path):
        with open(file_path, 'w') as f:
            # Write the special tokens first
            f.write(str(self.index2word[self.EOS_token]) + "\n")
            f.write(str(self.index2word[self.SOS_token]) + "\n")
            f.write(str(self.index2word[self.UNK_token]) + "\n")
            # Write the words, already sorted by frequency
            for key, value in self.word2count.items():
                f.write(str(key) + "\n")

    def save_vocab_dict(self, file_path):
        with open(file_path, 'wb') as f:
            pickle.dump(self.word2count, f)
if __name__ == '__main__':
    voc = Vocabulary('elmo_vocab')
    # Initialise processor
    processor = DataProcessor()
    # Iterate through sentences
    print("Processing corpus")
    with open(MIMIC3_NOTEEVENTS_PATH, 'r') as read_obj:
        csv_reader = reader(read_obj)
        header = next(csv_reader)  # Read header
        text_index = header.index('TEXT')  # Get index of text column
        counter = 0
        if header is not None:
            for row in list(csv_reader):  # Iterate over each entry
                lines = row[text_index]  # Index into the note events text
                lines = lines.splitlines(True)  # Convert string to list, keeping delimiters
                lines = processor.processLines(lines)  # Process and tokenise
                # Add lines into the voc object
                for line in lines:
                    voc.add_sentence(line)
                if counter % 1000 == 0:
                    print(counter)
                counter += 1
    # Sort word2count
    print("Sorting vocab")
    voc.sort_word2count()
    # Save vocab into files
    print("Saving vocab to file")
    voc.save_vocab_file(ELMO_VOCAB_PATH)
    voc.save_vocab_dict(ELMO_VOCAB_DICT_PATH)
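In case it clarifies what I'm after, here is a rough, self-contained sketch of the chunked, multi-process counting I have in mind. It is only a sketch under assumptions: a plain whitespace split stands in for my DataProcessor tokenisation, and the CSV path, column name, chunk size and output path are placeholders.

from collections import Counter
from multiprocessing import Pool

import pandas as pd

CSV_PATH = "NOTEEVENTS.csv"  # placeholder path to the note events CSV
TEXT_COLUMN = "TEXT"         # placeholder name of the column holding the note text


def count_chunk(texts):
    # Tokenise and count one batch of notes; str.split stands in for DataProcessor.
    counts = Counter()
    for text in texts:
        counts.update(text.split())
    return counts


if __name__ == "__main__":
    chunks = pd.read_csv(CSV_PATH, usecols=[TEXT_COLUMN], chunksize=10_000)
    total = Counter()
    with Pool() as pool:
        # Each worker process counts one chunk; the partial Counters are merged here.
        batches = (chunk[TEXT_COLUMN].fillna("").tolist() for chunk in chunks)
        for partial in pool.imap_unordered(count_chunk, batches):
            total.update(partial)

    with open("elmo_vocab.txt", "w") as f:  # placeholder output path
        f.write("</S>\n<S>\n<UNK>\n")       # special tokens first
        for token, _ in total.most_common():
            f.write(token + "\n")

Would an approach along these lines scale to ~3 million notes, or is there a better-suited package or framework for this?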
Topic allennlp embeddings word-embeddings nlp python
Category Data Science