HuggingFace Transformers: CUDA out of memory only when performing hyperparameter search
I am working with an RTX 3070, which has only 8 GB of VRAM. When I train with trainer.train(), it runs fine with a maximum batch size of 7 (6 when running in a Jupyter notebook). However, when I run a hyperparameter search with Ray Tune, I get CUDA out of memory every single time.
I am wondering why this could be the case.
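For reference, this is the kind of check I could wrap around both runs to compare memory usage (just a sketch, not part of the script below; `report_gpu_memory` is a throwaway helper name I made up):

```
import torch

def report_gpu_memory(tag: str) -> None:
    # Print how much CUDA memory PyTorch currently has allocated / reserved.
    allocated = torch.cuda.memory_allocated() / 1024**2
    reserved = torch.cuda.memory_reserved() / 1024**2
    print(f"[{tag}] allocated: {allocated:.0f} MiB, reserved: {reserved:.0f} MiB")

# e.g. report_gpu_memory("before") / report_gpu_memory("after") around
# trainer.train() and around trainer.hyperparameter_search()
```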
Here is my code. Sorry if it’s a little long. It’s based on the following Jupyter notebooks:
- https://github.com/huggingface/notebooks/blob/master/examples/text_classification.ipynb
- https://github.com/NielsRogge/Transformers-Tutorials/blob/master/TrOCR/Fine_tune_TrOCR_on_IAM_Handwriting_Database_using_Seq2SeqTrainer.ipynb
```
from typing import Dict

import editdistance
import torch
from PIL import Image
from ray import tune
from torch.utils.data import Dataset
from transformers import (
    AutoTokenizer,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
    ViTFeatureExtractor,
    VisionEncoderDecoderModel,
    default_data_collator,
)

# `args`, `train_df` and `test_df` are defined earlier in the script
# (argparse arguments and pandas DataFrames with "path" and "label" columns).


class NPJFixedDataset(Dataset):
    def __init__(
        self, root_dir, df, feature_extractor, tokenizer, max_target_length=128
    ):
        self.root_dir = root_dir
        self.df = df
        self.feature_extractor = feature_extractor
        self.tokenizer = tokenizer
        self.max_target_length = max_target_length

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        # get file name + text
        file_name = self.df["path"][idx]
        text = self.df["label"][idx]
        # prepare image (i.e. resize + normalize)
        image = Image.open(self.root_dir + file_name).convert("RGB")
        pixel_values = self.feature_extractor(image, return_tensors="pt").pixel_values
        # add labels (input_ids) by encoding the text
        labels = self.tokenizer(
            text, padding="max_length", max_length=self.max_target_length
        ).input_ids
        # important: make sure that PAD tokens are ignored by the loss function
        labels = [
            label if label != self.tokenizer.pad_token_id else -100 for label in labels
        ]
        encoding = {
            "pixel_values": pixel_values.squeeze(),
            "labels": torch.tensor(labels),
        }
        return encoding
feature_extractor = ViTFeatureExtractor.from_pretrained(args.encoder_checkpoint)
tokenizer = AutoTokenizer.from_pretrained(args.decoder_checkpoint)

train_dataset = NPJFixedDataset(
    root_dir=args.dataset_location,
    df=train_df,
    feature_extractor=feature_extractor,
    tokenizer=tokenizer,
)
eval_dataset = NPJFixedDataset(
    root_dir=args.dataset_location,
    df=test_df,
    feature_extractor=feature_extractor,
    tokenizer=tokenizer,
)
def model_init():
    model = VisionEncoderDecoderModel.from_encoder_decoder_pretrained(
        "google/vit-base-patch16-224", "Geotrend/bert-base-th-cased"
    )
    # set special tokens used for creating the decoder_input_ids from the labels
    model.config.decoder_start_token_id = tokenizer.cls_token_id
    model.config.pad_token_id = tokenizer.pad_token_id
    # make sure vocab size is set correctly
    model.config.vocab_size = model.config.decoder.vocab_size
    # set beam search parameters
    model.config.eos_token_id = tokenizer.sep_token_id
    model.config.max_length = 64
    model.config.early_stopping = True
    model.config.no_repeat_ngram_size = 3
    model.config.length_penalty = 2.0
    model.config.num_beams = 4
    return model
def compute_metrics(pred) -> Dict[str, float]:
    labels_ids = pred.label_ids
    pred_ids = pred.predictions

    pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    labels_ids[labels_ids == -100] = tokenizer.pad_token_id
    label_str = tokenizer.batch_decode(labels_ids, skip_special_tokens=True)

    # TODO: switch to the CER metric from datasets.load_metric
    # cer = cer_metric.compute(predictions=pred_str, references=label_str)
    cer = sum(editdistance.eval(a, b) for a, b in zip(pred_str, label_str)) / sum(
        len(b) for b in label_str
    )
    return {"cer": cer}
training_args = Seq2SeqTrainingArguments(
    num_train_epochs=100,
    predict_with_generate=True,
    evaluation_strategy="epoch",
    per_device_train_batch_size=3,  # 7 max for .py, 6 max for .ipynb
    per_device_eval_batch_size=3,
    fp16=True,  # set to False if turning off the GPU
    output_dir=args.logging_dir,
    save_strategy="epoch",
    save_total_limit=10,
    logging_steps=1000,
    learning_rate=1e-4,
    load_best_model_at_end=True,
    report_to="wandb",
)
# instantiate trainer
trainer = Seq2SeqTrainer(
    model_init=model_init,
    tokenizer=feature_extractor,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=default_data_collator,
)
def hp_space(trial) -> Dict[str, float]:
    # search space for the ray backend
    return {
        "learning_rate": tune.loguniform(1e-6, 1e-4),
        "num_train_epochs": tune.choice(list(range(1, 6))),
        "seed": tune.uniform(1, 40),
        "per_device_train_batch_size": 1,
    }


trainer.hyperparameter_search(
    hp_space=hp_space,
    backend="ray",
    n_trials=10,
    # search_alg=HyperOptSearch(metric="objective", mode="max"),
    # scheduler=ASHAScheduler(metric="loss", mode="min"),
    # fail_fast=True,
    max_failures=-1,
    name="testing_run_hellobro",
)
```
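One thing I am not sure about: my understanding is that extra keyword arguments passed to `hyperparameter_search` are forwarded to `ray.tune.run`, so something like the sketch below should pin each trial to a single GPU. That is an assumption on my part, and I have not confirmed it is the right place to control per-trial GPU allocation:

```
# Sketch only (assumption): extra kwargs to hyperparameter_search are forwarded
# to ray.tune.run, so resources_per_trial should cap each trial at one GPU and
# stop Ray from scheduling concurrent trials on the single 3070.
best_run = trainer.hyperparameter_search(
    hp_space=hp_space,
    backend="ray",
    n_trials=10,
    resources_per_trial={"cpu": 1, "gpu": 1},
    name="testing_run_hellobro",
)
print(best_run.hyperparameters)
```

If that is not where per-trial GPU usage is controlled, that may be exactly the piece I am missing.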
Tags: cuda, huggingface, transformer, hyperparameter-tuning
Category: Data Science