How to save a Hugging Face fine-tuned model with PyTorch and distributed training
I am fine-tuning a masked language model based on XLM-RoBERTa large on a Google Cloud machine. When I copy the saved model from the container to a GCP bucket using gsutil via subprocess, I get an error.
Versions
torch==1.11.0+cu113
torchvision==0.12.0+cu113
torchaudio==0.11.0+cu113
transformers==4.17.0
I am using a pre-trained Hugging Face model. I launch it from a train.py file, which I copy into the Docker image, and use Vertex AI (GCP) to run it with a ContainerSpec:
machineSpec = MachineSpec(machine_type="a2-highgpu-4g", accelerator_count=4, accelerator_type="NVIDIA_TESLA_A100")
python -m torch.distributed.launch --nproc_per_node 4 train.py --bf16
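Roughly, the job is submitted with the google-cloud-aiplatform SDK like this (the project, region, bucket, image URI, and display name below are placeholders, not the real values):
from google.cloud import aiplatform

# Placeholder project/region/bucket values, for illustration only.
aiplatform.init(project="my-project", location="us-central1", staging_bucket="gs://my-bucket")

worker_pool_specs = [{
    "machine_spec": {
        "machine_type": "a2-highgpu-4g",
        "accelerator_type": "NVIDIA_TESLA_A100",
        "accelerator_count": 4,
    },
    "replica_count": 1,
    "container_spec": {
        "image_uri": "gcr.io/my-project/mlm-trainer:latest",  # placeholder image
        "command": ["python", "-m", "torch.distributed.launch",
                    "--nproc_per_node", "4", "train.py", "--bf16"],
    },
}]

job = aiplatform.CustomJob(display_name="xlmr-mlm-finetune", worker_pool_specs=worker_pool_specs)
job.run()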
I am using
https://huggingface.co/xlm-roberta-large
tokenizer = tr.XLMRobertaTokenizer.from_pretrained("xlm-roberta-large", local_files_only=True)
model = tr.XLMRobertaForMaskedLM.from_pretrained("xlm-roberta-large", return_dict=True, local_files_only=True)
Training Code
training_args = tr.TrainingArguments(
output_dir='****'
,logging_dir='****' # directory for storing logs
,save_strategy="epoch"
,run_name='****'
,learning_rate=2e-5
,logging_steps=1000
,overwrite_output_dir=True
,num_train_epochs=10
,per_device_train_batch_size=4
,prediction_loss_only=True
,gradient_accumulation_steps=2
# ,gradient_checkpointing=True
,bf16=True #57100
,optim="adafactor"
)
trainer = tr.Trainer(
model=model,
args=training_args,
data_collator=data_collator,
train_dataset=train_data
)
train.py
import torch
import numpy as np
import pandas as pd
from transformers import BertTokenizer, BertForSequenceClassification
import transformers as tr
from sentence_transformers import SentenceTransformer
from transformers import XLMRobertaTokenizer, XLMRobertaForMaskedLM
from transformers import AdamW
from transformers import AutoTokenizer
from transformers import BertTokenizerFast as BertTokenizer, BertModel, AdamW, get_linear_schedule_with_warmup,BertForMaskedLM
from transformers import DataCollatorForLanguageModeling
from scipy.special import softmax
import scipy
import random
import pickle
import os
import time
import subprocess as sp
# torch.cuda.empty_cache()
start=time.time()
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print("Using", device)
torch.backends.cudnn.deterministic = True
tr.trainer_utils.set_seed(0)
print("here")
tokenizer = tr.XLMRobertaTokenizer.from_pretrained("xlm-roberta-large", local_files_only=True)
model = tr.XLMRobertaForMaskedLM.from_pretrained("xlm-roberta-large", return_dict=True, local_files_only=True)
model.gradient_checkpointing_enable() #included as new line
print("included gradient checkpoint")
model.to(device)
print("Model loaded successfully")
df = pd.read_csv("data.csv")
train_df=df.text.tolist()
print(len(train_df))
train_df=list(set(train_df))
train_df = [x for x in train_df if str(x) != 'nan']
print("Length of training data is\n", len(train_df))
print("DATA LOADED successfully")
train_encodings = tokenizer(train_df, truncation=True, padding=True, max_length=512, return_tensors="pt")
print("encoding done")
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True, mlm_probability=0.15)
print("data collector done")
class SEDataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        return item

    def __len__(self):
        return len(self.encodings["attention_mask"])

train_data = SEDataset(train_encodings)
print("train data created")
training_args = tr.TrainingArguments(
output_dir='results_mlm_exp1'
,logging_dir='logs_mlm_exp1' # directory for storing logs
,save_strategy="epoch"
,learning_rate=2e-5
,logging_steps=500
,overwrite_output_dir=True
,num_train_epochs=20
,per_device_train_batch_size=4
,prediction_loss_only=True
,gradient_accumulation_steps=2
,bf16=True #Ampere GPU
)
trainer = tr.Trainer(
model=model,
args=training_args,
data_collator=data_collator,
train_dataset=train_data
)
trainer.train()
print("model training finished")
trainer.save_model("model_mlm_exp1")
print("training finished")
end=time.time()
print("total time taken in hours is", (end - start) / 3600)
Error
trainer.save_model("model_mlm_exp1")
sp.call('gsutil cp -r /pythonPackage/trainer/model_mlm_exp1 gs://******/model_mlm_exp1', shell=True, stdout=sp.PIPE)
ERROR ResumableUploadAbortException: 409 The object has already been created in an earlier attempt and was overwritten, possibly due to a race condition.
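Since torch.distributed.launch starts four copies of train.py, I suspect every process runs the save and the gsutil copy, which would explain the race condition mentioned in the error. A minimal, untested sketch of what I am considering: guarding the save and upload so only the main process performs them (the bucket path is a placeholder):
import subprocess as sp

trainer.train()

# Only the rank-0 process writes the model and uploads it to GCS;
# the other launcher processes skip this block entirely.
if trainer.is_world_process_zero():
    trainer.save_model("model_mlm_exp1")
    sp.call(
        "gsutil cp -r /pythonPackage/trainer/model_mlm_exp1 gs://<bucket>/model_mlm_exp1",  # placeholder bucket
        shell=True,
        stdout=sp.PIPE,
    )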
Topic huggingface google-cloud pytorch python-3.x distributed
Category Data Science