cleaned_data = '../data/splits'

Training Model

Preprocessing

Define Parameters

KFOLD = 1
TOKENIZER: str = "bert-base-cased"
LEARNING_RATE: float = 5e-5
BATCH_SIZE: int = 8
EPOCHS: int = 2

Read kfold data into dataset
from datasets import load_dataset

raw_datasets = load_dataset(
    "csv",
    data_files={
        'train': [f'{cleaned_data}/train/FAA-{KFOLD}.csv'],
        'test': [f'{cleaned_data}/test/FAA-{KFOLD}.csv'],
        'val': [f'{cleaned_data}/val/FAA-{KFOLD}.csv'],
    },
)
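Each per-fold CSV is assumed to have a text column and an integer label column (the schema is not shown here); a quick sanity check on the loaded splits might look like this:

# Hypothetical check: confirm all three splits loaded and peek at one row.
print(raw_datasets)              # row counts per split
print(raw_datasets["train"][0])  # expected keys: 'text' and 'label'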
Create tokenizer

from transformers import AutoTokenizer

tokz = AutoTokenizer.from_pretrained(TOKENIZER)
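As a quick illustration (not in the original notebook), the tokenizer turns raw text into input ids plus masks:

# Hypothetical one-off call to see the tokenizer's output keys.
sample = tokz("Pilot reported smoke in the cockpit.")
print(sample.keys())  # dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])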
Tokenize inputs

def tok_func(x):
    return tokz(x["text"], padding="max_length", truncation=True)

tokenized_datasets = raw_datasets.map(tok_func, batched=True)
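After mapping, each split gains the tokenizer's output columns alongside the original ones; a quick check (a sketch, assuming the CSV columns are named text and label):

# Hypothetical inspection of the tokenized features.
print(tokenized_datasets["train"].column_names)
# e.g. ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask']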
Define datasets for training

full_train_dataset = tokenized_datasets["train"]
full_eval_dataset = tokenized_datasets["test"]
full_val_dataset = tokenized_datasets["val"]

from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokz)

Since tok_func already pads every example to max_length, the dynamic padding done by DataCollatorWithPadding is effectively a no-op here, but it is harmless to keep.

import numpy as np
import evaluate
accuracy = evaluate.load("accuracy")
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)
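To illustrate what compute_metrics receives from the Trainer (raw logits plus reference labels), here is a small made-up example, not part of the original run:

# Hypothetical logits for three samples over three classes; argmax picks 2, 0, 1.
dummy_logits = np.array([[0.1, 0.2, 0.9],
                         [1.5, 0.3, 0.2],
                         [0.2, 2.0, 0.1]])
dummy_labels = np.array([2, 0, 0])
print(compute_metrics((dummy_logits, dummy_labels)))  # {'accuracy': 0.666...}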
Train and Evaluate Model

from transformers import AutoModelForSequenceClassification
from transformers import TrainingArguments, Trainer

# TOKENIZER doubles as the model checkpoint name ("bert-base-cased").
model = AutoModelForSequenceClassification.from_pretrained(TOKENIZER, num_labels=7)

training_args = TrainingArguments(
output_dir="../output/",
learning_rate=LEARNING_RATE,
per_device_train_batch_size=BATCH_SIZE,
per_device_eval_batch_size=BATCH_SIZE,
num_train_epochs=EPOCHS,
weight_decay=0.01,
evaluation_strategy="epoch",
save_strategy="epoch",
load_best_model_at_end=True,
)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=full_train_dataset,
    eval_dataset=full_eval_dataset,
    tokenizer=tokz,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)
history = trainer.train()
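The val split is tokenized above but never used; if a final score on held-out data is wanted, the best checkpoint (restored via load_best_model_at_end) can be evaluated on it. A sketch, not part of the original run:

# Hypothetical final evaluation on the unused validation split.
val_metrics = trainer.evaluate(eval_dataset=full_val_dataset)
print(val_metrics)  # includes 'eval_accuracy' from compute_metrics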
Save model

trainer.save_model("../model/")
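To reuse the saved model later, it can be reloaded for inference. A minimal sketch with made-up input text, assuming the tokenizer files are also present at ../model/ (Trainer typically saves them alongside the model when a tokenizer is passed); the LABEL_n ids map to the 7 classes in the training CSVs:

from transformers import pipeline

# Hypothetical reload of the saved checkpoint for inference.
clf = pipeline("text-classification", model="../model/")
print(clf("Pilot reported a bird strike shortly after takeoff."))
# e.g. [{'label': 'LABEL_3', 'score': 0.91}]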