cleaned_data = '../data/splits'
Training Model
Preprocessing
Define Parameters
KFOLD = 1
TOKENIZER: str = "bert-base-cased"
LEARNING_RATE: float = 5e-5
BATCH_SIZE: int = 8
EPOCHS: int = 2
Read k-fold data into a dataset
from datasets import load_dataset

raw_datasets = load_dataset("csv", data_files={'train': [f'{cleaned_data}/train/FAA-{KFOLD}.csv'],
                                               'test': [f'{cleaned_data}/test/FAA-{KFOLD}.csv'],
                                               'val': [f'{cleaned_data}/val/FAA-{KFOLD}.csv']})
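Each FAA CSV is assumed to contain a text column and an integer label column (seven classes, to match num_labels=7 further down). A quick sanity check on the loaded splits, under that assumption:

print(raw_datasets)              # row counts and column names per split
print(raw_datasets["train"][0])  # first training example; expect 'text' and 'label' fields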
= "bert-base-cased" model_nm
Create tokenizer
from transformers import AutoTokenizer

tokz = AutoTokenizer.from_pretrained(TOKENIZER)
Tokenize inputs
def tok_func(x):
return tokz(x["text"], padding="max_length", truncation=True)
tokenized_datasets = raw_datasets.map(tok_func, batched=True)
Define datasets for training
full_train_dataset = tokenized_datasets["train"]
full_eval_dataset = tokenized_datasets["test"]
full_val_dataset = tokenized_datasets["val"]
from transformers import DataCollatorWithPadding
data_collator = DataCollatorWithPadding(tokenizer=tokz)
import numpy as np
import evaluate
accuracy = evaluate.load("accuracy")
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)
Train and Evaluate Model
from transformers import AutoModelForSequenceClassification
from transformers import TrainingArguments, Trainer
model = AutoModelForSequenceClassification.from_pretrained(TOKENIZER, num_labels=7)
training_args = TrainingArguments(
    output_dir="../output/",
    learning_rate=LEARNING_RATE,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    num_train_epochs=EPOCHS,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=full_train_dataset,
    eval_dataset=full_eval_dataset,
    tokenizer=tokz,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)
history = trainer.train()
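The val split (full_val_dataset) is prepared above but never handed to the Trainer, which evaluates on the test split each epoch. Assuming val is meant as a final held-out check, it can be scored after training like this:

val_metrics = trainer.evaluate(eval_dataset=full_val_dataset)
print(val_metrics)  # reports eval_accuracy via compute_metrics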
Save model
"../model/") trainer.save_model(