import json
import utils
import torch
import evaluate
import transformers
import numpy as np
import pandas as pd
from torch_lr_finder import LRFinder
from pathlib import Path
from tqdm.notebook import tqdm
from datasets import Dataset
from tokenizers import Tokenizer
from tokenizers.normalizers import (Sequence, Lowercase, NFD, StripAccents)
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.decoders import BPEDecoder
from transformers import (AutoConfig, DataCollatorWithPadding,
                          AutoModelForSequenceClassification, Trainer,
                          TrainingArguments, AutoTokenizer, GPT2Config)
from matplotlib import pyplot as plt
from sklearn.metrics import top_k_accuracy_score
import importlib
# Reload utils so edits to utils.py are picked up without restarting the session.
importlib.reload(utils)


def data_preparation(labeled_data):
    """Prepare data for training, validation, and testing.

    Parameters
    ----------
    labeled_data : str or Path
        Path to a pickle file holding the six train/val/test splits
        (loaded with utils.load_pkl).

    Returns
    -------
    train_df : pd.DataFrame
        Training data with 2 columns, "text" and "label".
    val_df : pd.DataFrame
        Validation data with 2 columns, "text" and "label".
    test_df : pd.DataFrame
        Testing data with 2 columns, "text" and "label".
    """
    data = utils.load_pkl(labeled_data)
    train_X, train_y, val_X, val_y, test_X, test_y = data[0], data[1], data[2], data[3], data[4], data[5]
    train_df = pd.DataFrame({"text": train_X, "label": train_y})
    val_df = pd.DataFrame({"text": val_X, "label": val_y})
    test_df = pd.DataFrame({"text": test_X, "label": test_y})
    return train_df, val_df, test_df


def tokenizer_function(examples, tokenizer):
    return tokenizer(examples["text"], padding='max_length', truncation=True)


def label2id_function(examples, label2id):
    return {"label": [label2id[label] for label in examples["label"]]}


def compute_metrics(eval_pred):
    accuracy = evaluate.load("accuracy")
    predictions, labels = eval_pred
    argmaxed = np.argmax(predictions, axis=1)
    # top5_accuracy = top_k_accuracy_score(y_true=labels, y_score=predictions, k=5)
    return {"top1": accuracy.compute(predictions=argmaxed, references=labels)["accuracy"],}
    #       "top5": top5_accuracy}


def full_finetune(labeled_data, pretrained_output_model_path, classifier_output_model_path, batch_size, epochs, learning_rate):
    """Fully fine-tune a pretrained checkpoint as a sequence classifier with the HF Trainer."""
    # Prepare data
    train_df, val_df, test_df = data_preparation(labeled_data)
    train_ds = Dataset.from_dict(train_df)
    val_ds = Dataset.from_dict(val_df)
    test_ds = Dataset.from_dict(test_df)
    # Define label map
    label2id = {label: i for i, label in enumerate(set(train_df['label']))}
    id2label = {i: label for label, i in label2id.items()}
    # Load tokenizer
    tokenizer = AutoTokenizer.from_pretrained(pretrained_output_model_path)
    tokenizer.pad_token = '<pad>'
    # Define model
    config = AutoConfig.from_pretrained(pretrained_output_model_path)
    config.num_labels = len(label2id)
    config.pad_token_id = tokenizer.pad_token_id
    model = AutoModelForSequenceClassification.from_pretrained(pretrained_output_model_path, config=config)
    tokenizer.model_max_length = config.n_positions
    # Tokenize and convert labels to ids
    train_ds = train_ds.map(tokenizer_function, batched=True, fn_kwargs={"tokenizer": tokenizer})
    val_ds = val_ds.map(tokenizer_function, batched=True, fn_kwargs={"tokenizer": tokenizer})
    train_ds = train_ds.map(label2id_function, batched=True, fn_kwargs={"label2id": label2id})
    val_ds = val_ds.map(label2id_function, batched=True, fn_kwargs={"label2id": label2id})
    # Note: tokenizer_function already pads to max_length above, so the collator's
    # dynamic ("longest") padding is effectively a no-op here.
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer, padding="longest", max_length=1024)
    # Unfreeze every layer (full fine-tune), including the classification head
    for name, param in model.named_parameters():
        param.requires_grad = True
    model.score.weight.requires_grad = True
    # Define training arguments
    training_args = TrainingArguments(
        output_dir=classifier_output_model_path,
        learning_rate=learning_rate,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        num_train_epochs=epochs,
        weight_decay=0.01,
        logging_strategy="epoch",
        evaluation_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        push_to_hub=False,
        report_to="none"
    )
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_ds,
        eval_dataset=val_ds,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )
    # Train model
    trainer.train()
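

# Hedged sketch (an assumption, not part of the original training flow): the test split
# prepared by data_preparation is never scored above. One way to evaluate it, if
# full_finetune were changed to return its trainer, tokenizer, and label2id:
def evaluate_on_test(trainer, tokenizer, label2id, test_df):
    test_ds = Dataset.from_dict(test_df)
    test_ds = test_ds.map(tokenizer_function, batched=True, fn_kwargs={"tokenizer": tokenizer})
    test_ds = test_ds.map(label2id_function, batched=True, fn_kwargs={"label2id": label2id})
    # trainer.evaluate just runs the eval loop (with compute_metrics) on the dataset it is given.
    return trainer.evaluate(eval_dataset=test_ds)
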
data = Path("/mnt/data0/BSCRC/data/9_way_encoded/").glob("*.pkl")
model_base = Path("/home/ajain/ttmp/EWLLMs/experiments/9_way_linear_probes/")
finetune_base = Path("/home/ajain/ttmp/EWLLMs/experiments/9_way_full_finetunes")
finetune_base.mkdir(exist_ok=True, parents=True)
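
# Fine-tune one classifier per encoded dataset, initializing from that dataset's
# checkpoint under model_base; the morse dataset is skipped.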
for d in data:
    if d.stem == "morse":
        continue
    print(d.stem)
    pretrained = model_base / d.stem / "checkpoint-5256"
    output = finetune_base / d.stem
    output.mkdir(exist_ok=True, parents=True)
    full_finetune(d, pretrained, output, batch_size=16, epochs=8, learning_rate=5e-5)