# End-to-end script running the Hugging Face Trainer
# for token classification. Based on the Tasks documentation
# originally from: https://hf.co/docs/transformers/tasks/token_classification
import evaluate
import numpy as np
import torch
from datasets import load_dataset
from transformers import (
    AutoModelForTokenClassification,
    AutoTokenizer,
    DataCollatorForTokenClassification,
    Trainer,
    TrainingArguments,
)

# Constants
dataset_name = "wnut_17"
model_name = "distilbert-base-uncased"
metric = "seqeval"

# AutoModel requires the label mapping
id2label = {
    0: "O",
    1: "B-corporation",
    2: "I-corporation",
    3: "B-creative-work",
    4: "I-creative-work",
    5: "B-group",
    6: "I-group",
    7: "B-location",
    8: "I-location",
    9: "B-person",
    10: "I-person",
    11: "B-product",
    12: "I-product",
}
label2id = {v: k for k, v in id2label.items()}

# Load dataset
print(f"Downloading dataset ({dataset_name})")
dataset = load_dataset(dataset_name)
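# Optional sanity check (not part of the original walkthrough): inspect one raw
# example to see the fields the tokenization step below relies on ("tokens" is
# a list of words, "ner_tags" the matching list of label ids)
print(dataset["train"][0]["tokens"])
print(dataset["train"][0]["ner_tags"])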
# Tokenize the dataset
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize_function(examples):
    """Realigns tokens and labels and limits sequence length."""
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)
    labels = []
    for i, label in enumerate(examples["ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)  # Map tokens to their respective word.
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:  # Set the special tokens to -100.
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:  # Only label the first token of a given word.
                label_ids.append(label[word_idx])
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)
    tokenized_inputs["labels"] = labels
    return tokenized_inputs

print(f"Tokenizing dataset for {model_name}...")
tokenized_dataset = dataset.map(tokenize_function, batched=True)
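# Optional illustration (not part of the original script): check how the
# realigned labels line up with the subword tokens of the first example;
# positions labeled -100 (special tokens and non-initial subwords) are
# ignored by the loss during training
example = tokenized_dataset["train"][0]
print(tokenizer.convert_ids_to_tokens(example["input_ids"]))
print(example["labels"])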
# Create an efficient collator which dynamically pads
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)
# Handle computation of our metrics
print(f"Loading metric ({metric})...")
seqeval = evaluate.load(metric)
# Get the label names from the dataset; `label_list` maps tag ids to strings
# and is used by `compute_metrics` below (the `labels` line just illustrates
# the mapping on the first training example)
tags = dataset["train"][0]["ner_tags"]
label_list = dataset["train"].features["ner_tags"].feature.names
labels = [label_list[i] for i in tags]

def compute_metrics(evaluation_preds):
    predictions, labels = evaluation_preds
    predictions = np.argmax(predictions, axis=2)
    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    results = seqeval.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

# Create a model for our problem
print(f"Instantiating model ({model_name})...")
model = AutoModelForTokenClassification.from_pretrained(
    model_name, num_labels=13, id2label=id2label, label2id=label2id
)
# Define the hyperparameters in the TrainingArguments
print("Creating training arguments (weights are stored at `results/sequence_classification`)...")
training_args = TrainingArguments(
output_dir="results/token_classification", # Where weights are stored
learning_rate=2e-5, # The learning rate during training
per_device_train_batch_size=16, # Number of samples per batch during training
per_device_eval_batch_size=16, # Number of samples per batch during evaluation
num_train_epochs=2, # How many iterations through the dataloaders should be done
weight_decay=0.01, # Regularization penalization
evaluation_strategy="epoch", # How often metrics on the evaluation dataset should be computed
save_strategy="epoch", # When to try and save the best model (such as a step number or every iteration)
)
# Create the `Trainer`, passing in the model and arguments
# the datasets to train on, how the data should be collated,
# and the method for computing our metrics
print("Creating `Trainer`...")
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)
# Initiate training
print("Training...")
trainer.train()
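# Optional follow-up (not in the original script): run a final evaluation pass
# on the held-out split and print the seqeval metrics computed by
# `compute_metrics` above
print("Evaluating...")
print(trainer.evaluate())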
# Performing inference
text = "The Golden State Warriors are an American professional basketball team based in San Francisco."
# We need to tokenize the inputs and turn them to PyTorch tensors
encoded_input = tokenizer(text, return_tensors="pt")
# Then we can perform raw torch inference:
print("Performing inference...")
model.eval()
with torch.inference_mode():
    logits = model(**encoded_input).logits
# Finally, decode our outputs
predictions = logits.argmax(dim=2)
print(f"Prediction: {[id2label[pred] for pred in predictions[0]]}")
# Can also use `model.config.id2label` instead
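# Optional illustration (not part of the original script): pair each input
# token with its predicted label so the output is easier to read; note that
# special tokens such as [CLS] and [SEP] also receive predictions here
tokens = tokenizer.convert_ids_to_tokens(encoded_input["input_ids"][0].tolist())
for token, pred in zip(tokens, predictions[0]):
    print(f"{token}\t{id2label[pred.item()]}")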