allennlp_simple.py (forked from optuna/optuna)
"""
Optuna example that optimizes a classifier configuration for IMDB movie review dataset.
This script is based on the example of allentune (https://github.com/allenai/allentune).
In this example, we optimize the validation accuracy of sentiment classification using AllenNLP.
Since it is too time-consuming to use the entire dataset, we here use a small subset of it.
We have the following two ways to execute this example:
(1) Execute this code directly.
$ python allennlp_simple.py
(2) Execute through CLI.
$ STUDY_NAME=`optuna create-study --direction maximize --storage sqlite:///example.db`
$ optuna study optimize allennlp_simple.py objective --n-trials=100 --study-name $STUDY_NAME \
--storage sqlite:///example.db
"""
import os
import shutil

import allennlp
import allennlp.data
import allennlp.models
import allennlp.modules
import allennlp.training
import torch

import optuna

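# NOTE: This example targets the pre-1.0 AllenNLP API (WordTokenizer, BasicIterator, and a
# Trainer that takes datasets directly); newer AllenNLP releases may require changes here.
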
DEVICE = -1 # If you want to use GPU, use DEVICE = 0.
MAX_DATA_SIZE = 3000
DIR = os.getcwd()
MODEL_DIR = os.path.join(DIR, "result")
GLOVE_FILE_PATH = "https://s3-us-west-2.amazonaws.com/allennlp/datasets/glove/glove.6B.50d.txt.gz"
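

# Load a small subset of the IMDB dataset and build the vocabulary from the training instances.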
def prepare_data():
    glove_indexer = allennlp.data.token_indexers.SingleIdTokenIndexer(lowercase_tokens=True)
    tokenizer = allennlp.data.tokenizers.WordTokenizer(
        word_splitter=allennlp.data.tokenizers.word_splitter.JustSpacesWordSplitter(),
    )

    reader = allennlp.data.dataset_readers.TextClassificationJsonReader(
        token_indexers={"tokens": glove_indexer}, tokenizer=tokenizer,
    )
    train_dataset = reader.read(
        "https://s3-us-west-2.amazonaws.com/allennlp/datasets/imdb/train.jsonl"
    )
    train_dataset = train_dataset[:MAX_DATA_SIZE]

    valid_dataset = reader.read(
        "https://s3-us-west-2.amazonaws.com/allennlp/datasets/imdb/dev.jsonl"
    )
    valid_dataset = valid_dataset[:MAX_DATA_SIZE]

    vocab = allennlp.data.Vocabulary.from_instances(train_dataset)
    return train_dataset, valid_dataset, vocab
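

# Build a BasicClassifier whose CNN-encoder hyperparameters are suggested by the Optuna trial.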
def create_model(vocab, trial):
    embedding = allennlp.modules.Embedding(
        embedding_dim=50,
        trainable=True,
        pretrained_file=GLOVE_FILE_PATH,
        num_embeddings=vocab.get_vocab_size("tokens"),
    )

    embedder = allennlp.modules.text_field_embedders.BasicTextFieldEmbedder({"tokens": embedding})

    output_dim = trial.suggest_int("output_dim", 16, 128)
    max_filter_size = trial.suggest_int("max_filter_size", 3, 6)
    num_filters = trial.suggest_int("num_filters", 16, 128)
    encoder = allennlp.modules.seq2vec_encoders.CnnEncoder(
        ngram_filter_sizes=range(1, max_filter_size),
        num_filters=num_filters,
        embedding_dim=50,
        output_dim=output_dim,
    )

    dropout = trial.suggest_uniform("dropout", 0, 0.5)
    model = allennlp.models.BasicClassifier(
        text_field_embedder=embedder, seq2vec_encoder=encoder, dropout=dropout, vocab=vocab,
    )
    return model
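

# Objective function: train a model with the suggested hyperparameters and return its best
# validation accuracy for Optuna to maximize.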
def objective(trial):
    train_dataset, valid_dataset, vocab = prepare_data()
    model = create_model(vocab, trial)

    if DEVICE > -1:
        model.to(torch.device("cuda:{}".format(DEVICE)))

    lr = trial.suggest_loguniform("lr", 1e-1, 1e0)
    optimizer = torch.optim.SGD(model.parameters(), lr=lr)

    iterator = allennlp.data.iterators.BasicIterator(batch_size=10)
    iterator.index_with(vocab)

    serialization_dir = os.path.join(MODEL_DIR, "trial_{}".format(trial.number))
    trainer = allennlp.training.Trainer(
        model=model,
        optimizer=optimizer,
        iterator=iterator,
        train_dataset=train_dataset,
        validation_dataset=valid_dataset,
        patience=3,
        num_epochs=6,
        cuda_device=DEVICE,
        serialization_dir=serialization_dir,
    )
    metrics = trainer.train()
    return metrics["best_validation_accuracy"]
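

# Entry point for direct execution: run a study that maximizes the objective above.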
if __name__ == "__main__":
    study = optuna.create_study(direction="maximize")
    study.optimize(objective, n_trials=80, timeout=600)

    print("Number of finished trials: ", len(study.trials))
    print("Best trial:")
    trial = study.best_trial

    print("  Value: ", trial.value)
    print("  Params: ")
    for key, value in trial.params.items():
        print("    {}: {}".format(key, value))

    # Clean up the models serialized during the trials.
    shutil.rmtree(MODEL_DIR)