More closely emulate original Event2Mind implementation. (allenai#1903)
- Use word2vec instead of glove.
- Fix bugs in vocabulary configuration.
  1. Place namespace under `min_count`.
  2. Unify source and target namespaces.
  3. Generate vocabulary with `dry-run` to correctly count tokens, i.e. without multiplicity from the combinations of intents and reactions.
- Training is now a two-step process:
```
        allennlp dry-run -o '{"dataset_reader": {"dummy_instances_for_vocab_generation": true}, "vocabulary": {"min_count": {"source_tokens": 2}}}' training_config/event2mind.json --serialization-dir vocab_output_path
        allennlp train -o '{"vocabulary": {"directory_path": "vocab_output_path/vocabulary/"}}' training_config/event2mind.json --serialization-dir output_path
```
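
After the `dry-run` step, the serialized vocabulary can be inspected before kicking off training. The snippet below is a small sanity-check sketch, not part of the commit; it assumes the `vocab_output_path` directory from the commands above and the `source_tokens` namespace from the config.
```
# Sketch: inspect the vocabulary produced by `allennlp dry-run` before training.
# Assumes the serialization directory used in the commands above.
from allennlp.data.vocabulary import Vocabulary

vocab = Vocabulary.from_files("vocab_output_path/vocabulary/")
# Source and target now share a single namespace, so there is one size to check.
print("source_tokens size:", vocab.get_vocab_size("source_tokens"))
```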
brendan-ai2 authored Oct 16, 2018
1 parent d3a8f4f commit a94a23e
Showing 5 changed files with 103 additions and 30 deletions.
37 changes: 32 additions & 5 deletions allennlp/data/dataset_readers/event2mind.py
@@ -51,22 +51,31 @@ class Event2MindDatasetReader(DatasetReader):
target_token_indexers : ``Dict[str, TokenIndexer]``, optional
Indexers used to define output (target side) token representations. Defaults to
``source_token_indexers``.
source_add_start_token : bool, (optional, default=True)
Whether or not to add `START_SYMBOL` to the beginning of the source sequence.
source_add_start_token : ``bool``, (optional, default=True)
Whether or not to add ``START_SYMBOL`` to the beginning of the source sequence.
dummy_instances_for_vocab_generation : ``bool`` (optional, default=False)
Whether to generate instances that use each token of input precisely
once. Normally we instead generate all combinations of Source, Xintent,
Xemotion and Otheremotion columns which distort the underlying token
counts. This flag should be used exclusively with the ``dry-run``
command as the instances generated will be nonsensical outside the
context of vocabulary generation.
"""
def __init__(self,
source_tokenizer: Tokenizer = None,
target_tokenizer: Tokenizer = None,
source_token_indexers: Dict[str, TokenIndexer] = None,
target_token_indexers: Dict[str, TokenIndexer] = None,
source_add_start_token: bool = True,
dummy_instances_for_vocab_generation: bool = False,
lazy: bool = False) -> None:
super().__init__(lazy)
self._source_tokenizer = source_tokenizer or WordTokenizer()
self._target_tokenizer = target_tokenizer or self._source_tokenizer
self._source_token_indexers = source_token_indexers or {"tokens": SingleIdTokenIndexer()}
self._target_token_indexers = target_token_indexers or self._source_token_indexers
self._source_add_start_token = source_add_start_token
self._dummy_instances_for_vocab_generation = dummy_instances_for_vocab_generation

@overrides
def _read(self, file_path):
@@ -84,10 +93,28 @@ def _read(self, file_path):
xintents = json.loads(line_parts[2])
xreacts = json.loads(line_parts[3])
oreacts = json.loads(line_parts[4])
-            for xintent in xintents:
-                for xreact in xreacts:
-                    for oreact in oreacts:
-                        yield self.text_to_instance(source_sequence, xintent, xreact, oreact)
+
+            # Generate all combinations.
+            if not self._dummy_instances_for_vocab_generation:
+                for xintent in xintents:
+                    for xreact in xreacts:
+                        for oreact in oreacts:
+                            yield self.text_to_instance(
+                                    source_sequence, xintent, xreact, oreact
+                            )
+            # Generate instances where each token of input appears once.
+            else:
+                # To the extent that sources are duplicated in the dataset
+                # (which appears common), we will duplicate them here.
+                yield self.text_to_instance(source_sequence, "none", "none", "none")
+                for xintent in xintents:
+                    # Since "none" is a special token we don't mind it
+                    # appearing a disproportionate number of times.
+                    yield self.text_to_instance("none", xintent, "none", "none")
+                for xreact in xreacts:
+                    yield self.text_to_instance("none", "none", xreact, "none")
+                for oreact in oreacts:
+                    yield self.text_to_instance("none", "none", "none", oreact)

@staticmethod
def _preprocess_string(tokenizer, string: str) -> str:
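
To make the effect of `dummy_instances_for_vocab_generation` concrete, here is a small, self-contained sketch (illustrative only, not the repository's code) that mirrors the two generation strategies above on one made-up row: combination mode repeats the source once per intent/reaction triple, while dummy mode uses each annotation exactly once.
```
# Illustrative sketch of the two instance-generation modes above.
# The row contents are made up; only the counting behaviour matters.
from itertools import product

source = "personx drives persony 's truck"
xintents = ["to move", "to steal"]
xreacts = ["grateful", "guilty"]
oreacts = ["charitable", "enraged"]

# Training mode: all combinations, so the source tokens are counted 2*2*2 = 8 times.
combinations = [(source, xi, xr, o) for xi, xr, o in product(xintents, xreacts, oreacts)]

# Dummy mode for `dry-run`: the source appears once and every annotation appears
# once, padded with the special "none" placeholder.
dummies = [(source, "none", "none", "none")]
dummies += [("none", xi, "none", "none") for xi in xintents]
dummies += [("none", "none", xr, "none") for xr in xreacts]
dummies += [("none", "none", "none", o) for o in oreacts]

print(len(combinations), len(dummies))  # 8 vs. 7 instances for this row
```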
59 changes: 57 additions & 2 deletions allennlp/tests/data/dataset_readers/event2mind_test.py
@@ -14,14 +14,14 @@ def get_text(key: str, instance: Instance):

class TestEvent2MindDatasetReader:
@pytest.mark.parametrize("lazy", (True, False))
def test_default_format(self, lazy):
def test_read(self, lazy):
reader = Event2MindDatasetReader(lazy=lazy)
instances = reader.read(
str(AllenNlpTestCase.FIXTURES_ROOT / 'data' / 'event2mind_small.csv')
)
instances = ensure_list(instances)

assert len(instances) == 12
assert len(instances) == 13
instance = instances[0]
assert get_text("source", instance) == ["@start@", "it", "is", "personx", "'s",
"favorite", "animal", "@end@"]
@@ -58,3 +58,58 @@ def test_default_format(self, lazy):
assert get_text("xintent", instance) == ["@start@", "helpful", "@end@"]
assert get_text("xreact", instance) == ["@start@", "useful", "@end@"]
assert get_text("oreact", instance) == ["@start@", "grateful", "@end@"]

instance = instances[12]
assert get_text("source", instance) == ["@start@", "personx", "drives",
"persony", "'s", "truck", "@end@"]
assert get_text("xintent", instance) == ["@start@", "for", "fun", "@end@"]
assert get_text("xreact", instance) == ["@start@", "happy", "@end@"]
assert get_text("oreact", instance) == ["@start@", "like", "a", "good", "friend", "@end@"]

@pytest.mark.parametrize("lazy", (True, False))
def test_read_with_dummy_instances_for_vocab_generation(self, lazy):
reader = Event2MindDatasetReader(lazy=lazy, dummy_instances_for_vocab_generation=True)
instances = reader.read(
str(AllenNlpTestCase.FIXTURES_ROOT / 'data' / 'event2mind_small.csv')
)
instances = ensure_list(instances)

assert len(instances) == 21
instance = instances[0]
assert get_text("source", instance) == ["@start@", "it", "is", "personx", "'s",
"favorite", "animal", "@end@"]
assert get_text("xintent", instance) == ["@start@", "none", "@end@"]
assert get_text("xreact", instance) == ["@start@", "none", "@end@"]
assert get_text("oreact", instance) == ["@start@", "none", "@end@"]

instance = instances[6]
assert get_text("source", instance) == ["@start@", "personx", "drives",
"persony", "'s", "truck", "@end@"]
assert get_text("xintent", instance) == ["@start@", "none", "@end@"]
assert get_text("xreact", instance) == ["@start@", "none", "@end@"]
assert get_text("oreact", instance) == ["@start@", "none", "@end@"]

instance = instances[7]
assert get_text("source", instance) == ["@start@", "none", "@end@"]
assert get_text("xintent", instance) == ["@start@", "move", "@end@"]
assert get_text("xreact", instance) == ["@start@", "none", "@end@"]
assert get_text("oreact", instance) == ["@start@", "none", "@end@"]

instance = instances[9]
assert get_text("source", instance) == ["@start@", "none", "@end@"]
assert get_text("xintent", instance) == ["@start@", "none", "@end@"]
assert get_text("xreact", instance) == ["@start@", "grateful", "@end@"]
assert get_text("oreact", instance) == ["@start@", "none", "@end@"]

instance = instances[11]
assert get_text("source", instance) == ["@start@", "none", "@end@"]
assert get_text("xintent", instance) == ["@start@", "none", "@end@"]
assert get_text("xreact", instance) == ["@start@", "none", "@end@"]
assert get_text("oreact", instance) == ["@start@", "charitable", "@end@"]

instance = instances[13]
assert get_text("source", instance) == ["@start@", "personx", "gets", "persony",
"'s", "mother", "@end@"]
assert get_text("xintent", instance) == ["@start@", "none", "@end@"]
assert get_text("xreact", instance) == ["@start@", "none", "@end@"]
assert get_text("oreact", instance) == ["@start@", "none", "@end@"]
1 change: 1 addition & 0 deletions allennlp/tests/fixtures/data/event2mind_small.csv
@@ -2,3 +2,4 @@ Source,Event,Xintent,Xemotion,Otheremotion,Xsent,Osent
it_events,It is PersonX's favorite animal,"[""none""]","[""excited to see it"", ""happy"", ""lucky""]","[""none""]",,4.0
rocstory,PersonX drives Person Y's truck,"[""to move"", ""to steal""]","[""grateful"", ""guilty""]","[""charitable"", ""enraged""]",3.0,5.0
rocstory,PersonX gets PersonY's mother,"[""to be helpful""]","[""useful""]","[""grateful""]",3.0,4.0
rocstory,PersonX drives Person Y's truck,"[""for fun""]","[""happy""]","[""like a good friend""]",3.0,5.0
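
The instance counts asserted in the updated tests (13 regular instances, 21 dummy instances) follow directly from the four rows above; the quick back-of-the-envelope check below reads the annotation counts off the CSV.
```
# Per-row annotation counts from the fixture: (#Xintent, #Xemotion, #Otheremotion).
rows = [
    (1, 3, 1),  # It is PersonX's favorite animal
    (2, 2, 2),  # PersonX drives Person Y's truck (original row)
    (1, 1, 1),  # PersonX gets PersonY's mother
    (1, 1, 1),  # PersonX drives Person Y's truck (row added here)
]
regular = sum(i * x * o for i, x, o in rows)    # all combinations per row
dummy = sum(1 + i + x + o for i, x, o in rows)  # source once, each annotation once
assert (regular, dummy) == (13, 21)
```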
10 changes: 1 addition & 9 deletions allennlp/tests/fixtures/event2mind/experiment.json
@@ -7,19 +7,11 @@
"type": "spacy"
}
},
"target_tokenizer": {
"type": "word"
},
"source_token_indexers": {
"tokens": {
"type": "single_id",
"namespace": "source_tokens"
}
},
"target_token_indexers": {
"tokens": {
"namespace": "target_tokens"
}
}
},
"vocabulary": {
@@ -48,7 +40,7 @@
"bidirectional": true
},
"max_decoding_steps": 10,
"target_namespace": "target_tokens"
"target_namespace": "source_tokens"
},
"iterator": {
"type": "bucket",
26 changes: 12 additions & 14 deletions training_config/event2mind.json
@@ -1,29 +1,26 @@
{
"dataset_reader": {
"type": "event2mind",
# Uncomment this when generating the vocabulary with `dry-run`.
#"dummy_instances_for_vocab_generation": true,
"source_tokenizer": {
"type": "word",
"word_splitter": {
"type": "spacy"
}
},
"target_tokenizer": {
"type": "word"
},
"source_token_indexers": {
"tokens": {
"type": "single_id",
"namespace": "source_tokens"
}
},
"target_token_indexers": {
"tokens": {
"namespace": "target_tokens"
}
}
},
"vocabulary": {
"min_count": {"tokens": 2}
# Uncomment this when generating the vocabulary with `dry-run`.
#"min_count": {"source_tokens": 2}
# Uncomment this when training using an existing vocabulary.
#"directory_path": "output_dir/vocabulary/"
},
"train_data_path": "https://raw.githubusercontent.com/uwnlp/event2mind/master/docs/data/train.csv",
"validation_data_path": "https://raw.githubusercontent.com/uwnlp/event2mind/master/docs/data/dev.csv",
@@ -34,8 +31,7 @@
"tokens": {
"type": "embedding",
"vocab_namespace": "source_tokens",
# TODO(brendanr): Upload the w2v embeddings and use those if permissible.
"pretrained_file": "https://s3-us-west-2.amazonaws.com/allennlp/datasets/glove/glove.6B.300d.txt.gz",
"pretrained_file": "https://s3-us-west-2.amazonaws.com/allennlp/datasets/word2vec/GoogleNews-vectors-negative300.txt.gz",
"embedding_dim": 300,
"trainable": false
}
@@ -52,7 +48,8 @@
"bidirectional": true
},
"max_decoding_steps": 10,
"target_namespace": "target_tokens"
# Following the original model we use a single namespace.
"target_namespace": "source_tokens"
},
"iterator": {
"type": "bucket",
@@ -61,11 +58,12 @@
"sorting_keys": [["source", "num_tokens"]]
},
"trainer": {
"num_epochs": 40,
"num_epochs": 10,
"patience": 10,
"cuda_device": 0,
"optimizer": {
"type": "adam"
}
},
"validation_metric": "+xintent"
}
}
