From a94a23ebfd7425ee94fdb5a4c2352e45e7b84baa Mon Sep 17 00:00:00 2001 From: Brendan Roof Date: Mon, 15 Oct 2018 18:30:10 -0700 Subject: [PATCH] More closely emulate original Event2Mind implementation. (#1903) - Use word2vec instead of glove. - Fix bugs in vocabulary configuration. 1. Place namespace under `min_count`. 2. Unify source and target namespaces. 3. Generate vocabulary with `dry-run` to correctly count tokens, i.e. without multiplicity from the combinations of intents and reactions. - Training is now a two-step process: ``` allennlp dry-run -o '{"dataset_reader": {"dummy_instances_for_vocab_generation": true}} {"vocabulary": {"min_count": {"source_tokens": 2}}}' training_config/event2mind.json --serialization-dir vocab_output_path allennlp train -o '{"vocabulary": {"directory_path": "vocab_output_path/vocabulary/"}}' training_config/event2mind.json --serialization-dir output_path ``` --- allennlp/data/dataset_readers/event2mind.py | 37 ++++++++++-- .../data/dataset_readers/event2mind_test.py | 59 ++++++++++++++++++- .../tests/fixtures/data/event2mind_small.csv | 1 + .../tests/fixtures/event2mind/experiment.json | 10 +--- training_config/event2mind.json | 26 ++++---- 5 files changed, 103 insertions(+), 30 deletions(-) diff --git a/allennlp/data/dataset_readers/event2mind.py b/allennlp/data/dataset_readers/event2mind.py index ebaf54c1bfa..d1ab7acef06 100644 --- a/allennlp/data/dataset_readers/event2mind.py +++ b/allennlp/data/dataset_readers/event2mind.py @@ -51,8 +51,15 @@ class Event2MindDatasetReader(DatasetReader): target_token_indexers : ``Dict[str, TokenIndexer]``, optional Indexers used to define output (target side) token representations. Defaults to ``source_token_indexers``. - source_add_start_token : bool, (optional, default=True) - Whether or not to add `START_SYMBOL` to the beginning of the source sequence. 
+ source_add_start_token : ``bool``, (optional, default=True) + Whether or not to add ``START_SYMBOL`` to the beginning of the source sequence. + dummy_instances_for_vocab_generation : ``bool`` (optional, default=False) + Whether to generate instances that use each token of input precisely + once. Normally we instead generate all combinations of Source, Xintent, + Xemotion and Otheremotion columns which distort the underlying token + counts. This flag should be used exclusively with the ``dry-run`` + command as the instances generated will be nonsensical outside the + context of vocabulary generation. """ def __init__(self, source_tokenizer: Tokenizer = None, @@ -60,6 +67,7 @@ def __init__(self, source_token_indexers: Dict[str, TokenIndexer] = None, target_token_indexers: Dict[str, TokenIndexer] = None, source_add_start_token: bool = True, + dummy_instances_for_vocab_generation: bool = False, lazy: bool = False) -> None: super().__init__(lazy) self._source_tokenizer = source_tokenizer or WordTokenizer() @@ -67,6 +75,7 @@ def __init__(self, self._source_token_indexers = source_token_indexers or {"tokens": SingleIdTokenIndexer()} self._target_token_indexers = target_token_indexers or self._source_token_indexers self._source_add_start_token = source_add_start_token + self._dummy_instances_for_vocab_generation = dummy_instances_for_vocab_generation @overrides def _read(self, file_path): @@ -84,10 +93,28 @@ def _read(self, file_path): xintents = json.loads(line_parts[2]) xreacts = json.loads(line_parts[3]) oreacts = json.loads(line_parts[4]) - for xintent in xintents: + + # Generate all combinations. + if not self._dummy_instances_for_vocab_generation: + for xintent in xintents: + for xreact in xreacts: + for oreact in oreacts: + yield self.text_to_instance( + source_sequence, xintent, xreact, oreact + ) + # Generate instances where each token of input appears once. 
+ else: + # To the extent that sources are duplicated in the dataset + # (which appears common), we will duplicate them here. + yield self.text_to_instance(source_sequence, "none", "none", "none") + for xintent in xintents: + # Since "none" is a special token we don't mind it + # appearing a disproportionate number of times. + yield self.text_to_instance("none", xintent, "none", "none") for xreact in xreacts: - for oreact in oreacts: - yield self.text_to_instance(source_sequence, xintent, xreact, oreact) + yield self.text_to_instance("none", "none", xreact, "none") + for oreact in oreacts: + yield self.text_to_instance("none", "none", "none", oreact) @staticmethod def _preprocess_string(tokenizer, string: str) -> str: diff --git a/allennlp/tests/data/dataset_readers/event2mind_test.py b/allennlp/tests/data/dataset_readers/event2mind_test.py index 8a39c77c173..669fa4d1861 100644 --- a/allennlp/tests/data/dataset_readers/event2mind_test.py +++ b/allennlp/tests/data/dataset_readers/event2mind_test.py @@ -14,14 +14,14 @@ def get_text(key: str, instance: Instance): class TestEvent2MindDatasetReader: @pytest.mark.parametrize("lazy", (True, False)) - def test_default_format(self, lazy): + def test_read(self, lazy): reader = Event2MindDatasetReader(lazy=lazy) instances = reader.read( str(AllenNlpTestCase.FIXTURES_ROOT / 'data' / 'event2mind_small.csv') ) instances = ensure_list(instances) - assert len(instances) == 12 + assert len(instances) == 13 instance = instances[0] assert get_text("source", instance) == ["@start@", "it", "is", "personx", "'s", "favorite", "animal", "@end@"] @@ -58,3 +58,58 @@ def test_default_format(self, lazy): assert get_text("xintent", instance) == ["@start@", "helpful", "@end@"] assert get_text("xreact", instance) == ["@start@", "useful", "@end@"] assert get_text("oreact", instance) == ["@start@", "grateful", "@end@"] + + instance = instances[12] + assert get_text("source", instance) == ["@start@", "personx", "drives", + "persony", "'s", "truck", 
"@end@"] + assert get_text("xintent", instance) == ["@start@", "for", "fun", "@end@"] + assert get_text("xreact", instance) == ["@start@", "happy", "@end@"] + assert get_text("oreact", instance) == ["@start@", "like", "a", "good", "friend", "@end@"] + + @pytest.mark.parametrize("lazy", (True, False)) + def test_read_with_dummy_instances_for_vocab_generation(self, lazy): + reader = Event2MindDatasetReader(lazy=lazy, dummy_instances_for_vocab_generation=True) + instances = reader.read( + str(AllenNlpTestCase.FIXTURES_ROOT / 'data' / 'event2mind_small.csv') + ) + instances = ensure_list(instances) + + assert len(instances) == 21 + instance = instances[0] + assert get_text("source", instance) == ["@start@", "it", "is", "personx", "'s", + "favorite", "animal", "@end@"] + assert get_text("xintent", instance) == ["@start@", "none", "@end@"] + assert get_text("xreact", instance) == ["@start@", "none", "@end@"] + assert get_text("oreact", instance) == ["@start@", "none", "@end@"] + + instance = instances[6] + assert get_text("source", instance) == ["@start@", "personx", "drives", + "persony", "'s", "truck", "@end@"] + assert get_text("xintent", instance) == ["@start@", "none", "@end@"] + assert get_text("xreact", instance) == ["@start@", "none", "@end@"] + assert get_text("oreact", instance) == ["@start@", "none", "@end@"] + + instance = instances[7] + assert get_text("source", instance) == ["@start@", "none", "@end@"] + assert get_text("xintent", instance) == ["@start@", "move", "@end@"] + assert get_text("xreact", instance) == ["@start@", "none", "@end@"] + assert get_text("oreact", instance) == ["@start@", "none", "@end@"] + + instance = instances[9] + assert get_text("source", instance) == ["@start@", "none", "@end@"] + assert get_text("xintent", instance) == ["@start@", "none", "@end@"] + assert get_text("xreact", instance) == ["@start@", "grateful", "@end@"] + assert get_text("oreact", instance) == ["@start@", "none", "@end@"] + + instance = instances[11] + assert 
get_text("source", instance) == ["@start@", "none", "@end@"] + assert get_text("xintent", instance) == ["@start@", "none", "@end@"] + assert get_text("xreact", instance) == ["@start@", "none", "@end@"] + assert get_text("oreact", instance) == ["@start@", "charitable", "@end@"] + + instance = instances[13] + assert get_text("source", instance) == ["@start@", "personx", "gets", "persony", + "'s", "mother", "@end@"] + assert get_text("xintent", instance) == ["@start@", "none", "@end@"] + assert get_text("xreact", instance) == ["@start@", "none", "@end@"] + assert get_text("oreact", instance) == ["@start@", "none", "@end@"] diff --git a/allennlp/tests/fixtures/data/event2mind_small.csv b/allennlp/tests/fixtures/data/event2mind_small.csv index 5a466d19f76..a14720e8377 100644 --- a/allennlp/tests/fixtures/data/event2mind_small.csv +++ b/allennlp/tests/fixtures/data/event2mind_small.csv @@ -2,3 +2,4 @@ Source,Event,Xintent,Xemotion,Otheremotion,Xsent,Osent it_events,It is PersonX's favorite animal,"[""none""]","[""excited to see it"", ""happy"", ""lucky""]","[""none""]",,4.0 rocstory,PersonX drives Person Y's truck,"[""to move"", ""to steal""]","[""grateful"", ""guilty""]","[""charitable"", ""enraged""]",3.0,5.0 rocstory,PersonX gets PersonY's mother,"[""to be helpful""]","[""useful""]","[""grateful""]",3.0,4.0 +rocstory,PersonX drives Person Y's truck,"[""for fun""]","[""happy""]","[""like a good friend""]",3.0,5.0 diff --git a/allennlp/tests/fixtures/event2mind/experiment.json b/allennlp/tests/fixtures/event2mind/experiment.json index 844fa73c6e2..1474e258ae8 100644 --- a/allennlp/tests/fixtures/event2mind/experiment.json +++ b/allennlp/tests/fixtures/event2mind/experiment.json @@ -7,19 +7,11 @@ "type": "spacy" } }, - "target_tokenizer": { - "type": "word" - }, "source_token_indexers": { "tokens": { "type": "single_id", "namespace": "source_tokens" } - }, - "target_token_indexers": { - "tokens": { - "namespace": "target_tokens" - } } }, "vocabulary": { @@ -48,7 +40,7 @@ 
"bidirectional": true }, "max_decoding_steps": 10, - "target_namespace": "target_tokens" + "target_namespace": "source_tokens" }, "iterator": { "type": "bucket", diff --git a/training_config/event2mind.json b/training_config/event2mind.json index 29e74c082dd..6d85b808b22 100644 --- a/training_config/event2mind.json +++ b/training_config/event2mind.json @@ -1,29 +1,26 @@ { "dataset_reader": { "type": "event2mind", + # Uncomment this when generating the vocabulary with `dry-run`. + #"dummy_instances_for_vocab_generation": true, "source_tokenizer": { "type": "word", "word_splitter": { "type": "spacy" } }, - "target_tokenizer": { "type": "word" }, "source_token_indexers": { "tokens": { "type": "single_id", "namespace": "source_tokens" } - }, - "target_token_indexers": { - "tokens": { - "namespace": "target_tokens" - } } }, "vocabulary": { - "min_count": {"tokens": 2} + # Uncomment this when generating the vocabulary with `dry-run`. + #"min_count": {"source_tokens": 2} + # Uncomment this when training using an existing vocabulary. + #"directory_path": "output_dir/vocabulary/" }, "train_data_path": "https://raw.githubusercontent.com/uwnlp/event2mind/master/docs/data/train.csv", "validation_data_path": "https://raw.githubusercontent.com/uwnlp/event2mind/master/docs/data/dev.csv", @@ -34,8 +31,7 @@ "tokens": { "type": "embedding", "vocab_namespace": "source_tokens", - # TODO(brendanr): Upload the w2v embeddings and use those if permissible. - "pretrained_file": "https://s3-us-west-2.amazonaws.com/allennlp/datasets/glove/glove.6B.300d.txt.gz", + "pretrained_file": "https://s3-us-west-2.amazonaws.com/allennlp/datasets/word2vec/GoogleNews-vectors-negative300.txt.gz", "embedding_dim": 300, "trainable": false } @@ -52,7 +48,8 @@ "bidirectional": true }, "max_decoding_steps": 10, - "target_namespace": "target_tokens" + # Following the original model we use a single namespace. 
+ "target_namespace": "source_tokens" }, "iterator": { "type": "bucket", @@ -61,11 +58,12 @@ "sorting_keys": [["source", "num_tokens"]] }, "trainer": { - "num_epochs": 40, + "num_epochs": 10, "patience": 10, "cuda_device": 0, "optimizer": { "type": "adam" - } + }, + "validation_metric": "+xintent" } }