More closely emulate original Event2Mind implementation. (allenai#1903)
- Use word2vec instead of glove.
- Fix bugs in vocabulary configuration.
  1. Place namespace under `min_count`.
  2. Unify source and target namespaces.
  3. Generate vocabulary with `dry-run` to correctly count tokens, i.e. without multiplicity from the combinations of intents and reactions.
- Training is now a two-step process:
```
        allennlp dry-run -o '{"dataset_reader": {"dummy_instances_for_vocab_generation": true}, "vocabulary": {"min_count": {"source_tokens": 2}}}' training_config/event2mind.json --serialization-dir vocab_output_path
        allennlp train -o '{"vocabulary": {"directory_path": "vocab_output_path/vocabulary/"}}' training_config/event2mind.json --serialization-dir output_path
```
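
After the `dry-run` step, the serialized vocabulary can be inspected before kicking off training. The snippet below is a small sanity-check sketch, not part of the commit; it assumes the `vocab_output_path` directory from the commands above and the `source_tokens` namespace from the config.
```
# Sketch: inspect the vocabulary produced by `allennlp dry-run` before training.
# Assumes the serialization directory used in the commands above.
from allennlp.data.vocabulary import Vocabulary

vocab = Vocabulary.from_files("vocab_output_path/vocabulary/")
# Source and target now share a single namespace, so there is one size to check.
print("source_tokens size:", vocab.get_vocab_size("source_tokens"))
```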
brendan-ai2 authored Oct 16, 2018
1 parent d3a8f4f commit a94a23e
Showing 5 changed files with 103 additions and 30 deletions.
37 changes: 32 additions & 5 deletions allennlp/data/dataset_readers/event2mind.py
@@ -51,22 +51,31 @@ class Event2MindDatasetReader(DatasetReader):
target_token_indexers : ``Dict[str, TokenIndexer]``, optional
Indexers used to define output (target side) token representations. Defaults to
``source_token_indexers``.
source_add_start_token : bool, (optional, default=True)
Whether or not to add `START_SYMBOL` to the beginning of the source sequence.
source_add_start_token : ``bool``, (optional, default=True)
Whether or not to add ``START_SYMBOL`` to the beginning of the source sequence.
dummy_instances_for_vocab_generation : ``bool`` (optional, default=False)
Whether to generate instances that use each token of input precisely
once. Normally we instead generate all combinations of Source, Xintent,
Xemotion and Otheremotion columns which distort the underlying token
counts. This flag should be used exclusively with the ``dry-run``
command as the instances generated will be nonsensical outside the
context of vocabulary generation.
"""
def __init__(self,
source_tokenizer: Tokenizer = None,
target_tokenizer: Tokenizer = None,
source_token_indexers: Dict[str, TokenIndexer] = None,
target_token_indexers: Dict[str, TokenIndexer] = None,
source_add_start_token: bool = True,
dummy_instances_for_vocab_generation: bool = False,
lazy: bool = False) -> None:
super().__init__(lazy)
self._source_tokenizer = source_tokenizer or WordTokenizer()
self._target_tokenizer = target_tokenizer or self._source_tokenizer
self._source_token_indexers = source_token_indexers or {"tokens": SingleIdTokenIndexer()}
self._target_token_indexers = target_token_indexers or self._source_token_indexers
self._source_add_start_token = source_add_start_token
self._dummy_instances_for_vocab_generation = dummy_instances_for_vocab_generation

@overrides
def _read(self, file_path):
@@ -84,10 +93,28 @@ def _read(self, file_path):
xintents = json.loads(line_parts[2])
xreacts = json.loads(line_parts[3])
oreacts = json.loads(line_parts[4])
-            for xintent in xintents:
-                for xreact in xreacts:
-                    for oreact in oreacts:
-                        yield self.text_to_instance(source_sequence, xintent, xreact, oreact)
+
+            # Generate all combinations.
+            if not self._dummy_instances_for_vocab_generation:
+                for xintent in xintents:
+                    for xreact in xreacts:
+                        for oreact in oreacts:
+                            yield self.text_to_instance(
+                                    source_sequence, xintent, xreact, oreact
+                            )
+            # Generate instances where each token of input appears once.
+            else:
+                # To the extent that sources are duplicated in the dataset
+                # (which appears common), we will duplicate them here.
+                yield self.text_to_instance(source_sequence, "none", "none", "none")
+                for xintent in xintents:
+                    # Since "none" is a special token we don't mind it
+                    # appearing a disproportionate number of times.
+                    yield self.text_to_instance("none", xintent, "none", "none")
+                for xreact in xreacts:
+                    yield self.text_to_instance("none", "none", xreact, "none")
+                for oreact in oreacts:
+                    yield self.text_to_instance("none", "none", "none", oreact)

@staticmethod
def _preprocess_string(tokenizer, string: str) -> str:
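
To make the effect of `dummy_instances_for_vocab_generation` concrete, here is a small, self-contained sketch (illustrative only, not the repository's code) that mirrors the two generation strategies above on one made-up row: combination mode repeats the source once per intent/reaction triple, while dummy mode uses each annotation exactly once.
```
# Illustrative sketch of the two instance-generation modes above.
# The row contents are made up; only the counting behaviour matters.
from itertools import product

source = "personx drives persony 's truck"
xintents = ["to move", "to steal"]
xreacts = ["grateful", "guilty"]
oreacts = ["charitable", "enraged"]

# Training mode: all combinations, so the source tokens are counted 2*2*2 = 8 times.
combinations = [(source, xi, xr, o) for xi, xr, o in product(xintents, xreacts, oreacts)]

# Dummy mode for `dry-run`: the source appears once and every annotation appears
# once, padded with the special "none" placeholder.
dummies = [(source, "none", "none", "none")]
dummies += [("none", xi, "none", "none") for xi in xintents]
dummies += [("none", "none", xr, "none") for xr in xreacts]
dummies += [("none", "none", "none", o) for o in oreacts]

print(len(combinations), len(dummies))  # 8 vs. 7 instances for this row
```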
59 changes: 57 additions & 2 deletions allennlp/tests/data/dataset_readers/event2mind_test.py
@@ -14,14 +14,14 @@ def get_text(key: str, instance: Instance):

class TestEvent2MindDatasetReader:
@pytest.mark.parametrize("lazy", (True, False))
def test_default_format(self, lazy):
def test_read(self, lazy):
reader = Event2MindDatasetReader(lazy=lazy)
instances = reader.read(
str(AllenNlpTestCase.FIXTURES_ROOT / 'data' / 'event2mind_small.csv')
)
instances = ensure_list(instances)

assert len(instances) == 12
assert len(instances) == 13
instance = instances[0]
assert get_text("source", instance) == ["@start@", "it", "is", "personx", "'s",
"favorite", "animal", "@end@"]
@@ -58,3 +58,58 @@ def test_default_format(self, lazy):
assert get_text("xintent", instance) == ["@start@", "helpful", "@end@"]
assert get_text("xreact", instance) == ["@start@", "useful", "@end@"]
assert get_text("oreact", instance) == ["@start@", "grateful", "@end@"]

instance = instances[12]
assert get_text("source", instance) == ["@start@", "personx", "drives",
"persony", "'s", "truck", "@end@"]
assert get_text("xintent", instance) == ["@start@", "for", "fun", "@end@"]
assert get_text("xreact", instance) == ["@start@", "happy", "@end@"]
assert get_text("oreact", instance) == ["@start@", "like", "a", "good", "friend", "@end@"]

@pytest.mark.parametrize("lazy", (True, False))
def test_read_with_dummy_instances_for_vocab_generation(self, lazy):
reader = Event2MindDatasetReader(lazy=lazy, dummy_instances_for_vocab_generation=True)
instances = reader.read(
str(AllenNlpTestCase.FIXTURES_ROOT / 'data' / 'event2mind_small.csv')
)
instances = ensure_list(instances)

assert len(instances) == 21
instance = instances[0]
assert get_text("source", instance) == ["@start@", "it", "is", "personx", "'s",
"favorite", "animal", "@end@"]
assert get_text("xintent", instance) == ["@start@", "none", "@end@"]
assert get_text("xreact", instance) == ["@start@", "none", "@end@"]
assert get_text("oreact", instance) == ["@start@", "none", "@end@"]

instance = instances[6]
assert get_text("source", instance) == ["@start@", "personx", "drives",
"persony", "'s", "truck", "@end@"]
assert get_text("xintent", instance) == ["@start@", "none", "@end@"]
assert get_text("xreact", instance) == ["@start@", "none", "@end@"]
assert get_text("oreact", instance) == ["@start@", "none", "@end@"]

instance = instances[7]
assert get_text("source", instance) == ["@start@", "none", "@end@"]
assert get_text("xintent", instance) == ["@start@", "move", "@end@"]
assert get_text("xreact", instance) == ["@start@", "none", "@end@"]
assert get_text("oreact", instance) == ["@start@", "none", "@end@"]

instance = instances[9]
assert get_text("source", instance) == ["@start@", "none", "@end@"]
assert get_text("xintent", instance) == ["@start@", "none", "@end@"]
assert get_text("xreact", instance) == ["@start@", "grateful", "@end@"]
assert get_text("oreact", instance) == ["@start@", "none", "@end@"]

instance = instances[11]
assert get_text("source", instance) == ["@start@", "none", "@end@"]
assert get_text("xintent", instance) == ["@start@", "none", "@end@"]
assert get_text("xreact", instance) == ["@start@", "none", "@end@"]
assert get_text("oreact", instance) == ["@start@", "charitable", "@end@"]

instance = instances[13]
assert get_text("source", instance) == ["@start@", "personx", "gets", "persony",
"'s", "mother", "@end@"]
assert get_text("xintent", instance) == ["@start@", "none", "@end@"]
assert get_text("xreact", instance) == ["@start@", "none", "@end@"]
assert get_text("oreact", instance) == ["@start@", "none", "@end@"]
1 change: 1 addition & 0 deletions allennlp/tests/fixtures/data/event2mind_small.csv
@@ -2,3 +2,4 @@ Source,Event,Xintent,Xemotion,Otheremotion,Xsent,Osent
it_events,It is PersonX's favorite animal,"[""none""]","[""excited to see it"", ""happy"", ""lucky""]","[""none""]",,4.0
rocstory,PersonX drives Person Y's truck,"[""to move"", ""to steal""]","[""grateful"", ""guilty""]","[""charitable"", ""enraged""]",3.0,5.0
rocstory,PersonX gets PersonY's mother,"[""to be helpful""]","[""useful""]","[""grateful""]",3.0,4.0
rocstory,PersonX drives Person Y's truck,"[""for fun""]","[""happy""]","[""like a good friend""]",3.0,5.0
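
The instance counts asserted in the updated tests (13 regular instances, 21 dummy instances) follow directly from the four rows above; the quick back-of-the-envelope check below reads the annotation counts off the CSV.
```
# Per-row annotation counts from the fixture: (#Xintent, #Xemotion, #Otheremotion).
rows = [
    (1, 3, 1),  # It is PersonX's favorite animal
    (2, 2, 2),  # PersonX drives Person Y's truck (original row)
    (1, 1, 1),  # PersonX gets PersonY's mother
    (1, 1, 1),  # PersonX drives Person Y's truck (row added here)
]
regular = sum(i * x * o for i, x, o in rows)    # all combinations per row
dummy = sum(1 + i + x + o for i, x, o in rows)  # source once, each annotation once
assert (regular, dummy) == (13, 21)
```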
10 changes: 1 addition & 9 deletions allennlp/tests/fixtures/event2mind/experiment.json
@@ -7,19 +7,11 @@
"type": "spacy"
}
},
"target_tokenizer": {
"type": "word"
},
"source_token_indexers": {
"tokens": {
"type": "single_id",
"namespace": "source_tokens"
}
},
"target_token_indexers": {
"tokens": {
"namespace": "target_tokens"
}
}
},
"vocabulary": {
@@ -48,7 +40,7 @@
"bidirectional": true
},
"max_decoding_steps": 10,
"target_namespace": "target_tokens"
"target_namespace": "source_tokens"
},
"iterator": {
"type": "bucket",
26 changes: 12 additions & 14 deletions training_config/event2mind.json
@@ -1,29 +1,26 @@
{
"dataset_reader": {
"type": "event2mind",
# Uncomment this when generating the vocabulary with `dry-run`.
#"dummy_instances_for_vocab_generation": true,
"source_tokenizer": {
"type": "word",
"word_splitter": {
"type": "spacy"
}
},
"target_tokenizer": {
"type": "word"
},
"source_token_indexers": {
"tokens": {
"type": "single_id",
"namespace": "source_tokens"
}
},
"target_token_indexers": {
"tokens": {
"namespace": "target_tokens"
}
}
},
"vocabulary": {
"min_count": {"tokens": 2}
# Uncomment this when generating the vocabulary with `dry-run`.
#"min_count": {"source_tokens": 2}
# Uncomment this when training using an existing vocabulary.
#"directory_path": "output_dir/vocabulary/"
},
"train_data_path": "https://raw.githubusercontent.com/uwnlp/event2mind/master/docs/data/train.csv",
"validation_data_path": "https://raw.githubusercontent.com/uwnlp/event2mind/master/docs/data/dev.csv",
@@ -34,8 +31,7 @@
"tokens": {
"type": "embedding",
"vocab_namespace": "source_tokens",
# TODO(brendanr): Upload the w2v embeddings and use those if permissible.
"pretrained_file": "https://s3-us-west-2.amazonaws.com/allennlp/datasets/glove/glove.6B.300d.txt.gz",
"pretrained_file": "https://s3-us-west-2.amazonaws.com/allennlp/datasets/word2vec/GoogleNews-vectors-negative300.txt.gz",
"embedding_dim": 300,
"trainable": false
}
@@ -52,7 +48,8 @@
"bidirectional": true
},
"max_decoding_steps": 10,
"target_namespace": "target_tokens"
# Following the original model we use a single namespace.
"target_namespace": "source_tokens"
},
"iterator": {
"type": "bucket",
@@ -61,11 +58,12 @@
"sorting_keys": [["source", "num_tokens"]]
},
"trainer": {
"num_epochs": 40,
"num_epochs": 10,
"patience": 10,
"cuda_device": 0,
"optimizer": {
"type": "adam"
}
},
"validation_metric": "+xintent"
}
}
