Rework training action to take input more closely in line with existing config format.

Reference: https://github.com/mozilla/firefox-translations-training/blob/main/configs/config.test.yml

I've been meaning to do this for a while, and I had to change this anyway, because merge-corpus tasks don't have a single `dataset` that we can generate a name from. This ends up being a fairly significant rework that does a few things:
* Use the same format for `datasets` as the existing pipeline configs
* Use attributes to filter tasks in `train-target-tasks`, rather than the more brittle method of matching task names.

The former necessitated amending the pipeline steps with a `dataset-category` attribute, since the pipeline configures multiple lists of datasets, and each part of the pipeline uses a different one.

There's still more to do to get this schema in line with the config, but this is a significant step forward, and it unblocks the current work.
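
For reference, the new `datasets` input mirrors the pipeline config format: a mapping from dataset category to a list of `<provider>_<dataset>` strings. A minimal sketch (dataset names are illustrative, not an exhaustive list):

```python
# Hypothetical `datasets` input in the new format; each value is a
# "<provider>_<dataset>" string, matching the existing pipeline configs.
datasets = {
    "train": ["flores_dev", "sacrebleu_wmt19"],
    "devtest": ["flores_dev"],
    "test": ["sacrebleu_wmt19"],
    "mono-src": ["news-crawl_news.2020"],
    "mono-trg": [],  # empty: skip the back-translation augmentation step
}
```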
bhearsum committed May 16, 2023
1 parent e407285 commit 253e8ae
Showing 6 changed files with 126 additions and 24 deletions.
1 change: 1 addition & 0 deletions taskcluster/ci/bicleaner/kind.yml
@@ -20,6 +20,7 @@ kind-dependencies:

task-defaults:
attributes:
dataset-category: train
cache:
resources:
- pipeline/bicleaner/bicleaner.sh
1 change: 1 addition & 0 deletions taskcluster/ci/clean/kind.yml
@@ -19,6 +19,7 @@ task-defaults:
description: Clean {provider} {dataset} dataset {src_locale}-{trg_locale}
attributes:
cleaning-type: clean
dataset-category: train
cache:
type: clean
resources:
1 change: 1 addition & 0 deletions taskcluster/ci/dataset/kind.yml
@@ -18,6 +18,7 @@ transforms:
task-defaults:
worker-type: b-linux
attributes:
dataset-category: train
cache:
type: dataset
dataset-config:
90 changes: 70 additions & 20 deletions taskcluster/translations_taskgraph/actions/train.py
@@ -16,6 +16,11 @@ def can_train(parameters):
return parameters["head_repository"] in TRAIN_ON_PROJECTS


# Stages that only have locales in their task names (not providers/datasets).
# Typically these are stages that "fan in" and consume a number of upstream
# tasks that are per-dataset.
LOCALE_ONLY_STAGES = ["merge-corpus"]

@register_callback_action(
name="train",
title="Train",
@@ -37,16 +42,69 @@ def can_train(parameters):
(any stages this choice depends on will be automatically included).""",
"default": "",
# TODO: this should probably be specified in ci/config.yml
"enum": ["clean", "bicleaner", "bicleaner-ai"],
"enum": ["clean", "bicleaner", "bicleaner-ai", "merge-corpus"],
},
"datasets": {
"type": "array",
"type": "object",
"description": "The datasets to train with",
"default": [],
"items": {
"type": "string",
# TODO: pull this from ci/config.yml
"enum": ["flores-dev"],
"default": {},
"properties": {
"train": {
"type": "array",
"description": "Parallel training corpus",
"default": [],
"items": {
"type": "string",
# TODO
# "enum": []
},
},
"devtest": {
"type": "array",
"description": "datasets to merge for validation while training",
"default": [],
"items": {
"type": "string",
# TODO
# "enum": []
},
},
"test": {
"type": "array",
"description": "datasets for evaluation",
"default": [],
"items": {
"type": "string",
# TODO
# "enum": []
},
},
"mono-src": {
"type": "array",
"description": """
monolingual datasets (ex. paracrawl-mono_paracrawl8, commoncrawl_wmt16, news-crawl_news.2020)
to be translated by the teacher model
""",
"default": [],
"items": {
"type": "string",
# TODO
# "enum": []
},
},
"mono-trg": {
"type": "array",
"description": """
monolingual datasets to be translated by the backward model to augment the teacher corpus with back-translations;
leave empty to skip the augmentation step (high-resource languages)
""",
"default": [],
"items": {
"type": "string",
# TODO
# "enum": []
},
},
},
},
# TODO: should these be replaced with a single pair?
@@ -77,12 +135,6 @@ def can_train(parameters):
},
)
def train_action(parameters, graph_config, input, task_group_id, task_id):
stage = input["stage"]
target_datasets = input["datasets"]
src_locale = input.get("src_locale")
trg_locale = input.get("trg_locale")
graph_config["datasets"]
locale_str = f"{src_locale}-{trg_locale}"

# TODO: Add a whack load of verification here. Things such as:
# - datasets all exist
@@ -93,15 +145,13 @@ def train_action(parameters, graph_config, input, task_group_id, task_id):
parameters = dict(parameters)

parameters["target_tasks_method"] = "train-target-tasks"

# When doing staging releases, we still want to re-use tasks from previous
# graphs.
parameters["optimize_target_tasks"] = True
parameters["tasks_for"] = "action"

# make parameters read-only
parameters["target_task_names"] = [f"{stage}-{d}-{locale_str}" for d in target_datasets]
parameters["stage"] = input["stage"]
parameters["datasets"] = input["datasets"]
parameters["src_locale"] = input["src_locale"]
parameters["trg_locale"] = input["trg_locale"]
parameters["bicleaner_threshold"] = input["bicleaner_threshold"]

parameters = Parameters(**parameters)
taskgraph_decision({"root": graph_config.root_dir}, parameters=parameters)
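
Putting the pieces together, a complete action input accepted by this schema might look like the following sketch (locales and dataset names are placeholders):

```python
# Illustrative action input; train_action copies each of these keys
# verbatim into the parameters passed to taskgraph_decision.
input = {
    "stage": "merge-corpus",
    "src_locale": "en",
    "trg_locale": "ru",
    "bicleaner_threshold": "0.0",
    "datasets": {
        "train": ["flores_dev", "sacrebleu_wmt19"],
        "devtest": ["flores_dev"],
        "test": ["sacrebleu_wmt19"],
        "mono-src": [],
        "mono-trg": [],
    },
}
```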
28 changes: 28 additions & 0 deletions taskcluster/translations_taskgraph/parameters.py
@@ -8,11 +8,39 @@
def get_defaults(repo_root):
return {
"bicleaner_threshold": "0.0",
# These will never be used in practice, but specifying them ensures
# that we always generate at least one task for each kind, which helps
# to avoid bustage that doesn't show up until we run the training action.
"datasets": {
"train": [
"flores_dev",
"sacrebleu_wmt19",
],
"devtest": [
"flores_dev",
"sacrebleu_wmt19",
],
"test": [
"flores_dev",
"sacrebleu_wmt19",
],
"mono-src": [
"flores_dev",
"sacrebleu_wmt19",
],
"mono-trg": [
"flores_dev",
"sacrebleu_wmt19",
],
},
}

extend_parameters_schema(
{
Optional("bicleaner_threshold"): str,
Optional("datasets"): {
str: [str],
},
},
defaults_fn=get_defaults,
)
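
For context, `extend_parameters_schema` takes a voluptuous schema fragment, so the `datasets` parameter accepts any mapping of category name to a list of dataset strings. A minimal sketch of what that fragment validates:

```python
# Minimal sketch of the schema fragment above (voluptuous, which
# taskgraph uses for parameter validation).
from voluptuous import Invalid, Optional, Schema

schema = Schema({Optional("datasets"): {str: [str]}})
schema({"datasets": {"train": ["flores_dev"], "mono-src": []}})  # passes
try:
    schema({"datasets": {"train": "flores_dev"}})  # not a list -> rejected
except Invalid:
    pass
```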
29 changes: 25 additions & 4 deletions taskcluster/translations_taskgraph/target_tasks.py
@@ -3,8 +3,29 @@

@_target_task("train-target-tasks")
def train_target_tasks(full_task_graph, parameters, graph_config):
    def filter(label):
        if label in parameters["target_task_names"]:
            return True

    return [label for label in full_task_graph.tasks.keys() if filter(label)]

stage = parameters["stage"]
datasets = parameters["datasets"]
src_locale = parameters["src_locale"]
trg_locale = parameters["trg_locale"]
def filter(task):
# These attributes will be present on tasks from all stages
for attr in ("stage", "src_locale", "trg_locale"):
if task.attributes.get(attr) != parameters[attr]:
return False

        # Datasets are only applicable to dataset-specific tasks. If this
        # attribute isn't present on the task, it can be assumed to be included
        # as long as the above attributes matched, as it will be a task that is
        # either agnostic of datasets or folds in datasets from earlier tasks.
        # (Pulling in the appropriate datasets for these tasks must be handled
        # at the task generation level, usually by the `find_upstreams` transform.)
if "dataset" in task.attributes:
dataset_category = task.attributes["dataset-category"]
for ds in parameters["datasets"][dataset_category]:
provider, dataset = ds.split("_", 1)
if task.attributes["provider"] != provider or task.attributes["dataset"] != dataset:
return False

return True

return [label for label, task in full_task_graph.tasks.items() if filter(task)]
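
As a sanity check on the matching above, a standalone sketch (SimpleNamespace stands in for a real Task object; the attribute values are made up):

```python
# Hypothetical task carrying the attributes the filter inspects.
from types import SimpleNamespace

task = SimpleNamespace(attributes={
    "stage": "clean",
    "src_locale": "en",
    "trg_locale": "ru",
    "dataset-category": "train",
    "provider": "news-crawl",
    "dataset": "news.2020",
})

# split("_", 1) splits on the first underscore only, so dataset names
# may themselves contain underscores.
provider, dataset = "news-crawl_news.2020".split("_", 1)
assert (provider, dataset) == ("news-crawl", "news.2020")
assert task.attributes["provider"] == provider and task.attributes["dataset"] == dataset
```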
