Rework training action to take input more closely in line with existing config format.

Reference: https://github.com/mozilla/firefox-translations-training/blob/main/configs/config.test.yml

I've been meaning to do this for a while, and I had to change this anyway, because merge-corpus tasks don't have a single `dataset` that we can generate a name from. This ends up being a fairly significant rework that does a few things:
* Use the same format for `datasets` as the existing pipeline configs
* Use attributes to filter tasks in `train-target-tasks`, rather than the more brittle method of matching task names.

The former necessitated amending the pipeline steps with a `dataset-category` attribute, since the pipeline configures multiple lists of datasets, and each part of the pipeline uses a different one.

There's still more to do to get this schema in line with the config, but this is a significant step forward, and it unblocks the current work.
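
For reference, the new `datasets` input mirrors the pipeline config format: a mapping from dataset category to a list of `<provider>_<dataset>` strings. A minimal sketch (dataset names are illustrative, not an exhaustive list):

```python
# Hypothetical `datasets` input in the new format; each value is a
# "<provider>_<dataset>" string, matching the existing pipeline configs.
datasets = {
    "train": ["flores_dev", "sacrebleu_wmt19"],
    "devtest": ["flores_dev"],
    "test": ["sacrebleu_wmt19"],
    "mono-src": ["news-crawl_news.2020"],
    "mono-trg": [],  # empty: skip the back-translation augmentation step
}
```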
bhearsum committed May 16, 2023
1 parent e407285 commit 253e8ae
Showing 6 changed files with 126 additions and 24 deletions.
1 change: 1 addition & 0 deletions taskcluster/ci/bicleaner/kind.yml
@@ -20,6 +20,7 @@ kind-dependencies:

task-defaults:
attributes:
dataset-category: train
cache:
resources:
- pipeline/bicleaner/bicleaner.sh
1 change: 1 addition & 0 deletions taskcluster/ci/clean/kind.yml
@@ -19,6 +19,7 @@ task-defaults:
description: Clean {provider} {dataset} dataset {src_locale}-{trg_locale}
attributes:
cleaning-type: clean
dataset-category: train
cache:
type: clean
resources:
1 change: 1 addition & 0 deletions taskcluster/ci/dataset/kind.yml
@@ -18,6 +18,7 @@ transforms:
task-defaults:
worker-type: b-linux
attributes:
dataset-category: train
cache:
type: dataset
dataset-config:
90 changes: 70 additions & 20 deletions taskcluster/translations_taskgraph/actions/train.py
@@ -16,6 +16,11 @@ def can_train(parameters):
return parameters["head_repository"] in TRAIN_ON_PROJECTS


# Stages that only have locales in their task names (not providers/datasets).
# Typically these are stages that "fan in" and consume a number of upstream
# tasks that are per-dataset.
LOCALE_ONLY_STAGES = ["merge-corpus"]

@register_callback_action(
name="train",
title="Train",
@@ -37,16 +42,69 @@ def can_train(parameters):
(any stages this choice depends on will be automatically included).""",
"default": "",
# TODO: this should probably be specified in ci/config.yml
"enum": ["clean", "bicleaner", "bicleaner-ai"],
"enum": ["clean", "bicleaner", "bicleaner-ai", "merge-corpus"],
},
"datasets": {
"type": "array",
"type": "object",
"description": "The datasets to train with",
"default": [],
"items": {
"type": "string",
# TODO: pull this from ci/config.yml
"enum": ["flores-dev"],
"default": {},
"properties": {
"train": {
"type": "array",
"description": "Parallel training corpus",
"default": [],
"items": {
"type": "string",
# TODO
# "enum": []
},
},
"devtest": {
"type": "array",
"description": "datasets to merge for validation while training",
"default": [],
"items": {
"type": "string",
# TODO
# "enum": []
},
},
"test": {
"type": "array",
"description": "datasets for evaluation",
"default": [],
"items": {
"type": "string",
# TODO
# "enum": []
},
},
"mono-src": {
"type": "array",
"description": """
monolingual datasets (ex. paracrawl-mono_paracrawl8, commoncrawl_wmt16, news-crawl_news.2020)
to be translated by the teacher model
""",
"default": [],
"items": {
"type": "string",
# TODO
# "enum": []
},
},
"mono-trg": {
"type": "array",
"description": """
monolingual datasets to be translated by the backward model to augment the teacher corpus with back-translations;
leave empty to skip the augmentation step (high-resource languages)
""",
"default": [],
"items": {
"type": "string",
# TODO
# "enum": []
},
},
},
},
# TODO: should these be replaced with a single pair?
@@ -77,12 +135,6 @@ def can_train(parameters):
},
)
def train_action(parameters, graph_config, input, task_group_id, task_id):
stage = input["stage"]
target_datasets = input["datasets"]
src_locale = input.get("src_locale")
trg_locale = input.get("trg_locale")
graph_config["datasets"]
locale_str = f"{src_locale}-{trg_locale}"

# TODO: Add a whack load of verification here. Things such as:
# - datasets all exist
@@ -93,15 +145,13 @@ def train_action(parameters, graph_config, input, task_group_id, task_id):
parameters = dict(parameters)

parameters["target_tasks_method"] = "train-target-tasks"

# When doing staging releases, we still want to re-use tasks from previous
# graphs.
parameters["optimize_target_tasks"] = True
parameters["tasks_for"] = "action"

# make parameters read-only
parameters["target_task_names"] = [f"{stage}-{d}-{locale_str}" for d in target_datasets]
parameters["stage"] = input["stage"]
parameters["datasets"] = input["datasets"]
parameters["src_locale"] = input["src_locale"]
parameters["trg_locale"] = input["trg_locale"]
parameters["bicleaner_threshold"] = input["bicleaner_threshold"]

parameters = Parameters(**parameters)
taskgraph_decision({"root": graph_config.root_dir}, parameters=parameters)
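
Putting the pieces together, a complete action input accepted by this schema might look like the following sketch (locales and dataset names are placeholders):

```python
# Illustrative action input; train_action copies each of these keys
# verbatim into the parameters passed to taskgraph_decision.
input = {
    "stage": "merge-corpus",
    "src_locale": "en",
    "trg_locale": "ru",
    "bicleaner_threshold": "0.0",
    "datasets": {
        "train": ["flores_dev", "sacrebleu_wmt19"],
        "devtest": ["flores_dev"],
        "test": ["sacrebleu_wmt19"],
        "mono-src": [],
        "mono-trg": [],
    },
}
```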
28 changes: 28 additions & 0 deletions taskcluster/translations_taskgraph/parameters.py
@@ -8,11 +8,39 @@
def get_defaults(repo_root):
return {
"bicleaner_threshold": "0.0",
# These will never be used in practice, but specifying them ensures
# that we always generate at least one task for each kind, which helps
# to avoid bustage that doesn't show up until we run the training action.
"datasets": {
"train": [
"flores_dev",
"sacrebleu_wmt19",
],
"devtest": [
"flores_dev",
"sacrebleu_wmt19",
],
"test": [
"flores_dev",
"sacrebleu_wmt19",
],
"mono-src": [
"flores_dev",
"sacrebleu_wmt19",
],
"mono-trg": [
"flores_dev",
"sacrebleu_wmt19",
],
},
}

extend_parameters_schema(
{
Optional("bicleaner_threshold"): str,
Optional("datasets"): {
str: [str],
},
},
defaults_fn=get_defaults,
)
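
For context, `extend_parameters_schema` takes a voluptuous schema fragment, so the `datasets` parameter accepts any mapping of category name to a list of dataset strings. A minimal sketch of what that fragment validates:

```python
# Minimal sketch of the schema fragment above (voluptuous, which
# taskgraph uses for parameter validation).
from voluptuous import Invalid, Optional, Schema

schema = Schema({Optional("datasets"): {str: [str]}})
schema({"datasets": {"train": ["flores_dev"], "mono-src": []}})  # passes
try:
    schema({"datasets": {"train": "flores_dev"}})  # not a list -> rejected
except Invalid:
    pass
```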
29 changes: 25 additions & 4 deletions taskcluster/translations_taskgraph/target_tasks.py
@@ -3,8 +3,29 @@

@_target_task("train-target-tasks")
def train_target_tasks(full_task_graph, parameters, graph_config):
    def filter(label):
        if label in parameters["target_task_names"]:
            return True

    return [label for label in full_task_graph.tasks.keys() if filter(label)]

stage = parameters["stage"]
datasets = parameters["datasets"]
src_locale = parameters["src_locale"]
trg_locale = parameters["trg_locale"]
def filter(task):
# These attributes will be present on tasks from all stages
for attr in ("stage", "src_locale", "trg_locale"):
if task.attributes.get(attr) != parameters[attr]:
return False

        # Datasets are only applicable to dataset-specific tasks. If this
        # attribute isn't present on the task, it can be assumed to be included
        # as long as the above attributes matched, as it will be a task that is
        # either agnostic of datasets or folds in datasets from earlier tasks.
        # (Pulling in the appropriate datasets for these tasks must be handled
        # at the task generation level, usually by the `find_upstreams` transform.)
if "dataset" in task.attributes:
dataset_category = task.attributes["dataset-category"]
for ds in parameters["datasets"][dataset_category]:
provider, dataset = ds.split("_", 1)
if task.attributes["provider"] != provider or task.attributes["dataset"] != dataset:
return False

return True

return [label for label, task in full_task_graph.tasks.items() if filter(task)]
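
As a sanity check on the matching above, a standalone sketch (SimpleNamespace stands in for a real Task object; the attribute values are made up):

```python
# Hypothetical task carrying the attributes the filter inspects.
from types import SimpleNamespace

task = SimpleNamespace(attributes={
    "stage": "clean",
    "src_locale": "en",
    "trg_locale": "ru",
    "dataset-category": "train",
    "provider": "news-crawl",
    "dataset": "news.2020",
})

# split("_", 1) splits on the first underscore only, so dataset names
# may themselves contain underscores.
provider, dataset = "news-crawl_news.2020".split("_", 1)
assert (provider, dataset) == ("news-crawl", "news.2020")
assert task.attributes["provider"] == provider and task.attributes["dataset"] == dataset
```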
