Skip to content

Commit

Permalink
Combine split_by_provider and dataset_substitution transforms into a …
Browse files Browse the repository at this point in the history
…more general from_datasets one (mozilla#118)
  • Loading branch information
bhearsum authored May 12, 2023
1 parent 3f023cf commit 222e541
Show file tree
Hide file tree
Showing 6 changed files with 217 additions and 130 deletions.
24 changes: 13 additions & 11 deletions taskcluster/ci/bicleaner/kind.yml
Original file line number Diff line number Diff line change
Expand Up @@ -2,14 +2,11 @@
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
---
# TODO: this may not be used for all locale pairs? or not for all dataset types?
# TODO: should this run on large instances? gpu?

loader: taskgraph.loader.transform:loader

transforms:
- translations_taskgraph.transforms.split_by_provider:transforms
- translations_taskgraph.transforms.dataset_substitutions:transforms
- translations_taskgraph.transforms.from_datasets:transforms
- translations_taskgraph.transforms.command_context_from_params:transforms
- taskgraph.transforms.job:transforms
- translations_taskgraph.transforms.cache:transforms
Expand All @@ -28,13 +25,14 @@ task-defaults:
- pipeline/bicleaner/bicleaner.sh
parameters:
- bicleaner_threshold
substitution-fields:
- description
- name
- dependencies
- fetches
- treeherder.symbol
- worker.env
dataset-config:
substitution-fields:
- description
- name
- dependencies
- fetches
- treeherder.symbol
- worker.env
worker:
max-run-time: 3600
artifacts:
Expand Down Expand Up @@ -123,6 +121,10 @@ tasks:
worker-type: t-linux-v100-gpu
treeherder:
platform: bicleaner-ai/opt
dataset-config:
exclude-locales:
- src: en
trg: ru
attributes:
cache:
type: bicleaner-ai
Expand Down
39 changes: 26 additions & 13 deletions taskcluster/ci/clean/kind.yml
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
loader: taskgraph.loader.transform:loader

transforms:
- translations_taskgraph.transforms.dataset_substitutions:transforms
- translations_taskgraph.transforms.from_datasets:transforms
- taskgraph.transforms.job:transforms
- translations_taskgraph.transforms.cache:transforms
- taskgraph.transforms.cached_tasks:transforms
Expand All @@ -27,13 +27,14 @@ task-defaults:
- pipeline/clean/tools/clean_parallel.py
- pipeline/clean/tools/langid_fasttext.py
worker-type: b-linux
substitution-fields:
- description
- name
- dependencies
- fetches
- treeherder.symbol
- worker.env
dataset-config:
substitution-fields:
- description
- name
- dependencies
- fetches
- treeherder.symbol
- worker.env
worker:
docker-image: {"in-tree": "train"}
max-run-time: 3600
Expand All @@ -55,6 +56,8 @@ task-defaults:
platform: clean/opt
run:
using: run-task
# Include this so `from_datasets` will add a number of other values to it.
command-context: {}
command:
- bash
- -c
Expand All @@ -70,16 +73,24 @@ task-defaults:

tasks:
flores-{dataset}-{src_locale}-{trg_locale}:
provider: flores
dataset-config:
include-datasets:
flores: {}

sacrebleu-{dataset}-{src_locale}-{trg_locale}:
provider: sacrebleu
dataset-config:
include-datasets:
sacrebleu: {}

opus-{dataset_no_slashes}-{src_locale}-{trg_locale}:
provider: opus
dataset-config:
include-datasets:
opus: {}

mtdata-{dataset}-{src_locale}-{trg_locale}:
provider: mtdata
dataset-config:
include-datasets:
mtdata: {}
attributes:
cache:
resources:
Expand All @@ -96,4 +107,6 @@ tasks:
- pipeline/clean/fixes/mtdata_OPUS_UNPC_v1_0.fr.sh

news-crawl-{dataset}-{src_locale}-{trg_locale}:
provider: news-crawl
dataset-config:
include-datasets:
news-crawl: {}
33 changes: 23 additions & 10 deletions taskcluster/ci/dataset/kind.yml
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
loader: taskgraph.loader.transform:loader

transforms:
- translations_taskgraph.transforms.dataset_substitutions:transforms
- translations_taskgraph.transforms.from_datasets:transforms
- taskgraph.transforms.job:transforms
- translations_taskgraph.transforms.cache:transforms
- taskgraph.transforms.cached_tasks:transforms
Expand All @@ -20,10 +20,11 @@ task-defaults:
attributes:
cache:
type: dataset
substitution-fields:
- name
- label
- treeherder.symbol
dataset-config:
substitution-fields:
- name
- label
- treeherder.symbol
worker:
docker-image: {in-tree: toolchain-build}
max-run-time: 1800
Expand All @@ -41,12 +42,16 @@ task-defaults:
run-on-tasks-for: []
run:
using: run-task
# Include this so `from_datasets` will add a number of other values to it.
command-context: {}

tasks:
flores:
description: Fetch flores101 dataset
label: dataset-flores-{dataset}-{src_locale}-{trg_locale}
provider: flores
dataset-config:
include-datasets:
flores: {}
attributes:
cache:
resources:
Expand All @@ -60,7 +65,9 @@ tasks:
sacrebleu:
description: Fetch sacrebleu dataset
label: dataset-sacrebleu-{dataset}-{src_locale}-{trg_locale}
provider: sacrebleu
dataset-config:
include-datasets:
sacrebleu: {}
attributes:
cache:
resources:
Expand All @@ -75,7 +82,9 @@ tasks:
description: Fetch opus dataset
# No slashes version of dataset used here because slashes break caches
label: dataset-opus-{dataset_no_slashes}-{src_locale}-{trg_locale}
provider: opus
dataset-config:
include-datasets:
opus: {}
attributes:
cache:
resources:
Expand All @@ -89,7 +98,9 @@ tasks:
mtdata:
description: Fetch mtdata dataset
label: dataset-mtdata-{dataset}-{src_locale}-{trg_locale}
provider: mtdata
dataset-config:
include-datasets:
mtdata: {}
attributes:
cache:
resources:
Expand All @@ -103,7 +114,9 @@ tasks:
news-crawl:
description: Fetch news-crawl dataset
label: dataset-news-crawl-{dataset}-{src_locale}-{trg_locale}
provider: news-crawl
dataset-config:
include-datasets:
news-crawl: {}
attributes:
cache:
resources:
Expand Down

This file was deleted.

Loading

0 comments on commit 222e541

Please sign in to comment.