diff --git a/taskcluster/ci/merge-corpus/kind.yml b/taskcluster/ci/merge-corpus/kind.yml new file mode 100644 index 000000000..22d047ad2 --- /dev/null +++ b/taskcluster/ci/merge-corpus/kind.yml @@ -0,0 +1,96 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. +--- + +loader: taskgraph.loader.transform:loader + +transforms: + - translations_taskgraph.transforms.from_datasets:locales_only + - translations_taskgraph.transforms.find_upstreams:by_locales + - taskgraph.transforms.job:transforms + - translations_taskgraph.transforms.cache:transforms + - taskgraph.transforms.cached_tasks:transforms + - taskgraph.transforms.task:transforms + +kind-dependencies: + # There are three possible upstream tasks for `merge_corpus`, in order of preference: + # 1) `bicleaner-ai` (from the `bicleaner` kind) is used if there is a bicleaner-ai data pack + # for the language pair. + # 2) `bicleaner` (also from the `bicleaner` kind) is used if there is a non-ai bicleaner + # data pack available + # 3) Otherwise, `clean` is the upstream and `bicleaner` is skipped altogether. + - bicleaner + - clean + - toolchain + +tasks: + "{src_locale}-{trg_locale}": + description: merge corpus for {src_locale}-{trg_locale} + treeherder: + platform: merge-corpus/opt + attributes: + dataset-category: train + stage: merge-corpus + cache: + type: merge-corpus + resources: + - pipeline/clean/merge-corpus.sh + dataset-config: + substitution-fields: + - description + - name + - treeherder.symbol + - worker.env + - upstreams-config.locale-pair + upstreams-config: + locale-pair: + src: "{src_locale}" + trg: "{trg_locale}" + upstream-task-attributes: + cleaning-type: + by-cleaning-type: + bicleaner-ai: bicleaner-ai + bicleaner: bicleaner + clean: clean + upstream-artifacts: + - "{dataset_no_slashes}.{src_locale}.zst" + - "{dataset_no_slashes}.{trg_locale}.zst" + worker-type: b-linux-large + worker: + docker-image: {"in-tree": "train"} + max-run-time: 3600 + artifacts: + - name: public/build + path: /builds/worker/artifacts + type: directory + env: + SRC: "{src_locale}" + TRG: "{trg_locale}" + COMPRESSION_CMD: zstdmt + ARTIFACT_EXT: zst + + # Don't run unless explicitly scheduled + run-on-tasks-for: [] + + treeherder: + symbol: "{src_locale}-{trg_locale}" + platform: merge-corpus/opt + run: + using: run-task + command-context: {} + command: + - bash + - -c + # Arguments are: + # 1) output directory + # 2) input files + - >- + export BIN=$MOZ_FETCHES_DIR && + $VCS_PATH/pipeline/clean/merge-corpus.sh + artifacts + $MOZ_FETCHES_DIR/*.zst + + fetches: + toolchain: + - preprocess