Skip to content

Commit

Permalink
Add merge-corpus pipeline step
Browse files Browse the repository at this point in the history
The kind itself is quite boring -- all the interesting bits have been handled by transforms.
  • Loading branch information
bhearsum committed May 16, 2023
1 parent ccfed74 commit 0ba83fd
Showing 1 changed file with 96 additions and 0 deletions.
96 changes: 96 additions & 0 deletions taskcluster/ci/merge-corpus/kind.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
---

# Loader for this kind, given as a `module:attribute` reference.
loader: taskgraph.loader.transform:loader

# Transforms for this kind, each given as a `module:attribute` reference.
# The first two and the cache transform come from the project-local
# `translations_taskgraph` package; the remainder come from `taskgraph`.
transforms:
  - translations_taskgraph.transforms.from_datasets:locales_only
  - translations_taskgraph.transforms.find_upstreams:by_locales
  - taskgraph.transforms.job:transforms
  - translations_taskgraph.transforms.cache:transforms
  - taskgraph.transforms.cached_tasks:transforms
  - taskgraph.transforms.task:transforms

kind-dependencies:
  # `merge_corpus` has three possible upstream tasks. From most to least
  # preferred:
  #   1) `bicleaner-ai` (part of the `bicleaner` kind), used if a bicleaner-ai
  #      data pack exists for the language pair.
  #   2) `bicleaner` (also part of the `bicleaner` kind), used if a non-ai
  #      bicleaner data pack is available.
  #   3) `clean`, used otherwise; in that case `bicleaner` is skipped
  #      altogether.
  - bicleaner
  - clean
  # Needed for the `preprocess` toolchain fetch declared by the tasks.
  - toolchain

tasks:
  # Task template keyed by language pair. The `{src_locale}` / `{trg_locale}`
  # placeholders are presumably expanded by the `from_datasets:locales_only`
  # transform -- TODO confirm against that transform.
  "{src_locale}-{trg_locale}":
    description: "merge corpus for {src_locale}-{trg_locale}"
    attributes:
      dataset-category: train
      stage: merge-corpus
      cache:
        type: merge-corpus
        resources:
          - pipeline/clean/merge-corpus.sh
    dataset-config:
      # Dotted paths of the fields in this task that carry `{...}`
      # placeholders (presumably the only ones substituted -- confirm).
      substitution-fields:
        - description
        - name
        - treeherder.symbol
        - worker.env
        - upstreams-config.locale-pair
    upstreams-config:
      locale-pair:
        src: "{src_locale}"
        trg: "{trg_locale}"
      upstream-task-attributes:
        # Keyed by cleaning type; each of the three possible upstream kinds
        # of cleaning maps to the attribute value of the same name.
        cleaning-type:
          by-cleaning-type:
            bicleaner-ai: bicleaner-ai
            bicleaner: bicleaner
            clean: clean
      # One compressed artifact per side of the language pair.
      upstream-artifacts:
        - "{dataset_no_slashes}.{src_locale}.zst"
        - "{dataset_no_slashes}.{trg_locale}.zst"
    worker-type: b-linux-large
    worker:
      docker-image:
        in-tree: train
      max-run-time: 3600
      artifacts:
        - name: public/build
          path: /builds/worker/artifacts
          type: directory
      env:
        SRC: "{src_locale}"
        TRG: "{trg_locale}"
        COMPRESSION_CMD: zstdmt
        ARTIFACT_EXT: zst

    # Don't run unless explicitly scheduled
    run-on-tasks-for: []

    # `treeherder` must appear exactly once in this mapping: YAML mapping
    # keys are unique, and a repeated key is silently overridden by most
    # parsers. Both the symbol and the platform are therefore set here.
    treeherder:
      symbol: "{src_locale}-{trg_locale}"
      platform: merge-corpus/opt
    run:
      using: run-task
      command-context: {}
      command:
        - bash
        - -c
        # Arguments are:
        # 1) output directory
        # 2) input files
        # The folded scalar below collapses into a single shell command line;
        # BIN is exported before the script is invoked.
        - >-
          export BIN=$MOZ_FETCHES_DIR &&
          $VCS_PATH/pipeline/clean/merge-corpus.sh
          artifacts
          $MOZ_FETCHES_DIR/*.zst
    fetches:
      toolchain:
        - preprocess

0 comments on commit 0ba83fd

Please sign in to comment.