diff --git a/README.md b/README.md
index 72fc218..00fa387 100644
--- a/README.md
+++ b/README.md
@@ -16,6 +16,8 @@ Configuration files could include:
 # Download models: https://object.pouta.csc.fi/hplt_bitextor_models/ara_base.tar.gz \
 https://object.pouta.csc.fi/hplt_bitextor_models/ara_tiny.tar.gz \
+https://object.pouta.csc.fi/hplt_bitextor_models/ca-en_exported_base.zip \
+https://object.pouta.csc.fi/hplt_bitextor_models/ca-en_exported_tiny.zip \
 https://object.pouta.csc.fi/hplt_bitextor_models/eus_base.zip \
 https://object.pouta.csc.fi/hplt_bitextor_models/eus_tiny.zip \
 https://object.pouta.csc.fi/hplt_bitextor_models/gl-en_exported_base.zip \
diff --git a/cat-eng/README.md b/cat-eng/README.md
new file mode 100644
index 0000000..eddc187
--- /dev/null
+++ b/cat-eng/README.md
@@ -0,0 +1,37 @@
+# Bitextor Catalan-English models
+
+## Teacher
+We use a Tatoeba-MT teacher, [Tatoeba-MT-models/cat-eng/opus+bt-2021-04-30](https://object.pouta.csc.fi/Tatoeba-MT-models/cat-eng/opus+bt-2021-04-30.zip), which was trained on the [Tatoeba](https://github.com/Helsinki-NLP/Tatoeba-Challenge/tree/master/data) dataset and back-translations.
+
+## Students
+We used the Firefox Translations training pipeline to train new students with tiny and base architectures. For data, we used [Tatoeba](https://github.com/Helsinki-NLP/Tatoeba-Challenge/tree/master/data), as well as the monolingual [Catalan Textual Corpus](https://zenodo.org/record/4519349) and the [MaCoCu](https://www.clarin.si/repository/xmlui/handle/11356/1837) dataset.
+
+## Evaluation
+BLEU scores for the teacher and the students:
+
+| Model | Flores200-devtest | Model size | Tokens/s on 1 CPU |
+|---|---|---|---|
+| Teacher | 41.5 | 798M | - |
+| Student-base | 40.3 | 41M | 1224.97 |
+| Student-tiny | 39.4 | 17M | 2940.61 |
+
+## How to run
+1. Compile [browsermt/marian-dev](https://github.com/browsermt/marian-dev)
+2. Each model comes with a script called `run.sh` and a configuration file `config.yml`; modify them as needed
+3. Run `bash run.sh` (see the sketch after this list)
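+
+A minimal sketch of the full sequence for the base student (build flags, archive layout, and the unzip step are illustrative; the shipped `run.sh` and `config.yml` hold the exact settings):
+
+```bash
+# 1. Build the browsermt fork of Marian
+git clone https://github.com/browsermt/marian-dev
+cmake -S marian-dev -B marian-dev/build
+cmake --build marian-dev/build -j4
+
+# 2. Fetch and unpack a student model
+bash students/base/download_student.sh
+unzip ca-en_exported_base.zip
+
+# 3. Run the model with the shipped settings
+bash run.sh
+```
\ No newline at end of file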
diff --git a/cat-eng/students/base/download_student.sh b/cat-eng/students/base/download_student.sh
new file mode 100644
index 0000000..73c4aea
--- /dev/null
+++ b/cat-eng/students/base/download_student.sh
@@ -0,0 +1,4 @@
+#!/bin/bash
+# Download the base student model from the Allas container
+wget https://object.pouta.csc.fi/hplt_bitextor_models/ca-en_exported_base.zip
+
diff --git a/cat-eng/students/base/ftt_config.yml b/cat-eng/students/base/ftt_config.yml
new file mode 100644
index 0000000..049f3a2
--- /dev/null
+++ b/cat-eng/students/base/ftt_config.yml
@@ -0,0 +1,67 @@
+####
+# Example of a production config
+# Change the language pair, experiment name, datasets and other settings as needed
+# Training low-resource languages might require more tuning of the pipeline/training configs
+####
+
+experiment:
+  name: opusmt
+  src: ca
+  trg: en
+  src_three_letter: cat
+  trg_three_letter: eng
+
+  # OPUS models are not ensembled; they have different vocabs anyway
+  teacher-ensemble: 1
+
+  # URL of the OPUS-MT model to use as the teacher
+  opusmt-teacher: "https://object.pouta.csc.fi/Tatoeba-MT-models/cat-eng/opus+bt-2021-04-30.zip"
+
+  # URL of the OPUS-MT model to use as the backward model
+  opusmt-backward: "https://object.pouta.csc.fi/Tatoeba-MT-models/eng-cat/opus+bt-2021-04-10.zip"
+
+  # Only specify this if the model is target-multilingual
+  target-language-token:
+
+  # path to a pretrained backward model (optional)
+  backward-model: ""
+
+  # limits per downloaded dataset
+  mono-max-sentences-src: 100000000
+  mono-max-sentences-trg: 20000000
+
+  spm-sample-size: 2000000
+
+  # split the corpus to parallelize translation
+  split-length: 2000000
+
+  best-model: perplexity
+  bicleaner:
+    default-threshold: 0.5
+    dataset-thresholds:
+      tc_Tatoeba-train-v2021-08-07.cat.eng: 0.5
+
+# TODO: extract this info straight from the OPUS model yml info file
+datasets:
+  # parallel training corpus
+  train:
+    - tc_Tatoeba-Challenge-v2021-08-07
+  # datasets to merge for validation while training; translated by the teacher
+  devtest:
+    - flores_dev
+  # datasets for evaluation
+  test:
+    - flores_devtest
+  mono-src:
+    - custom-corpus/catext
+    - custom-corpus/macocu
+
+marian-args:
+  decoding-teacher: # added to reduce decoding time
+    mini-batch: 64
+    beam-size: 4
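+
+# Hypothetical usage sketch, assuming the Snakemake-based Firefox Translations
+# training pipeline; the profile and core count below are placeholders:
+#
+#   snakemake --configfile ftt_config.yml --profile profiles/local --cores 16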
diff --git a/cat-eng/students/base/model_config.yml b/cat-eng/students/base/model_config.yml
new file mode 100644
index 0000000..bd2776d
--- /dev/null
+++ b/cat-eng/students/base/model_config.yml
@@ -0,0 +1,28 @@
+# Model options
+dec-cell-base-depth: 2
+dec-cell-high-depth: 1
+dec-cell: ssru
+dec-depth: 2
+dim-emb: 512
+enc-cell-depth: 1
+enc-cell: gru
+enc-depth: 6
+enc-type: bidirectional
+tied-embeddings-all: true
+transformer-decoder-autoreg: rnn
+transformer-dim-ffn: 2048
+transformer-ffn-activation: relu
+transformer-ffn-depth: 2
+transformer-guided-alignment-layer: last
+transformer-heads: 8
+transformer-postprocess: dan
+transformer-preprocess: ""
+transformer-tied-layers: []
+transformer-train-position-embeddings: false
+type: transformer
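+
+# Hypothetical sketch of how Marian consumes these options during training
+# (data paths, vocab files, and the output model name are placeholders):
+#
+#   marian --config model_config.yml --train-sets corpus.ca corpus.en \
+#          --vocabs vocab.spm vocab.spm --model student.npz
\ No newline at end of file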
diff --git a/cat-eng/students/tiny/download_student.sh b/cat-eng/students/tiny/download_student.sh
new file mode 100644
index 0000000..27516b0
--- /dev/null
+++ b/cat-eng/students/tiny/download_student.sh
@@ -0,0 +1,4 @@
+#!/bin/bash
+# Download the tiny student model from the Allas container
+wget https://object.pouta.csc.fi/hplt_bitextor_models/ca-en_exported_tiny.zip
+
diff --git a/cat-eng/students/tiny/ftt_config.yml b/cat-eng/students/tiny/ftt_config.yml
new file mode 100644
index 0000000..049f3a2
--- /dev/null
+++ b/cat-eng/students/tiny/ftt_config.yml
@@ -0,0 +1,62 @@
+####
+# Example of a production config
+# Change the language pair, experiment name, datasets and other settings as needed
+# Training low-resource languages might require more tuning of the pipeline/training configs
+####
+
+experiment:
+  name: opusmt
+  src: ca
+  trg: en
+  src_three_letter: cat
+  trg_three_letter: eng
+
+  # OPUS models are not ensembled; they have different vocabs anyway
+  teacher-ensemble: 1
+
+  # URL of the OPUS-MT model to use as the teacher
+  opusmt-teacher: "https://object.pouta.csc.fi/Tatoeba-MT-models/cat-eng/opus+bt-2021-04-30.zip"
+
+  # URL of the OPUS-MT model to use as the backward model
+  opusmt-backward: "https://object.pouta.csc.fi/Tatoeba-MT-models/eng-cat/opus+bt-2021-04-10.zip"
+
+  # Only specify this if the model is target-multilingual
+  target-language-token:
+
+  # path to a pretrained backward model (optional)
+  backward-model: ""
+
+  # limits per downloaded dataset
+  mono-max-sentences-src: 100000000
+  mono-max-sentences-trg: 20000000
+
+  spm-sample-size: 2000000
+
+  # split the corpus to parallelize translation
+  split-length: 2000000
+
+  best-model: perplexity
+  bicleaner:
+    default-threshold: 0.5
+    dataset-thresholds:
+      tc_Tatoeba-train-v2021-08-07.cat.eng: 0.5
+
+# TODO: extract this info straight from the OPUS model yml info file
+datasets:
+  # parallel training corpus
+  train:
+    - tc_Tatoeba-Challenge-v2021-08-07
+  # datasets to merge for validation while training; translated by the teacher
+  devtest:
+    - flores_dev
+  # datasets for evaluation
+  test:
+    - flores_devtest
+  mono-src:
+    - custom-corpus/catext
+    - custom-corpus/macocu
+
+marian-args:
+  decoding-teacher: # added to reduce decoding time
+    mini-batch: 64
+    beam-size: 4
diff --git a/cat-eng/students/tiny/model_config.yml b/cat-eng/students/tiny/model_config.yml
new file mode 100644
index 0000000..0e9c47c
--- /dev/null
+++ b/cat-eng/students/tiny/model_config.yml
@@ -0,0 +1,30 @@
+# https://github.com/browsermt/students/tree/master/train-student/models/student.tiny11
+dec-cell-base-depth: 2
+dec-cell-high-depth: 1
+dec-cell: ssru
+dec-depth: 2
+dim-emb: 256
+dim-vocabs: [32000, 32000]
+enc-cell-depth: 1
+enc-cell: gru
+enc-depth: 6
+enc-type: bidirectional
+tied-embeddings-all: true
+transformer-decoder-autoreg: rnn
+transformer-dim-ffn: 1536
+transformer-ffn-activation: relu
+transformer-ffn-depth: 2
+transformer-guided-alignment-layer: last
+transformer-heads: 8
+transformer-no-projection: false
+transformer-postprocess-emb: d
+transformer-postprocess: dan
+transformer-preprocess: ""
+transformer-tied-layers: []
+transformer-train-position-embeddings: false
+type: transformer
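+
+# Hypothetical decoding sketch with the exported tiny student (the config.yml
+# shipped inside the zip archive holds the exact runtime settings):
+#
+#   echo "Bon dia!" | marian-decoder -c config.yml --cpu-threads 1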