diff --git a/pipeline/clean/opuscleaner/configs/en-ja/default.filters.json b/pipeline/clean/opuscleaner/configs/en-ja/default.filters.json new file mode 100644 index 000000000..dac745b3a --- /dev/null +++ b/pipeline/clean/opuscleaner/configs/en-ja/default.filters.json @@ -0,0 +1,77 @@ +{ + "version": 1, + "files": [ + ], + "filters": [ + { + "filter": "remove_empty_lines", + "parameters": {}, + "language": null + }, + { + "filter": "normalize_whitespace", + "parameters": { + "COLLAPSE": true + }, + "language": "" + }, + { + "filter": "deescape-special-chars", + "parameters": { + "LANG1": "other" + }, + "language": "" + }, + { + "filter": "max_length", + "parameters": { + "MAXLENGTH": 150, + "MINLENGTH": 1 + }, + "language": null + }, + { + "filter": "fix_wiki", + "parameters": { + "ALWAYS": false, + "FOOTNOTES": true, + "URLS": true, + "WIKILINKS": true, + "CODE": true, + "HEADINGS": true, + "REMOVEEMPTYLINES": true + }, + "language": null + }, + { + "filter": "alpha_ratio", + "parameters": { + "LANG1": "", + "LANG2": "", + "SRCWORDRAT": 0.4, + "TRGWORDRAT": 0.0, + "SRCALPHARAT": 0.5, + "TRGALPHARAT": 0.0, + "DEBUG": false + }, + "language": null + }, + { + "filter": "num_mismatch", + "parameters": { + "RATIO": 1, + "DEBUG": false + }, + "language": null + }, + { + "filter": "fasttext_filter", + "parameters": { + "FASTTEXT_MODEL_TYPE": "large", + "LANG1": "", + "LANG2": "" + }, + "language": null + } + ] +} diff --git a/pipeline/clean/opuscleaner/configs/en-ko/default.filters.json b/pipeline/clean/opuscleaner/configs/en-ko/default.filters.json new file mode 100644 index 000000000..dac745b3a --- /dev/null +++ b/pipeline/clean/opuscleaner/configs/en-ko/default.filters.json @@ -0,0 +1,77 @@ +{ + "version": 1, + "files": [ + ], + "filters": [ + { + "filter": "remove_empty_lines", + "parameters": {}, + "language": null + }, + { + "filter": "normalize_whitespace", + "parameters": { + "COLLAPSE": true + }, + "language": "" + }, + { + "filter": "deescape-special-chars", + 
"parameters": { + "LANG1": "other" + }, + "language": "" + }, + { + "filter": "max_length", + "parameters": { + "MAXLENGTH": 150, + "MINLENGTH": 1 + }, + "language": null + }, + { + "filter": "fix_wiki", + "parameters": { + "ALWAYS": false, + "FOOTNOTES": true, + "URLS": true, + "WIKILINKS": true, + "CODE": true, + "HEADINGS": true, + "REMOVEEMPTYLINES": true + }, + "language": null + }, + { + "filter": "alpha_ratio", + "parameters": { + "LANG1": "", + "LANG2": "", + "SRCWORDRAT": 0.4, + "TRGWORDRAT": 0.0, + "SRCALPHARAT": 0.5, + "TRGALPHARAT": 0.0, + "DEBUG": false + }, + "language": null + }, + { + "filter": "num_mismatch", + "parameters": { + "RATIO": 1, + "DEBUG": false + }, + "language": null + }, + { + "filter": "fasttext_filter", + "parameters": { + "FASTTEXT_MODEL_TYPE": "large", + "LANG1": "", + "LANG2": "" + }, + "language": null + } + ] +} diff --git a/pipeline/clean/opuscleaner/configs/en-zh/default.filters.json b/pipeline/clean/opuscleaner/configs/en-zh/default.filters.json new file mode 100644 index 000000000..dac745b3a --- /dev/null +++ b/pipeline/clean/opuscleaner/configs/en-zh/default.filters.json @@ -0,0 +1,77 @@ +{ + "version": 1, + "files": [ + ], + "filters": [ + { + "filter": "remove_empty_lines", + "parameters": {}, + "language": null + }, + { + "filter": "normalize_whitespace", + "parameters": { + "COLLAPSE": true + }, + "language": "" + }, + { + "filter": "deescape-special-chars", + "parameters": { + "LANG1": "other" + }, + "language": "" + }, + { + "filter": "max_length", + "parameters": { + "MAXLENGTH": 150, + "MINLENGTH": 1 + }, + "language": null + }, + { + "filter": "fix_wiki", + "parameters": { + "ALWAYS": false, + "FOOTNOTES": true, + "URLS": true, + "WIKILINKS": true, + "CODE": true, + "HEADINGS": true, + "REMOVEEMPTYLINES": true + }, + "language": null + }, + { + "filter": "alpha_ratio", + "parameters": { + "LANG1": "", + "LANG2": "", + "SRCWORDRAT": 0.4, + "TRGWORDRAT": 0.0, + "SRCALPHARAT": 0.5, + "TRGALPHARAT": 0.0, + "DEBUG": 
false + }, + "language": null + }, + { + "filter": "num_mismatch", + "parameters": { + "RATIO": 1, + "DEBUG": false + }, + "language": null + }, + { + "filter": "fasttext_filter", + "parameters": { + "FASTTEXT_MODEL_TYPE": "large", + "LANG1": "", + "LANG2": "" + }, + "language": null + } + ] +} diff --git a/pipeline/clean/opuscleaner/configs/ja-en/default.filters.json b/pipeline/clean/opuscleaner/configs/ja-en/default.filters.json new file mode 100644 index 000000000..dac86c6c4 --- /dev/null +++ b/pipeline/clean/opuscleaner/configs/ja-en/default.filters.json @@ -0,0 +1,77 @@ +{ + "version": 1, + "files": [ + ], + "filters": [ + { + "filter": "remove_empty_lines", + "parameters": {}, + "language": null + }, + { + "filter": "normalize_whitespace", + "parameters": { + "COLLAPSE": true + }, + "language": "" + }, + { + "filter": "deescape-special-chars", + "parameters": { + "LANG1": "other" + }, + "language": "" + }, + { + "filter": "max_length", + "parameters": { + "MAXLENGTH": 150, + "MINLENGTH": 1 + }, + "language": null + }, + { + "filter": "fix_wiki", + "parameters": { + "ALWAYS": false, + "FOOTNOTES": true, + "URLS": true, + "WIKILINKS": true, + "CODE": true, + "HEADINGS": true, + "REMOVEEMPTYLINES": true + }, + "language": null + }, + { + "filter": "alpha_ratio", + "parameters": { + "LANG1": "", + "LANG2": "", + "SRCWORDRAT": 0.0, + "TRGWORDRAT": 0.4, + "SRCALPHARAT": 0.0, + "TRGALPHARAT": 0.5, + "DEBUG": false + }, + "language": null + }, + { + "filter": "num_mismatch", + "parameters": { + "RATIO": 1, + "DEBUG": false + }, + "language": null + }, + { + "filter": "fasttext_filter", + "parameters": { + "FASTTEXT_MODEL_TYPE": "large", + "LANG1": "", + "LANG2": "" + }, + "language": null + } + ] +} diff --git a/pipeline/clean/opuscleaner/configs/ko-en/default.filters.json b/pipeline/clean/opuscleaner/configs/ko-en/default.filters.json new file mode 100644 index 000000000..dac86c6c4 --- /dev/null +++ b/pipeline/clean/opuscleaner/configs/ko-en/default.filters.json @@ 
-0,0 +1,77 @@ +{ + "version": 1, + "files": [ + ], + "filters": [ + { + "filter": "remove_empty_lines", + "parameters": {}, + "language": null + }, + { + "filter": "normalize_whitespace", + "parameters": { + "COLLAPSE": true + }, + "language": "" + }, + { + "filter": "deescape-special-chars", + "parameters": { + "LANG1": "other" + }, + "language": "" + }, + { + "filter": "max_length", + "parameters": { + "MAXLENGTH": 150, + "MINLENGTH": 1 + }, + "language": null + }, + { + "filter": "fix_wiki", + "parameters": { + "ALWAYS": false, + "FOOTNOTES": true, + "URLS": true, + "WIKILINKS": true, + "CODE": true, + "HEADINGS": true, + "REMOVEEMPTYLINES": true + }, + "language": null + }, + { + "filter": "alpha_ratio", + "parameters": { + "LANG1": "", + "LANG2": "", + "SRCWORDRAT": 0.0, + "TRGWORDRAT": 0.4, + "SRCALPHARAT": 0.0, + "TRGALPHARAT": 0.5, + "DEBUG": false + }, + "language": null + }, + { + "filter": "num_mismatch", + "parameters": { + "RATIO": 1, + "DEBUG": false + }, + "language": null + }, + { + "filter": "fasttext_filter", + "parameters": { + "FASTTEXT_MODEL_TYPE": "large", + "LANG1": "", + "LANG2": "" + }, + "language": null + } + ] +} diff --git a/pipeline/clean/opuscleaner/configs/zh-en/default.filters.json b/pipeline/clean/opuscleaner/configs/zh-en/default.filters.json new file mode 100644 index 000000000..dac86c6c4 --- /dev/null +++ b/pipeline/clean/opuscleaner/configs/zh-en/default.filters.json @@ -0,0 +1,77 @@ +{ + "version": 1, + "files": [ + ], + "filters": [ + { + "filter": "remove_empty_lines", + "parameters": {}, + "language": null + }, + { + "filter": "normalize_whitespace", + "parameters": { + "COLLAPSE": true + }, + "language": "" + }, + { + "filter": "deescape-special-chars", + "parameters": { + "LANG1": "other" + }, + "language": "" + }, + { + "filter": "max_length", + "parameters": { + "MAXLENGTH": 150, + "MINLENGTH": 1 + }, + "language": null + }, + { + "filter": "fix_wiki", + "parameters": { + "ALWAYS": false, + "FOOTNOTES": true, + "URLS": 
true, + "WIKILINKS": true, + "CODE": true, + "HEADINGS": true, + "REMOVEEMPTYLINES": true + }, + "language": null + }, + { + "filter": "alpha_ratio", + "parameters": { + "LANG1": "", + "LANG2": "", + "SRCWORDRAT": 0.0, + "TRGWORDRAT": 0.4, + "SRCALPHARAT": 0.0, + "TRGALPHARAT": 0.5, + "DEBUG": false + }, + "language": null + }, + { + "filter": "num_mismatch", + "parameters": { + "RATIO": 1, + "DEBUG": false + }, + "language": null + }, + { + "filter": "fasttext_filter", + "parameters": { + "FASTTEXT_MODEL_TYPE": "large", + "LANG1": "", + "LANG2": "" + }, + "language": null + } + ] +} diff --git a/pipeline/clean/opuscleaner/generate_filters.py b/pipeline/clean/opuscleaner/generate_filters.py index f30420650..f5316d1a2 100644 --- a/pipeline/clean/opuscleaner/generate_filters.py +++ b/pipeline/clean/opuscleaner/generate_filters.py @@ -33,8 +33,7 @@ def find_custom_filter(src: str, trg: str, dataset: str) -> Optional[str]: paths = [ f"{CURRENT_FOLDER}/configs/{src}-{trg}/{dataset}.filters.json", f"{CURRENT_FOLDER}/configs/{src}-{trg}/{dataset_opus}.filters.json", - f"{CURRENT_FOLDER}/configs/{trg}-{src}/{dataset}.filters.json", - f"{CURRENT_FOLDER}/configs/{trg}-{src}/{dataset_opus}.filters.json", + f"{CURRENT_FOLDER}/configs/{src}-{trg}/default.filters.json", f"{CURRENT_FOLDER}/configs/{dataset}.filters.json", f"{CURRENT_FOLDER}/configs/{dataset_opus}.filters.json", ] diff --git a/pipeline/clean/tools/clean_mono.py b/pipeline/clean/tools/clean_mono.py index 0df09ec42..c3e7a01c8 100755 --- a/pipeline/clean/tools/clean_mono.py +++ b/pipeline/clean/tools/clean_mono.py @@ -31,7 +31,11 @@ def main(): def clean_mono(src, lang): - src_toks = src.split() + # TODO: move mono cleaning to OpusCleaner + # when it supports this https://github.com/hplt-project/OpusCleaner/issues/141 + + # treat individual characters as tokens for CJK + src_toks = src.split() if lang not in {"zh", "ja", "ko"} else src src_len = len(src_toks) if not src_len: diff --git 
a/taskcluster/kinds/clean-corpus/kind.yml b/taskcluster/kinds/clean-corpus/kind.yml index b212cd668..04c59230d 100644 --- a/taskcluster/kinds/clean-corpus/kind.yml +++ b/taskcluster/kinds/clean-corpus/kind.yml @@ -38,6 +38,12 @@ tasks: - pipeline/clean/opuscleaner/configs/remove_frequent_patterns.txt - pipeline/clean/opuscleaner/configs/default.filters.json - pipeline/clean/opuscleaner/configs/ru-en/opus_ELRC-3075-wikipedia_health-v1.filters.json + - pipeline/clean/opuscleaner/configs/en-zh/default.filters.json + - pipeline/clean/opuscleaner/configs/zh-en/default.filters.json + - pipeline/clean/opuscleaner/configs/en-ja/default.filters.json + - pipeline/clean/opuscleaner/configs/ja-en/default.filters.json + - pipeline/clean/opuscleaner/configs/en-ko/default.filters.json + - pipeline/clean/opuscleaner/configs/ko-en/default.filters.json - pipeline/clean/opuscleaner/configs/opus_LinguaTools-WikiTitles-v2014.filters.json - pipeline/clean/opuscleaner/configs/opus_NLLB-v1.filters.json - pipeline/clean/opuscleaner/configs/opus_OpenSubtitles-v2018.filters.json diff --git a/tests/test_filters.py b/tests/test_filters.py index e07610a60..c85621e56 100644 --- a/tests/test_filters.py +++ b/tests/test_filters.py @@ -28,12 +28,13 @@ def data_dir(): Mode.custom, "ru-en/opus_ELRC-3075-wikipedia_health-v1.filters.json", ), + # backward direction should have a separate custom config because it can be different (for example, zh) ( "en", "ru", "opus_ELRC-3075-wikipedia_health/v1", Mode.custom, - "ru-en/opus_ELRC-3075-wikipedia_health-v1.filters.json", + "default.filters.json", ), # verify dataset specific config is used for different language pairs ("ru", "en", "opus_UNPC/v1.0", Mode.custom, "opus_UNPC-v1.0.filters.json"), @@ -47,6 +48,21 @@ def data_dir(): "default.filters.json", ), ("fr", "en", "opus_UNPC/v1.0", Mode.defaults, "default.filters.json"), + # make sure Chinese uses language level default configs + ( + "zh", + "en", + "opus_UNPC/v1.0", + Mode.custom, + 
"zh-en/default.filters.json", + ), + ( + "en", + "zh", + "opus_UNPC/v1.0", + Mode.custom, + "en-zh/default.filters.json", + ), ], ids=[ "default-en-ru", @@ -58,6 +74,8 @@ def data_dir(): "dataset-fr-en", "override-with-default-ru-en-elrc", "override-with-default-fr-en-unpc", + "zh-en", + "en-zh", ], ) def test_generate_filters(params, data_dir): @@ -79,13 +97,27 @@ def test_generate_filters(params, data_dir): expected = json.load(f_conf) assert len(actual["filters"]) == len(expected["filters"]) assert {f["filter"] for f in actual["filters"]} == {f["filter"] for f in expected["filters"]} - assert {f["language"] for f in actual["filters"] if f["filter"] == "normalize_whitespace"} == { - src, - trg, + # check languages in whitespace filters where there are two of them + whitespace_filters = { + f["language"] for f in actual["filters"] if f["filter"] == "normalize_whitespace" } + if len(whitespace_filters) == 2: + assert whitespace_filters == { + src, + trg, + } # max length value is slightly changed in opus_ELRC-3075-wikipedia_health/v1 to verify that this is the same config assert [f for f in actual["filters"] if f["filter"] == "max_length"][0]["parameters"][ "MAXLENGTH" ] == [f for f in expected["filters"] if f["filter"] == "max_length"][0]["parameters"][ "MAXLENGTH" ] + # alpha ratios are different for zh + alpha_ratio_filters = [f for f in actual["filters"] if f["filter"] == "alpha_ratio"] + if alpha_ratio_filters: + assert ( + alpha_ratio_filters[0]["parameters"]["SRCWORDRAT"] + == [f for f in expected["filters"] if f["filter"] == "alpha_ratio"][0]["parameters"][ + "SRCWORDRAT" + ] + ) diff --git a/utils/config_generator.py b/utils/config_generator.py index c14c03a8c..aa6d5c181 100644 --- a/utils/config_generator.py +++ b/utils/config_generator.py @@ -118,6 +118,9 @@ def update_config( else: experiment["pretrained-models"] = {} + if source in CJK_LANGS or target in CJK_LANGS: + experiment["opuscleaner-mode"] = "custom" + datasets = prod_config["datasets"] # Clear 
out the base config.