From 9e8641b91a5eaa6be53bec7704ddc762e359e0cb Mon Sep 17 00:00:00 2001 From: Jaume Zaragoza Date: Mon, 25 Nov 2024 18:55:47 +0100 Subject: [PATCH] Cjk corpora fixes (#937) * Remove num_mismatch filter for Chinese It is removing a lot of sentences that have exactly the same numbers. This seems to be happening when the Chinese side has the numbers glued to the Chinese characters. * Remove displaystyle in WikiMatrix * Normalize to full-width punctuation * Do not omit the character preceding the period --------- Co-authored-by: Evgeny Pavlov --- .../configs/en-zh/WikiMatrix-v1.filters.json | 21 +++++++++++ .../configs/en-zh/default.filters.json | 35 ++++++++++++++----- .../configs/zh-en/WikiMatrix-v1.filters.json | 21 +++++++++++ .../configs/zh-en/default.filters.json | 8 ----- 4 files changed, 69 insertions(+), 16 deletions(-) create mode 100644 pipeline/clean/opuscleaner/configs/en-zh/WikiMatrix-v1.filters.json create mode 100644 pipeline/clean/opuscleaner/configs/zh-en/WikiMatrix-v1.filters.json diff --git a/pipeline/clean/opuscleaner/configs/en-zh/WikiMatrix-v1.filters.json b/pipeline/clean/opuscleaner/configs/en-zh/WikiMatrix-v1.filters.json new file mode 100644 index 000000000..f82aee0a0 --- /dev/null +++ b/pipeline/clean/opuscleaner/configs/en-zh/WikiMatrix-v1.filters.json @@ -0,0 +1,21 @@ +{ + "version": 1, + "files": [ + ], + "filters": [ + { + "filter": "regexp", + "parameters": { + "PATTERN": "s#\\{(?:[^{}]++|(?R))*+\\}##g" + }, + "language": "" + }, + { + "filter": "regexp", + "parameters": { + "PATTERN": "s#\\{(?:[^{}]++|(?R))*+\\}##g" + }, + "language": "" + } + ] +} diff --git a/pipeline/clean/opuscleaner/configs/en-zh/default.filters.json b/pipeline/clean/opuscleaner/configs/en-zh/default.filters.json index dac745b3a..6b102e475 100644 --- a/pipeline/clean/opuscleaner/configs/en-zh/default.filters.json +++ b/pipeline/clean/opuscleaner/configs/en-zh/default.filters.json @@ -22,6 +22,33 @@ }, "language": "" }, + { + "_comment": "Normalize to full-width 
punctuation", + "filter": "opus.RegExpSub", + "parameters": { + "patterns": [ + [ + "([\\u3400-\\u4dbf\\u4e00-\\u9fff\\uf900-\\ufaff!\uff01\uff1f\\?])\\?", + "\\1\uff1f", + 0, + "" + ], + [ + "([\\u3400-\\u4dbf\\u4e00-\\u9fff\\uf900-\\ufaff!\uff01\uff1f\\?])\\!", + "\\1\uff01", + 0, + "" + ], + [ + "([\\u3400-\\u4dbf\\u4e00-\\u9fff\\uf900-\\ufaff])\\.\\s*(?!\\s*\\.)", + "\\1\uff61", + 0, + "" + ] + ] + }, + "language": null + }, { "filter": "max_length", "parameters": { @@ -56,14 +83,6 @@ }, "language": null }, - { - "filter": "num_mismatch", - "parameters": { - "RATIO": 1, - "DEBUG": false - }, - "language": null - }, { "filter": "fasttext_filter", "parameters": { diff --git a/pipeline/clean/opuscleaner/configs/zh-en/WikiMatrix-v1.filters.json b/pipeline/clean/opuscleaner/configs/zh-en/WikiMatrix-v1.filters.json new file mode 100644 index 000000000..f82aee0a0 --- /dev/null +++ b/pipeline/clean/opuscleaner/configs/zh-en/WikiMatrix-v1.filters.json @@ -0,0 +1,21 @@ +{ + "version": 1, + "files": [ + ], + "filters": [ + { + "filter": "regexp", + "parameters": { + "PATTERN": "s#\\{(?:[^{}]++|(?R))*+\\}##g" + }, + "language": "" + }, + { + "filter": "regexp", + "parameters": { + "PATTERN": "s#\\{(?:[^{}]++|(?R))*+\\}##g" + }, + "language": "" + } + ] +} diff --git a/pipeline/clean/opuscleaner/configs/zh-en/default.filters.json b/pipeline/clean/opuscleaner/configs/zh-en/default.filters.json index dac86c6c4..e34004af8 100644 --- a/pipeline/clean/opuscleaner/configs/zh-en/default.filters.json +++ b/pipeline/clean/opuscleaner/configs/zh-en/default.filters.json @@ -56,14 +56,6 @@ }, "language": null }, - { - "filter": "num_mismatch", - "parameters": { - "RATIO": 1, - "DEBUG": false - }, - "language": null - }, { "filter": "fasttext_filter", "parameters": {