From 13c1055653eb23452750950b9674e51b9e3cd867 Mon Sep 17 00:00:00 2001 From: Christian Geng Date: Thu, 31 Aug 2023 10:30:48 +0200 Subject: [PATCH 1/7] add initial parametrization. --- bertalign/aligner.py | 44 ++++++++++++++++++++++++++++---------------- 1 file changed, 28 insertions(+), 16 deletions(-) diff --git a/bertalign/aligner.py b/bertalign/aligner.py index 6c9f916..824c991 100644 --- a/bertalign/aligner.py +++ b/bertalign/aligner.py @@ -1,52 +1,64 @@ import numpy as np + from bertalign import model from bertalign.corelib import * from bertalign.utils import * class Bertalign: def __init__(self, - src, - tgt, + src_raw, + tgt_raw, max_align=5, top_k=3, win=5, skip=-0.1, margin=True, len_penalty=True, - is_split=False, + input_type='raw', + src_lang='de', + tgt_lang='fr', ): - + self.max_align = max_align self.top_k = top_k self.win = win self.skip = skip self.margin = margin self.len_penalty = len_penalty - - src = clean_text(src) - tgt = clean_text(tgt) - src_lang = detect_lang(src) - tgt_lang = detect_lang(tgt) - - if is_split: + + if not src_lang: + src_lang = detect_lang(src) + if not tgt_lang: + tgt_lang = detect_lang(tgt) + + if input_type == 'lines': + # need to split + src = clean_text(src_raw) + tgt = clean_text(tgt_raw) src_sents = src.splitlines() tgt_sents = tgt.splitlines() - else: + elif input_type == 'raw': + src = clean_text(src_raw) + tgt = clean_text(tgt_raw) src_sents = split_sents(src, src_lang) tgt_sents = split_sents(tgt, tgt_lang) - + elif input_type == 'tokenized': + src_sents = src_raw + tgt_sents = tgt_raw + src_num = len(src_sents) tgt_num = len(tgt_sents) - + src_lang = LANG.ISO[src_lang] tgt_lang = LANG.ISO[tgt_lang] - + print("Source language: {}, Number of sentences: {}".format(src_lang, src_num)) print("Target language: {}, Number of sentences: {}".format(tgt_lang, tgt_num)) - print("Embedding source and target text using {} ...".format(model.model_name)) + print("Embedding source text using {} ...".format(model.model_name)) 
src_vecs, src_lens = model.transform(src_sents, max_align - 1) + print("Embedding target text using {} ...".format(model.model_name)) tgt_vecs, tgt_lens = model.transform(tgt_sents, max_align - 1) char_ratio = np.sum(src_lens[0,]) / np.sum(tgt_lens[0,]) From 80c197424dd632c0f8f4610254ae8d40a17f0879 Mon Sep 17 00:00:00 2001 From: Christian Geng Date: Thu, 31 Aug 2023 15:26:03 +0200 Subject: [PATCH 2/7] Check input type options. --- bertalign/aligner.py | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/bertalign/aligner.py b/bertalign/aligner.py index 824c991..061a2a8 100644 --- a/bertalign/aligner.py +++ b/bertalign/aligner.py @@ -16,8 +16,8 @@ def __init__(self, margin=True, len_penalty=True, input_type='raw', - src_lang='de', - tgt_lang='fr', + src_lang=None, + tgt_lang=None, ): self.max_align = max_align @@ -32,6 +32,10 @@ def __init__(self, if not tgt_lang: tgt_lang = detect_lang(tgt) + input_types = ['raw', 'newline', 'tokenized'] + if input_type not in input_types: + raise ValueError("Invalid input type. 
Expected one of: %s" % input_types) + if input_type == 'lines': # need to split src = clean_text(src_raw) @@ -74,7 +78,7 @@ def __init__(self, self.char_ratio = char_ratio self.src_vecs = src_vecs self.tgt_vecs = tgt_vecs - + def align_sents(self): print("Performing first-step alignment ...") @@ -83,7 +87,7 @@ def align_sents(self): first_w, first_path = find_first_search_path(self.src_num, self.tgt_num) first_pointers = first_pass_align(self.src_num, self.tgt_num, first_w, first_path, first_alignment_types, D, I) first_alignment = first_back_track(self.src_num, self.tgt_num, first_pointers, first_path, first_alignment_types) - + print("Performing second-step alignment ...") second_alignment_types = get_alignment_types(self.max_align) second_w, second_path = find_second_search_path(first_alignment, self.win, self.src_num, self.tgt_num) @@ -91,10 +95,10 @@ def align_sents(self): second_w, second_path, second_alignment_types, self.char_ratio, self.skip, margin=self.margin, len_penalty=self.len_penalty) second_alignment = second_back_track(self.src_num, self.tgt_num, second_pointers, second_path, second_alignment_types) - + print("Finished! Successfully aligning {} {} sentences to {} {} sentences\n".format(self.src_num, self.src_lang, self.tgt_num, self.tgt_lang)) self.result = second_alignment - + def print_sents(self): for bead in (self.result): src_line = self._get_line(bead[0], self.src_sents) From ecc223fb0fdb8e5ab532548ab8f550ca2036f800 Mon Sep 17 00:00:00 2001 From: Christian Geng Date: Fri, 1 Sep 2023 12:24:00 +0200 Subject: [PATCH 3/7] Add unit tests for results. 
--- tests/conftest.py | 37 +++++++++++++++ tests/gold_standard_text_und_berg.json | 58 +++++++++++++++++++++++ tests/requirements.txt | 1 + tests/test_results.py | 64 ++++++++++++++++++++++++++ 4 files changed, 160 insertions(+) create mode 100644 tests/conftest.py create mode 100644 tests/gold_standard_text_und_berg.json create mode 100644 tests/requirements.txt create mode 100644 tests/test_results.py diff --git a/tests/conftest.py b/tests/conftest.py new file mode 100644 index 0000000..8801e6f --- /dev/null +++ b/tests/conftest.py @@ -0,0 +1,37 @@ +import pytest +import json +import os + +def load_json(fpath): + with open(fpath) as json_file: + data = json.load(json_file) + return data + + +@pytest.fixture +def text_and_berg_expected_results(): + """Fixture for the Text und Berg expected result.""" + + cur_dir = os.path.dirname(os.path.realpath(__file__)) + fname = 'gold_standard_text_und_berg.json' + fpath = os.path.join(cur_dir, fname) + data = load_json(fpath) + yield data + + + +@pytest.fixture +def text_and_berg_inputs(): + r"""Input data for Text and Berg.""" + + src_dir = 'text+berg/de' + tgt_dir = 'text+berg/fr' + gold_dir = 'text+berg/gold' + + data = [] + for file in os.listdir(src_dir): + src_file = os.path.join(src_dir, file).replace("\\","/") + tgt_file = os.path.join(tgt_dir, file).replace("\\","/") + data.append((file, src_file, tgt_file, gold_dir)) + + yield data diff --git a/tests/gold_standard_text_und_berg.json b/tests/gold_standard_text_und_berg.json new file mode 100644 index 0000000..7a85923 --- /dev/null +++ b/tests/gold_standard_text_und_berg.json @@ -0,0 +1,58 @@ +{ + "002": { + "recall_strict": 0.9588477366255144, + "recall_lax": 0.9917695473251029, + "precision_strict": 0.9505703422053232, + "precision_lax": 0.9847908745247148, + "f1_strict": 0.9546910980176844, + "f1_lax": 0.9882678910702977 + }, + "006": { + "recall_strict": 0.9694444444444444, + "recall_lax": 0.9944444444444445, + "precision_strict": 0.9607329842931938, + 
"precision_lax": 0.9869109947643979, + "f1_strict": 0.9650690556740179, + "f1_lax": 0.9906633978772443 + }, + "001": { + "recall_strict": 0.9553191489361702, + "recall_lax": 0.9957446808510638, + "precision_strict": 0.9496981891348089, + "precision_lax": 0.9879275653923542, + "f1_strict": 0.9525003764104154, + "f1_lax": 0.991820720553515 + }, + "005": { + "recall_strict": 0.9502982107355865, + "recall_lax": 0.9960238568588469, + "precision_strict": 0.9453860640301318, + "precision_lax": 0.9887005649717514, + "f1_strict": 0.9478357731413087, + "f1_lax": 0.9923487000713064 + }, + "007": { + "recall_strict": 0.937592867756315, + "recall_lax": 0.9910846953937593, + "precision_strict": 0.9265536723163842, + "precision_lax": 0.9830508474576272, + "f1_strict": 0.9320405838088075, + "f1_lax": 0.9870514243433223 + }, + "004": { + "recall_strict": 0.9404145077720207, + "recall_lax": 0.9896373056994818, + "precision_strict": 0.9320148331273177, + "precision_lax": 0.9851668726823238, + "f1_strict": 0.9361958300767388, + "f1_lax": 0.9873970292534215 + }, + "003": { + "recall_strict": 0.9405594405594405, + "recall_lax": 0.9906759906759907, + "precision_strict": 0.9319955406911928, + "precision_lax": 0.9866220735785953, + "f1_strict": 0.9362579076540054, + "f1_lax": 0.9886448763947483 + } +} diff --git a/tests/requirements.txt b/tests/requirements.txt new file mode 100644 index 0000000..e079f8a --- /dev/null +++ b/tests/requirements.txt @@ -0,0 +1 @@ +pytest diff --git a/tests/test_results.py b/tests/test_results.py new file mode 100644 index 0000000..1e645b3 --- /dev/null +++ b/tests/test_results.py @@ -0,0 +1,64 @@ +import os + +import pytest + +from bertalign import Bertalign +from bertalign.eval import read_alignments +from bertalign.eval import score_multiple +from bertalign.eval import log_final_scores + + +def align_text_and_berg(filespec, aligner_spec): + r"""Align Text and Berg using the original aligner.""" + + test_alignments = [] + gold_alignments = [] + + results = 
{} + + for test_data in filespec: + + file, src_file, tgt_file, gold_dir = test_data + src = open(src_file, 'rt', encoding='utf-8').read() + tgt = open(tgt_file, 'rt', encoding='utf-8').read() + + print("Start aligning {} to {}".format(src_file, tgt_file)) + # aligner = Bertalign(src, tgt, is_split=True) + aligner = Bertalign(src, tgt, **aligner_spec) + aligner.align_sents() + test_alignments.append(aligner.result) + + gold_file = os.path.join(gold_dir, file) + gold_alignments.append(read_alignments(gold_file)) + + scores = score_multiple(gold_list=gold_alignments, test_list=test_alignments) + log_final_scores(scores) + results[file] = scores + return results + + +@pytest.mark.skip(reason="is_split is removed at the moment.") +def test_aligner_original(text_and_berg_expected_results, text_and_berg_inputs): + r"""Test results for the original aligner using is_split.""" + + aligner_spec = {"is_split": True} + result = align_text_and_berg(text_and_berg_inputs, aligner_spec) + + for file in result: + expected = text_and_berg_expected_results[file] + calculated = result[file] + for metric in expected: + assert expected[metric] == calculated[metric], "Result mismatch" + + +def test_aligner_altered_parametrization(text_and_berg_expected_results, text_and_berg_inputs): + r"""Test results for the aligner using input_type and languages.""" + + aligner_spec = {"input_type": 'lines', 'src_lang': 'de', 'tgt_lang': 'fr'} + result = align_text_and_berg(text_and_berg_inputs, aligner_spec) + + for file in result: + expected = text_and_berg_expected_results[file] + calculated = result[file] + for metric in expected: + assert expected[metric] == calculated[metric], "Result mismatch" From f84fe54d02ff7abb018c0c0bd75d965c1a79df70 Mon Sep 17 00:00:00 2001 From: Christian Geng Date: Fri, 1 Sep 2023 12:24:55 +0200 Subject: [PATCH 4/7] move checking of input types to better location. 
--- bertalign/aligner.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/bertalign/aligner.py b/bertalign/aligner.py index 061a2a8..f9fe6ee 100644 --- a/bertalign/aligner.py +++ b/bertalign/aligner.py @@ -27,15 +27,15 @@ def __init__(self, self.margin = margin self.len_penalty = len_penalty + input_types = ['raw', 'lines', 'tokenized'] + if input_type not in input_types: + raise ValueError("Invalid input type '%s'. Expected one of: %s" % (input_type, input_types)) + if not src_lang: src_lang = detect_lang(src) if not tgt_lang: tgt_lang = detect_lang(tgt) - input_types = ['raw', 'newline', 'tokenized'] - if input_type not in input_types: - raise ValueError("Invalid input type. Expected one of: %s" % input_types) - if input_type == 'lines': # need to split src = clean_text(src_raw) From 81358be724318d4c64f0589d7b2eabf4ff3a33e6 Mon Sep 17 00:00:00 2001 From: Christian Geng Date: Fri, 1 Sep 2023 12:30:24 +0200 Subject: [PATCH 5/7] parametrize aligner spec. --- tests/test_results.py | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/tests/test_results.py b/tests/test_results.py index 1e645b3..97a74da 100644 --- a/tests/test_results.py +++ b/tests/test_results.py @@ -18,9 +18,9 @@ def align_text_and_berg(filespec, aligner_spec): for test_data in filespec: - file, src_file, tgt_file, gold_dir = test_data - src = open(src_file, 'rt', encoding='utf-8').read() - tgt = open(tgt_file, 'rt', encoding='utf-8').read() + file, src_file, tgt_file, gold_dir = test_data + src = open(src_file, "rt", encoding="utf-8").read() + tgt = open(tgt_file, "rt", encoding="utf-8").read() print("Start aligning {} to {}".format(src_file, tgt_file)) # aligner = Bertalign(src, tgt, is_split=True) @@ -51,10 +51,19 @@ def test_aligner_original(text_and_berg_expected_results, text_and_berg_inputs): assert expected[metric] == calculated[metric], "Result mismatch" -def test_aligner_altered_parametrization(text_and_berg_expected_results, 
text_and_berg_inputs): +aligner_spec_explicit = { + "input_type": "lines", + "src_lang": "de", + "tgt_lang": "fr", +} + + +@pytest.mark.parametrize("aligner_spec", [aligner_spec_explicit]) +def test_aligner_altered_parametrization( + text_and_berg_expected_results, text_and_berg_inputs, aligner_spec +): r"""Test results for the aligner using input_type and languages.""" - aligner_spec = {"input_type": 'lines', 'src_lang': 'de', 'tgt_lang': 'fr'} result = align_text_and_berg(text_and_berg_inputs, aligner_spec) for file in result: From 698fa0ed9a525a62cefe11dd3e8a1656ef8d1e9a Mon Sep 17 00:00:00 2001 From: Christian Geng Date: Fri, 1 Sep 2023 14:34:20 +0200 Subject: [PATCH 6/7] Add parametrization for aligner not specifying lang. --- tests/test_results.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tests/test_results.py b/tests/test_results.py index 97a74da..730dc27 100644 --- a/tests/test_results.py +++ b/tests/test_results.py @@ -58,6 +58,11 @@ def test_aligner_original(text_and_berg_expected_results, text_and_berg_inputs): } +aligner_spec_detect = { + "input_type": "lines", +} + +# @pytest.mark.parametrize("aligner_spec", [aligner_spec_detect]) @pytest.mark.parametrize("aligner_spec", [aligner_spec_explicit]) def test_aligner_altered_parametrization( text_and_berg_expected_results, text_and_berg_inputs, aligner_spec From d56f1d47f36eeb303ee09ebf70479eca29a0924a Mon Sep 17 00:00:00 2001 From: Christian Geng Date: Fri, 1 Sep 2023 14:34:45 +0200 Subject: [PATCH 7/7] Inelegant way of specifying language. --- bertalign/aligner.py | 31 ++++++++++++++++++++++++++----- 1 file changed, 26 insertions(+), 5 deletions(-) diff --git a/bertalign/aligner.py b/bertalign/aligner.py index f9fe6ee..b3678ad 100644 --- a/bertalign/aligner.py +++ b/bertalign/aligner.py @@ -31,26 +31,47 @@ def __init__(self, if input_type not in input_types: raise ValueError("Invalid input type '%s'. 
Expected one of: %s" % (input_type, input_types)) - if not src_lang: - src_lang = detect_lang(src) - if not tgt_lang: - tgt_lang = detect_lang(tgt) - if input_type == 'lines': # need to split src = clean_text(src_raw) tgt = clean_text(tgt_raw) src_sents = src.splitlines() tgt_sents = tgt.splitlines() + + if not src_lang: + src_lang = detect_lang(src) + if not tgt_lang: + tgt_lang = detect_lang(tgt) + + elif input_type == 'raw': src = clean_text(src_raw) tgt = clean_text(tgt_raw) + + if not src_lang: + src_lang = detect_lang(src) + if not tgt_lang: + tgt_lang = detect_lang(tgt) + src_sents = split_sents(src, src_lang) tgt_sents = split_sents(tgt, tgt_lang) + elif input_type == 'tokenized': + + if not src_lang: + src_lang = detect_lang(src) + if not tgt_lang: + tgt_lang = detect_lang(tgt) + src_sents = src_raw tgt_sents = tgt_raw + if not src_lang: + src_lang = detect_lang(' '.join(src_sents)) + if not tgt_lang: + tgt_lang = detect_lang(' '.join(tgt_sents)) + + src_num = len(src_sents) tgt_num = len(tgt_sents)