[ci skip] benchmarking logic

TalkBank · Dec 7, 2024 · 40dc89b · 40dc89b
1 parent 75bba0b
commit 40dc89b
Show file tree

Hide file tree

Showing 6 changed files with 72 additions and 15 deletions.
diff --git a/batchalign/formats/chat/utils.py b/batchalign/formats/chat/utils.py
@@ -108,7 +108,7 @@ def annotation_clean(content, special=False):
     cleaned_word = re.sub(r"\x15\d+_\d+\x15", '', cleaned_word)
     if not special:
         cleaned_word = re.sub(r"&~\w+", '', cleaned_word)
-    cleaned_word = cleaned_word.replace("(","").replace(")","")
+    # cleaned_word = cleaned_word.replace("(","").replace(")","")
     cleaned_word = cleaned_word.replace("[","").replace("]","")
     cleaned_word = cleaned_word.replace("<","").replace(">","")
     cleaned_word = cleaned_word.replace("“","").replace("”","")

diff --git a/batchalign/pipelines/analysis/eval.py b/batchalign/pipelines/analysis/eval.py
@@ -3,6 +3,7 @@
 Engines for transcript evaluation
 """
 
+import re
 from batchalign.document import *
 from batchalign.pipelines.base import *
 from batchalign.pipelines.asr.utils import *
@@ -22,11 +23,34 @@ def __compute_wer(doc, gold):
         forms = [ j.text.lower() for i in doc.content for j in i.content if isinstance(i, Utterance)]
         gold_forms = [ j.text.lower() for i in gold.content for j in i.content if isinstance(i, Utterance)]
 
-        forms = [i for i in forms if i.strip() not in MOR_PUNCT+ENDING_PUNCT]
-        gold_forms = [i for i in gold_forms if i.strip() not in MOR_PUNCT+ENDING_PUNCT]
+        forms = [i.replace("-", "") for i in forms if i.strip() not in MOR_PUNCT+ENDING_PUNCT]
+        gold_forms = [i.replace("-", "") for i in gold_forms if i.strip() not in MOR_PUNCT+ENDING_PUNCT]
+
+        forms = [re.sub(r"\((.*)\)",r"", i) for i in forms]
+        gold_forms = [re.sub(r"\((.*)\)",r"", i) for i in gold_forms]
+
+        # if there are single letter frames, we combine them together
+        # until the utterance is done or there isn't any left
+        forms_finished = []
+
+        single_sticky = ""
+        is_single = False
+
+        for i in forms:
+            if len(i) == 1:
+                single_sticky += i
+            else:
+                if single_sticky != "":
+                    forms_finished.append(single_sticky)
+                    single_sticky = ""
+                forms_finished.append(i)
+
+        if single_sticky != "":
+            forms_finished.append(single_sticky)
+            single_sticky = ""
 
         # dp!
-        alignment = align(forms, gold_forms, False)
+        alignment = align(forms_finished, gold_forms, False)
 
         # calculate each type of error
         sub = 0
@@ -39,14 +63,28 @@ def __compute_wer(doc, gold):
         #     but if we have <extra.reference> <extra.reference> this is 2 insertions
 
         cleaned_alignment = []
+        # whether we had a "firstname" in reference document and hence are
+        # anticipating a payload for it (the actual name) in the next entry in the
+        # alignment
+        anticipating_payload = False
 
         for i in alignment:
 
             if isinstance(i, Extra):
-                if len(cleaned_alignment) > 0 and i.extra_type == ExtraType.REFERENCE and "name" in i.key and i.key[:4] != "name":
-                    cleaned_alignment.pop(-1)
+
+                if i.extra_type == ExtraType.REFERENCE and "name" in i.key and i.key[:4] != "name":
+                    if (isinstance(cleaned_alignment[-1], Extra) and
+                        cleaned_alignment[-1].extra_type ==  ExtraType.PAYLOAD and
+                        len(cleaned_alignment) > 0):
+                        cleaned_alignment.pop(-1)
+                    else:
+                        anticipating_payload = True
                     cleaned_alignment.append(Match(i.key, None, None))
                     continue
+                elif i.extra_type == ExtraType.PAYLOAD and anticipating_payload:
+                    anticipating_payload = False
+                    continue
+
 
                 if prev_error != None and prev_error != i.extra_type:
                     # this is a substitution: we have different "extra"s in
@@ -75,7 +113,7 @@ def __compute_wer(doc, gold):
             cleaned_alignment.append(i)
 
         diff = []
-        for i in alignment:
+        for i in cleaned_alignment:
             if isinstance(i, Extra):
                 diff.append(f"{'+' if i.extra_type == ExtraType.REFERENCE else '-'} {i.key}")
             else:

diff --git a/batchalign/pipelines/morphosyntax/ud.py b/batchalign/pipelines/morphosyntax/ud.py
@@ -115,6 +115,7 @@ def handler(word, lang=None):
     target = target.replace('/100', '')
     target = target.replace('/r', '')
     target = target.replace('(', '')
+    target = target.replace("(","").replace(")","")
 
     # remove attachments
     if "|" in target:
@@ -848,7 +849,7 @@ def morphoanalyze(doc: Document, retokenize:bool, status_hook:callable = None, *
         inputs.append(line_cut)
 
         try:
-            sents = nlp(line_cut.strip()).sentences
+            sents = nlp(line_cut.replace("(","").replace(")","").strip()).sentences
 
             if len(sents) == 0:
                 continue

diff --git a/batchalign/utils/utils.py b/batchalign/utils/utils.py
@@ -29,6 +29,7 @@ def word_tokenize(str):
         return tmp.tokenize(str)
     except LookupError:
         nltk.download("punkt")
+        nltk.download("punkt_tab")
         return tmp.tokenize(str)
 
 def sent_tokenize(str):
@@ -49,6 +50,7 @@ def sent_tokenize(str):
         return ST(str)
     except LookupError:
         nltk.download("punkt")
+        nltk.download("punkt_tab")
         return ST(str)
 
 def detokenize(tokens):
@@ -69,6 +71,7 @@ def detokenize(tokens):
         return TreebankWordDetokenizer().detokenize(tokens)
     except LookupError:
         nltk.download("punkt")
+        nltk.download("punkt_tab")
         return TreebankWordDetokenizer().detokenize(tokens)
 
 def correct_timing(doc):

diff --git a/batchalign/version b/batchalign/version
@@ -1,3 +1,3 @@
-0.7.6-alpha.33
-November 26, 2024
-French APM (minor)
+0.7.7-alpha.1
+December 06, 2024
+Changing some benchmarking form handling logic
diff --git a/scratchpad.py b/scratchpad.py
@@ -54,7 +54,25 @@
 
 # # ng = NgramRetraceEngine()
 # # disf = DisfluencyReplacementEngine()
-# # doc = Document.new("I ' m such an idiot", lang="eng")
+# doc = Document.new("in general, we have hair here.", lang="eng")
+
+# forms, delim = chat_parse_utterance("in general, we have hair(stuff) here.", None, None, None, None)
+# utterance = Utterance(content=forms, delim=delim)
+# gold = Document(content=[utterance], langs=["eng"])
+
+# pipeline = BatchalignPipeline(EvaluationEngine())
+# result = pipeline(doc, gold=gold)
+
+# # pipeline = BatchalignPipeline.new("morphosyntax")
+# # result2 = pipeline(gold)
+
+# # print(str(CHATFile(doc=result2)))
+
+
+# result
+# print(result["diff"])
+
+
 # # # # doc[0].content[4].text = "maman,"
 # # # # doc[0].content[5].text = "maman,"
 # # pipe = BatchalignPipeline(ng, disf)
@@ -92,9 +110,6 @@
 
 # from batchalign.models import BertUtteranceModel
 # from batchalign.pipelines import BatchalignPipeline
-# forms, delim = chat_parse_utterance("les chevaux.", None, None, None, None)
-# utterance = Utterance(content=forms, delim=delim)
-# ut = Document(content=[utterance], langs=["fra"])
 # pipe = BatchalignPipeline.new("morphosyntax", "fra")
 # res = pipe(ut)
 # print(str(CHATFile(doc=res)))