Skip to content

Commit

Permalink
[ci skip] benchmarking logic
Browse files Browse the repository at this point in the history
  • Loading branch information
Jemoka committed Dec 7, 2024
1 parent 75bba0b commit 40dc89b
Show file tree
Hide file tree
Showing 6 changed files with 72 additions and 15 deletions.
2 changes: 1 addition & 1 deletion batchalign/formats/chat/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -108,7 +108,7 @@ def annotation_clean(content, special=False):
cleaned_word = re.sub(r"\x15\d+_\d+\x15", '', cleaned_word)
if not special:
cleaned_word = re.sub(r"&~\w+", '', cleaned_word)
cleaned_word = cleaned_word.replace("(","").replace(")","")
# cleaned_word = cleaned_word.replace("(","").replace(")","")
cleaned_word = cleaned_word.replace("[","").replace("]","")
cleaned_word = cleaned_word.replace("<","").replace(">","")
cleaned_word = cleaned_word.replace("“","").replace("”","")
Expand Down
50 changes: 44 additions & 6 deletions batchalign/pipelines/analysis/eval.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
Engines for transcript evaluation
"""

import re
from batchalign.document import *
from batchalign.pipelines.base import *
from batchalign.pipelines.asr.utils import *
Expand All @@ -22,11 +23,34 @@ def __compute_wer(doc, gold):
forms = [ j.text.lower() for i in doc.content for j in i.content if isinstance(i, Utterance)]
gold_forms = [ j.text.lower() for i in gold.content for j in i.content if isinstance(i, Utterance)]

forms = [i for i in forms if i.strip() not in MOR_PUNCT+ENDING_PUNCT]
gold_forms = [i for i in gold_forms if i.strip() not in MOR_PUNCT+ENDING_PUNCT]
forms = [i.replace("-", "") for i in forms if i.strip() not in MOR_PUNCT+ENDING_PUNCT]
gold_forms = [i.replace("-", "") for i in gold_forms if i.strip() not in MOR_PUNCT+ENDING_PUNCT]

forms = [re.sub(r"\((.*)\)",r"", i) for i in forms]
gold_forms = [re.sub(r"\((.*)\)",r"", i) for i in gold_forms]

# if there are single-letter forms, we combine them together
# until a longer form appears or there aren't any left
forms_finished = []

single_sticky = ""
is_single = False

for i in forms:
if len(i) == 1:
single_sticky += i
else:
if single_sticky != "":
forms_finished.append(single_sticky)
single_sticky = ""
forms_finished.append(i)

if single_sticky != "":
forms_finished.append(single_sticky)
single_sticky = ""

# dp!
alignment = align(forms, gold_forms, False)
alignment = align(forms_finished, gold_forms, False)

# calculate each type of error
sub = 0
Expand All @@ -39,14 +63,28 @@ def __compute_wer(doc, gold):
# but if we have <extra.reference> <extra.reference> this is 2 insertions

cleaned_alignment = []
# whether we had a "firstname" in the reference document and hence are
# anticipating a payload for it (the actual name) in the next entry of the
# alignment
anticipating_payload = False

for i in alignment:

if isinstance(i, Extra):
if len(cleaned_alignment) > 0 and i.extra_type == ExtraType.REFERENCE and "name" in i.key and i.key[:4] != "name":
cleaned_alignment.pop(-1)

if i.extra_type == ExtraType.REFERENCE and "name" in i.key and i.key[:4] != "name":
if (isinstance(cleaned_alignment[-1], Extra) and
cleaned_alignment[-1].extra_type == ExtraType.PAYLOAD and
len(cleaned_alignment) > 0):
cleaned_alignment.pop(-1)
else:
anticipating_payload = True
cleaned_alignment.append(Match(i.key, None, None))
continue
elif i.extra_type == ExtraType.PAYLOAD and anticipating_payload:
anticipating_payload = False
continue


if prev_error != None and prev_error != i.extra_type:
# this is a substitution: we have different "extra"s in
Expand Down Expand Up @@ -75,7 +113,7 @@ def __compute_wer(doc, gold):
cleaned_alignment.append(i)

diff = []
for i in alignment:
for i in cleaned_alignment:
if isinstance(i, Extra):
diff.append(f"{'+' if i.extra_type == ExtraType.REFERENCE else '-'} {i.key}")
else:
Expand Down
3 changes: 2 additions & 1 deletion batchalign/pipelines/morphosyntax/ud.py
Original file line number Diff line number Diff line change
Expand Up @@ -115,6 +115,7 @@ def handler(word, lang=None):
target = target.replace('/100', '')
target = target.replace('/r', '')
target = target.replace('(', '')
target = target.replace("(","").replace(")","")

# remove attachments
if "|" in target:
Expand Down Expand Up @@ -848,7 +849,7 @@ def morphoanalyze(doc: Document, retokenize:bool, status_hook:callable = None, *
inputs.append(line_cut)

try:
sents = nlp(line_cut.strip()).sentences
sents = nlp(line_cut.replace("(","").replace(")","").strip()).sentences

if len(sents) == 0:
continue
Expand Down
3 changes: 3 additions & 0 deletions batchalign/utils/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ def word_tokenize(str):
return tmp.tokenize(str)
except LookupError:
nltk.download("punkt")
nltk.download("punkt_tab")
return tmp.tokenize(str)

def sent_tokenize(str):
Expand All @@ -49,6 +50,7 @@ def sent_tokenize(str):
return ST(str)
except LookupError:
nltk.download("punkt")
nltk.download("punkt_tab")
return ST(str)

def detokenize(tokens):
Expand All @@ -69,6 +71,7 @@ def detokenize(tokens):
return TreebankWordDetokenizer().detokenize(tokens)
except LookupError:
nltk.download("punkt")
nltk.download("punkt_tab")
return TreebankWordDetokenizer().detokenize(tokens)

def correct_timing(doc):
Expand Down
6 changes: 3 additions & 3 deletions batchalign/version
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
0.7.6-alpha.33
November 26, 2024
French APM (minor)
0.7.7-alpha.1
December 06, 2024
Changing some benchmarking form handling logic
23 changes: 19 additions & 4 deletions scratchpad.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,25 @@

# # ng = NgramRetraceEngine()
# # disf = DisfluencyReplacementEngine()
# # doc = Document.new("I ' m such an idiot", lang="eng")
# doc = Document.new("in general, we have hair here.", lang="eng")

# forms, delim = chat_parse_utterance("in general, we have hair(stuff) here.", None, None, None, None)
# utterance = Utterance(content=forms, delim=delim)
# gold = Document(content=[utterance], langs=["eng"])

# pipeline = BatchalignPipeline(EvaluationEngine())
# result = pipeline(doc, gold=gold)

# # pipeline = BatchalignPipeline.new("morphosyntax")
# # result2 = pipeline(gold)

# # print(str(CHATFile(doc=result2)))


# result
# print(result["diff"])


# # # # doc[0].content[4].text = "maman,"
# # # # doc[0].content[5].text = "maman,"
# # pipe = BatchalignPipeline(ng, disf)
Expand Down Expand Up @@ -92,9 +110,6 @@

# from batchalign.models import BertUtteranceModel
# from batchalign.pipelines import BatchalignPipeline
# forms, delim = chat_parse_utterance("les chevaux.", None, None, None, None)
# utterance = Utterance(content=forms, delim=delim)
# ut = Document(content=[utterance], langs=["fra"])
# pipe = BatchalignPipeline.new("morphosyntax", "fra")
# res = pipe(ut)
# print(str(CHATFile(doc=res)))
Expand Down

0 comments on commit 40dc89b

Please sign in to comment.