diff --git a/batchalign/document.py b/batchalign/document.py index 7cc62eb..81cc51d 100644 --- a/batchalign/document.py +++ b/batchalign/document.py @@ -208,6 +208,7 @@ def __str__(self): # t = re.sub(r"^[^\w\d\s<]+", "", t.strip()).strip() t = re.sub(r",", " , ", t.strip()).strip() t = re.sub(r" +", " ", t.strip()).strip() + t = t.replace("+ ,", "+,").strip() return t def __repr__(self): diff --git a/batchalign/pipelines/morphosyntax/ud.py b/batchalign/pipelines/morphosyntax/ud.py index fcc9bca..ae80c27 100644 --- a/batchalign/pipelines/morphosyntax/ud.py +++ b/batchalign/pipelines/morphosyntax/ud.py @@ -883,6 +883,7 @@ def morphoanalyze(doc: Document, retokenize:bool, status_hook:callable = None, * retokenized_ut = retokenized_ut.replace(" >", ">") retokenized_ut = retokenized_ut.replace("< ", "<") retokenized_ut = retokenized_ut.replace(" :", ":") + retokenized_ut = retokenized_ut.replace("+ ,", "+,") retokenized_ut = retokenized_ut.replace(": <", ": <") retokenized_ut = retokenized_ut.replace(" ↑", "↑") retokenized_ut = re.sub(r"@ ?w ?p", "@wp", retokenized_ut) diff --git a/batchalign/version b/batchalign/version index 2287b0f..27b567a 100644 --- a/batchalign/version +++ b/batchalign/version @@ -1,3 +1,3 @@ -0.7.3-beta.13 +0.7.3-beta.14 July 6th, 2024 UD Fixes diff --git a/scratchpad.py b/scratchpad.py index 352bce4..08c7941 100644 --- a/scratchpad.py +++ b/scratchpad.py @@ -21,7 +21,7 @@ # json.dump(Document.model_json_schema(), df, indent=4) ########### The Batchalign Core Test Harness ########### -# from batchalign.formats.chat.parser import chat_parse_utterance +from batchalign.formats.chat.parser import chat_parse_utterance # ng = NgramRetraceEngine() # disf = DisfluencyReplacementEngine() @@ -100,19 +100,20 @@ # text = "ice ice cream ice cream" -# function = "morphosyntax" -# lang = "cym" -# num_speakers = 1 +function = "morphosyntax" +lang = "ron" +num_speakers = 1 -# forms, delim = chat_parse_utterance("<ポン@o ポン@o> [/] ポン@o .", None, None, None, None) -# utterance = Utterance(content=forms, delim=delim, text="<ポン@o ポン@o> [/] ポン@o .") +forms, delim = chat_parse_utterance("+, culoarea galbenă. ", None, None, None, None) +utterance = Utterance(content=forms, delim=delim, text="+, culoarea galbenă .") -# ut = Document(content=[utterance], langs=["jpn"]) -# pipeline = BatchalignPipeline.new("morphosyntax", lang="jpn") -# res = pipeline(ut, retokenize=True) +ut = Document(content=[utterance], langs=["ron"]) -# print(str(CHATFile(doc=res))) +pipeline = BatchalignPipeline.new("morphosyntax", lang="ron") +res = pipeline(ut, retokenize=True) + +print(str(CHATFile(doc=res))) ########### The Batchalign Individual Engine Harness ###########