Skip to content

Commit

Permalink
more creaky fixes
Browse files Browse the repository at this point in the history
  • Loading branch information
Jemoka committed Nov 11, 2024
1 parent 562ffcc commit 0ad8499
Show file tree
Hide file tree
Showing 3 changed files with 13 additions and 9 deletions.
12 changes: 8 additions & 4 deletions batchalign/pipelines/morphosyntax/ud.py
Original file line number Diff line number Diff line change
Expand Up @@ -893,8 +893,11 @@ def morphoanalyze(doc: Document, retokenize:bool, status_hook:callable = None, *

for i in aligned:
if isinstance(i, Match):
if i.reference_payload not in chunks_backplate[i.payload]:
chunks_backplate[i.payload].append(i.reference_payload)
if not creaky:
if i.reference_payload not in chunks_backplate[i.payload]:
chunks_backplate[i.payload].append(i.reference_payload)
else:
collected += i.key
elif isinstance(i, Extra) and i.extra_type == ExtraType.PAYLOAD:
if i.key == "⁎":
creaky = not creaky
Expand All @@ -913,7 +916,6 @@ def morphoanalyze(doc: Document, retokenize:bool, status_hook:callable = None, *
pos = "x",
feats = ""
)]

poses = [i.morphology[0].pos.upper() for i in ut
if i.morphology
and len(i.morphology) > 0]
Expand All @@ -938,9 +940,11 @@ def morphoanalyze(doc: Document, retokenize:bool, status_hook:callable = None, *
retokenized_ut = retokenized_ut.replace(" ↑", "↑")
retokenized_ut = re.sub(r"@ ?w ?p", "@wp", retokenized_ut)
retokenized_ut = retokenized_ut.replace(" @", "@")
retokenized_ut = re.sub(r"\*[* ]*", "*", retokenized_ut)
# retokenized_ut = re.sub(r"\*[* ]*", "*", retokenized_ut)
retokenized_ut = re.sub(r"⁎[⁎ ]*(.*?)[⁎ ]*⁎", r"⁎\1⁎ ", retokenized_ut)
retokenized_ut = re.sub(r"\[\*(.)\]", r"[* \1]", retokenized_ut)
retokenized_ut = re.sub(r" +", r" ", retokenized_ut)

# pray to everyone that it works---this will simply crash and ignore
# the utterance if it didn't work, so we are doing this as a sanity
# check rather than needing the parsed result
Expand Down
2 changes: 1 addition & 1 deletion batchalign/version
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
0.7.6-alpha.27
0.7.6-alpha.28
November 10, 2024
Japanese changes + packaging
8 changes: 4 additions & 4 deletions scratchpad.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,12 +28,12 @@
# # json.dump(Document.model_json_schema(), df, indent=4)

# ########### The Batchalign Core Test Harness ###########
from batchalign.formats.chat.parser import chat_parse_utterance
# from batchalign.formats.chat.parser import chat_parse_utterance

# print(str(CHATFile(doc=ut)))
# doc = CHATFile(path="../talkbank-alignment/input/barry.cha").doc
# doc[3][0]

# て
# print(str(CHATFile(doc=res)))


Expand Down Expand Up @@ -138,12 +138,12 @@
# # forms
# utterance = Utterance(content=forms, delim=delim, text=ut)

# sec = "ミッキーさん ⁎いい子⁎ [: いい子いい子] なの . "
# sec = "カッカに ⁎かって⁎ [: 貸て] くれる [* s] ."

# forms, delim = chat_parse_utterance(sec, None, None, None, None)
# utterance = Utterance(content=forms, delim=delim, text=sec)

# # # =======
# # # # =======
# ut = Document(content=[utterance], langs=["jpn"])

# pipeline = BatchalignPipeline.new("morphosyntax", lang="jpn")
Expand Down

0 comments on commit 0ad8499

Please sign in to comment.