Skip to content

Commit

Permalink
muting catalan warning
Browse files Browse the repository at this point in the history
  • Loading branch information
Jemoka committed Jan 25, 2024
1 parent 7d0990c commit f9e11b2
Show file tree
Hide file tree
Showing 2 changed files with 15 additions and 15 deletions.
24 changes: 12 additions & 12 deletions batchalign/pipelines/morphosyntax/ud.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@
from batchalign.document import *
from batchalign.pipelines.base import *
from batchalign.formats.chat.parser import chat_parse_utterance

from batchalign.utils.dp import *

import logging
Expand All @@ -60,7 +60,7 @@ def stringify_feats(*feats):

# the following is a list of feature-extracting handlers
# it is used to extract features from specific parts of
# speech.
# speech.

def handler(word, lang=None):
"""The generic handler"""
Expand All @@ -76,7 +76,7 @@ def handler(word, lang=None):

# unknown flag
unknown = False

# if there is a 0 in front, the word is unknown
# so we mark it as such
if target[0] == '0':
Expand Down Expand Up @@ -173,7 +173,7 @@ def handler__ADJ(word, lang=None):
person = str(feats.get("Person", 1))
if person == "0":
person = '4'

return handler(word, lang)+stringify_feats(deg, case, number[:1]+person)

def handler__NOUN(word, lang=None):
Expand Down Expand Up @@ -213,7 +213,7 @@ def handler__VERB(word, lang=None):
if person == "0":
person = '4'
number = feats.get("Number", "Sing")

tense = feats.get("Tense", "")
polarity = feats.get("Polarity", "")
polite = feats.get("Polite", "")
Expand Down Expand Up @@ -369,7 +369,7 @@ def parse_sentence(sentence, delimiter=".", special_forms=[], lang="$nospecial$"
mor_word = handle(word, lang)
# exception: if the word is 0, it is probably 0word
# occasionally Stanza screws up and makes forms like 0thing as 2 tokens:
# 0 and thing
# 0 and thing
if word.text.strip() == "0":
mor.append("$ZERO$")
num_skipped+=1 # mark skipped if skipped
Expand All @@ -381,7 +381,7 @@ def parse_sentence(sentence, delimiter=".", special_forms=[], lang="$nospecial$"
mor_word = "cm|begin"
elif word.text.strip() == '„':
mor_word = "cm|end"


# special forms: recall the special form marker is xbxxx
if "xbxxx" in word.text.strip():
Expand Down Expand Up @@ -499,10 +499,10 @@ def parse_sentence(sentence, delimiter=".", special_forms=[], lang="$nospecial$"
return (mor_str, gra_str)

def clean_sentence(sent):
"""clean a sentence
"""clean a sentence
Arguments:
sent (string):
sent (string):
"""

remove = ["+,", "++", "+\""]
Expand Down Expand Up @@ -632,7 +632,7 @@ def morphoanalyze(doc: Document, status_hook:callable = None):
# some languages don't have alpha 2
pass


# pycountry.languages.get(alpha_3=i).alpha_2 for i in lang

config = {"processors": {"tokenize": "default",
Expand All @@ -647,7 +647,7 @@ def morphoanalyze(doc: Document, status_hook:callable = None):
if "zh" in lang:
lang.pop(lang.index("zh"))
lang.append("zh-hans")

elif "hr" not in lang and "zh" not in lang and "zh-hans" not in lang and "ja" not in lang and "ko" not in lang:
if "en" in lang:
config["processors"]["mwt"] = "gum"
Expand Down Expand Up @@ -757,7 +757,7 @@ def morphoanalyze(doc: Document, status_hook:callable = None):
mor, gra = parse_sentence(sents[0], ending, special_forms_cleaned, lang[0])
# breakpoint()

if mor.strip() == "":
if mor.strip() == "" or mor.strip() in ENDING_PUNCT:
L.debug(f"Encountered an utterance that's likely devoid of morphological information; skipping... utterance='{doc.content[indx]}'")
continue

Expand Down
6 changes: 3 additions & 3 deletions batchalign/version
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
0.4.0-post.1
Jan 23st, 2024
Croatian models
0.4.0-post.2
Jan 25th, 2024
Muting warning about Catalan

0 comments on commit f9e11b2

Please sign in to comment.