-
Notifications
You must be signed in to change notification settings - Fork 3
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
[WIP] OCC-Cont [Working Tokenizer and Ref Excluder]
- Loading branch information
1 parent
7d37a36
commit 3cef707
Showing
5 changed files
with
166 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -4,5 +4,6 @@ | |
"fr", | ||
"freem", | ||
"grc", | ||
"dum" | ||
"dum", | ||
"occ_cont" | ||
] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,25 @@ | ||
from ...utils import Metadata, File, get_path | ||
|
||
|
||
# Descriptive metadata for the contemporary Occitan ("occ_cont") model.
DESC = Metadata(
    "OccitanContemporain",
    "occ_cont",
    ["Oriane Nédey", "Juliette Janès"],
    "Model trained on ...",
    "https://github.com/DEFI-COLaF/modeles-papie"
)

# Release tag used to build the download URLs below.
VERSION = "0.0.1"

# One entry per archive to fetch. The second argument is presumably the local
# file name (verify against `File`); each archive needs its own name, and the
# names must match the ones looked up through `get_path` in `Models`.
DOWNLOADS = [
    File("https://github.com/DEFI-COLaF/modeles-papie/releases/download/" + VERSION +
         "/occ-cont-lemma.tar",
         "lemma.tar"),
    # Stored as "pos.tar": reusing "lemma.tar" here would clash with the lemma
    # archive and leave the "pos.tar" path referenced in `Models` missing.
    File("https://github.com/DEFI-COLaF/modeles-papie/releases/download/" + VERSION +
         "/occ-cont-POS.tar",
         "pos.tar"),
]

# Model specification string: one "<path,task>" entry per task, concatenated.
Models = "".join([
    "<{},lemma>".format(get_path("occ_cont", "lemma.tar")),
    "<{},pos>".format(get_path("occ_cont", "pos.tar"))
])
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,23 @@ | ||
from pie_extended.pipeline.iterators.proto import DataIterator, GenericExcludePatterns | ||
from pie_extended.pipeline.postprocessor.memory import MemoryzingProcessor | ||
from pie_extended.models.occ_cont.tokenizer import OccMemorizingTokenizer | ||
from pie_extended.pipeline.postprocessor.proto import ProcessorPrototype | ||
|
||
|
||
def get_iterator_and_processor(max_tokens=256):
    """Build the data iterator and post-processor for the occ_cont model.

    A single ``OccMemorizingTokenizer`` instance is shared by both objects:
    it is handed to the ``MemoryzingProcessor`` as its tokenizer memory and
    to the ``DataIterator`` as its tokenizer.

    :param max_tokens: forwarded unchanged to ``DataIterator``.
    :returns: ``(iterator, processor)`` tuple.
    """
    occ_tokenizer = OccMemorizingTokenizer()
    memory_processor = MemoryzingProcessor(
        tokenizer_memory=occ_tokenizer,
        head_processor=ProcessorPrototype()
    )

    # Keep only the normalizers that actually expose an exclusion pattern.
    patterns = []
    for normalizer in occ_tokenizer.normalizers:
        if normalizer.exclude_regexp:
            patterns.append(normalizer.exclude_regexp)

    data_iterator = DataIterator(
        tokenizer=occ_tokenizer,
        max_tokens=max_tokens,
        exclude_patterns=patterns
    )
    return data_iterator, memory_processor
|
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,116 @@ | ||
import regex as re | ||
from pie_extended.pipeline.tokenizers.memorizing import MemorizingTokenizer | ||
from typing import List, Generator, Tuple | ||
import unicodedata | ||
from pie_extended.pipeline.tokenizers.utils.excluder import ( | ||
ReferenceExcluder, | ||
DEFAULT_CHAR_REGISTRY | ||
) | ||
|
||
# Characters spliced into a regex character class by
# OccMemorizingTokenizer._sentence_boundaries (sentence-final punctuation,
# quotes and brackets; the apostrophe is deliberately absent).
_Dots_except_apostrophe = r".?!\"“”\"«»…\[\]\(\)„“"
# Matches any run of whitespace; used to collapse such runs to a single space
# before sentence splitting.
_SpaceNormalizer = re.compile(r"(\s+)")
|
||
|
||
class OccMemorizingTokenizer(MemorizingTokenizer):
    """Memorizing tokenizer with Occitan-specific sentence and word splitting.

    Sentence splitting relies on ``_sentence_boundaries``; word splitting is a
    first regex split (``re_split_step_one``) followed by an ordered cascade
    of substitutions that insert tab separators around clitics, elisions and
    punctuation. Reference markers are delegated to ``ReferenceExcluder``.
    """

    # One or more runs of boundary characters, each optionally followed by whitespace.
    _sentence_boundaries = re.compile(
        r"([" + _Dots_except_apostrophe + r"]+\s*)+"
    )
    re_add_space_around_punct = re.compile(r"(\s*)([^\w\s])(\s*)")

    # Define a pattern that matches any punctuation or symbol, with exceptions
    re_in_non_amb = re.compile(r"(?!['\-,.<>])[\p{P}\p{S}]")

    # Define a pattern that matches (XML/HTML...) tags  # ToDO check that this change is ok
    # NOTE(review): `<\\?` matches an optional *backslash* after "<" (closing
    # tags use "/"), `.*` is greedy up to the last ">" of the string, and the
    # pattern also matches the "<PPLesp>" placeholder inserted by
    # `_real_word_tokenizer` -- confirm all three are intended.
    re_tags = re.compile(r'(<\\?[^\d\s].*>)')

    # The `{2,}` quantifier braces are doubled because this is an f-string:
    # a single-brace `{2,}` is a replacement field and would interpolate the
    # tuple `(2,)`, turning the regex into `\.(2,)`.
    re_split_match = re.compile(rf"(\.{{2,}})|({re_in_non_amb.pattern})|{re_tags.pattern}")

    def __init__(self):
        """Set up the token memory, the reference excluder and derived regexes."""
        super(OccMemorizingTokenizer, self).__init__()
        self.tokens = []
        self.char_registry = DEFAULT_CHAR_REGISTRY
        self.normalizers: Tuple[ReferenceExcluder] = (
            ReferenceExcluder(char_registry=self.char_registry),
        )
        self.re_ref = re.compile(rf"{self.char_registry['[']}REF[^{self.char_registry[']']}]+{self.char_registry[']']}")
        # First-pass splitter: excluder matches are consumed by the leading
        # non-capturing alternative; the captured alternative keeps
        # unambiguous punctuation, whitespace, runs of dots and tags as
        # separate items in the `split()` output.
        # Quantifier braces doubled for the same f-string reason as in
        # `re_split_match`.
        # NOTE(review): `re_tags.pattern` brings its own capturing group nested
        # inside the outer one; `split()` returns every group, so a matched tag
        # may be yielded twice -- confirm.
        self.re_split_step_one = re.compile(
            rf"(?:{self.normalizers[0].re.pattern})|({self.re_in_non_amb.pattern}|\s|\.{{2,}}|{self.re_tags.pattern})"
        )

    @staticmethod
    def _sentence_tokenizer_merge_matches(match):
        """ Best way we found to deal with repeating groups

        Return the full matched text followed by the ``<SPLIT>`` marker.
        """
        start, end = match.span()
        return match.string[start:end] + "<SPLIT>"

    def _real_sentence_tokenizer(self, string: str) -> List[str]:
        """Split ``string`` into sentence strings.

        Whitespace runs are collapsed to a single space, a ``<SPLIT>`` marker
        is appended after every sentence-boundary match, each normalizer's
        ``after_sentence_tokenizer`` hook is applied, and the text is finally
        split on the marker.

        :param string: text to segment.
        :returns: list of sentence strings (may contain empty strings).
        """
        string = _SpaceNormalizer.sub(" ", string.strip())
        string = self._sentence_boundaries.sub(self._sentence_tokenizer_merge_matches, string)

        for normalizer in self.normalizers:
            string = normalizer.after_sentence_tokenizer(string)

        return string.split("<SPLIT>")

    def _real_word_tokenizer(self, text: str, lower: bool = False) -> List[str]:
        """
        Segments a string into a list of tokens by applying Occitan-specific regular expressions.

        :param text: string, ideally one single segment.
        :param lower: currently unused by this method.
        :returns: list of segmented tokens (empty / whitespace-only items removed)
        """
        text = text.replace("qu'", "qu' ")  # TODO Is this not already done by the regexes afterwards ?
        text = text.replace("d'", "d' ")  # TODO Is this not already done by the regexes afterwards ?
        res = []
        # Protect a whitespace between two digits with a placeholder so the
        # whitespace split below does not separate them; restored further down.
        text = re.sub(r'(\d)\s(\d)', r'\1<PPLesp>\2', text)
        for m in self.re_split_step_one.split(text):
            if not m:
                # `split()` yields None / "" for groups that did not take part.
                continue
            elif self.normalizers[0].re.match(m):
                res.append(m)
            elif self.re_split_match.match(m):
                res.append(m)
            elif not re.match(r'^\s*$', m):
                # Ordered cascade: every substitution inserts tab separators,
                # and the string is split on tabs at the end. The order matters.
                m = re.sub(r"(-[nz]-)(\P{L}*)", r"\t\1\t\2", m, flags=re.IGNORECASE)  # no space
                m = re.sub(r"(\P{L}|^)([dlmnst]\')", r"\1\t\2\t", m, flags=re.IGNORECASE)  # space before
                m = re.sub(r"(\P{L}|^)(\p{L}*[qnv][us]\')", r"\1\t\2\t", m, flags=re.IGNORECASE)  # space before
                m = re.sub(r"(\P{L}|^)(\p{L}*qu\')", r"\1\t\2\t", m, flags=re.IGNORECASE)  # space before  # TODO Duplicate of [qnv][us]' ?
                m = re.sub(r"(\P{L}|^)(\p{L}*ent\')", r"\1\t\2\t", m, flags=re.IGNORECASE)  # space before
                m = re.sub(r"(\P{L}|^)(\p{L}*[çcbzu]\')", r"\1\t\2\t", m, flags=re.IGNORECASE)  # space before  # TODO Merge with [dlmnst] ?
                m = re.sub(r"([\p{L}\d]+(\.[\p{L}\d]+)+)", r"\t\1\t", m)  # space before and after
                m = re.sub(r"\.($|\P{L})", r"\t.\1", m)
                m = re.sub(r"(\D|^),", r"\1\t,\t", m)
                m = re.sub(r",($|\D)", r"\t,\t\1", m)
                m = re.sub(r"-(vos|ne|[st][eu]?'?|l[aoi']s?|me|d'|en|[nv]os|u)($|\P{L})", r"\t-\1\t\2", m, flags=re.IGNORECASE)  # space after  # TODO Try to simplify ?
                m = re.sub(r"\'([unv]\p{L}*)($|\P{L})", r"\t'\1\t\2", m, flags=re.IGNORECASE)  # rule for 'u 'us 'n 'v 'ns 'vs...  # space after
                m = re.sub(r"\'([dlmnsti])($|\P{L})", r"\t'\1\t\2", m, flags=re.IGNORECASE)  # rule for 'm 't 'i 's 'ac ...  # space after
                m = re.sub(r"(\p{P})(\p{P})", r"\t\1\t\2\t", m)
                # Restore the digit-separating space protected above.
                m = re.sub(r"<PPLesp>", ' ', m)
                m = re.sub(r"([<>])", r"\t\1\t", m)
                res.extend(m.split('\t'))

        # Remove empty tokens
        res = [item for item in res if item.strip()]
        return res

    def normalizer(self, data: str) -> str:
        """Run every normalizer's ``before_sentence_tokenizer`` hook on ``data``.

        :param data: raw input text.
        :returns: the text after all hooks have been applied in order.
        """
        for excluder in self.normalizers:
            data = excluder.before_sentence_tokenizer(data)
        return data

    def sentence_tokenizer(self, text: str, lower: bool = False) -> Generator[List[str], None, None]:
        """Yield the result of ``self.word_tokenizer`` for each non-empty sentence.

        All sentences are tokenized before the first one is yielded.

        :param text: raw input text.
        :param lower: currently unused by this method.
        """
        sentences = list()
        data = self.normalizer(text)
        for sent in self._real_sentence_tokenizer(data):
            sent = sent.strip()
            if sent:
                sentences.append(self.word_tokenizer(sent))
        yield from sentences

    def replacer(self, inp: str) -> str:
        """Return the normalized form of a token.

        A token matched by an excluder's ``exclude_regexp`` whose
        ``can_be_replaced`` flag is truthy is returned unchanged; any other
        token is returned in Unicode NFKC form.

        :param inp: token text.
        :returns: ``inp`` itself or its NFKC normalization.
        """
        for excluder in self.normalizers:
            if excluder.exclude_regexp.match(inp):
                if excluder.can_be_replaced:
                    return inp

        return unicodedata.normalize("NFKC", inp)