
Commit

[WIP] OCC-Cont [Working Tokenizer and Ref Excluder]
PonteIneptique committed May 30, 2024
1 parent 7d37a36 commit 3cef707
Showing 5 changed files with 166 additions and 1 deletion.
3 changes: 2 additions & 1 deletion pie_extended/models/__init__.py
@@ -4,5 +4,6 @@
"fr",
"freem",
"grc",
"dum"
"dum",
"occ_cont"
]
25 changes: 25 additions & 0 deletions pie_extended/models/occ_cont/__init__.py
@@ -0,0 +1,25 @@
from ...utils import Metadata, File, get_path


DESC = Metadata(
"OccitanContemporain",
"occ_cont",
["Oriane Nédey", "Juliette Janès"],
"Model trained on ...",
"https://github.com/DEFI-COLaF/modeles-papie"
)

VERSION = "0.0.1"
DOWNLOADS = [
    File("https://github.com/DEFI-COLaF/modeles-papie/releases/download/" + VERSION +
         "/occ-cont-lemma.tar",
         "lemma.tar"),
    File("https://github.com/DEFI-COLaF/modeles-papie/releases/download/" + VERSION +
         "/occ-cont-POS.tar",
         "pos.tar"),
]

Models = "".join([
    "<{},lemma>".format(get_path("occ_cont", "lemma.tar")),
    "<{},pos>".format(get_path("occ_cont", "pos.tar"))
])
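
A quick post-download sanity check one might run (an illustrative sketch, not part of the commit; it only relies on the get_path helper already imported above and assumes both archives were fetched):

import os
from pie_extended.utils import get_path

# The local names must match those used in DOWNLOADS and Models above.
assert os.path.exists(get_path("occ_cont", "lemma.tar"))
assert os.path.exists(get_path("occ_cont", "pos.tar"))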
23 changes: 23 additions & 0 deletions pie_extended/models/occ_cont/imports.py
@@ -0,0 +1,23 @@
from pie_extended.pipeline.iterators.proto import DataIterator, GenericExcludePatterns
from pie_extended.pipeline.postprocessor.memory import MemoryzingProcessor
from pie_extended.models.occ_cont.tokenizer import OccMemorizingTokenizer
from pie_extended.pipeline.postprocessor.proto import ProcessorPrototype


def get_iterator_and_processor(max_tokens=256):
    """Build the data iterator and post-processor wired to the Occitan tokenizer."""
    tokenizer = OccMemorizingTokenizer()
    processor = MemoryzingProcessor(
        tokenizer_memory=tokenizer,
        head_processor=ProcessorPrototype()
    )
    iterator = DataIterator(
        tokenizer=tokenizer,
        max_tokens=max_tokens,
        exclude_patterns=[
            excl.exclude_regexp
            for excl in tokenizer.normalizers
            if excl.exclude_regexp
        ]
    )
    return iterator, processor
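
For context, a minimal sketch of how this factory is typically consumed (the call below is illustrative and not part of the commit):

iterator, processor = get_iterator_and_processor(max_tokens=256)
# Roughly: `iterator` yields tokenized sentences, setting aside tokens that match
# the excluders' patterns (the REF markers), while `processor` later restores the
# surface forms memorized by the tokenizer.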

Empty file.
116 changes: 116 additions & 0 deletions pie_extended/models/occ_cont/tokenizer.py
@@ -0,0 +1,116 @@
import regex as re
from pie_extended.pipeline.tokenizers.memorizing import MemorizingTokenizer
from typing import List, Generator, Tuple
import unicodedata
from pie_extended.pipeline.tokenizers.utils.excluder import (
    ReferenceExcluder,
    DEFAULT_CHAR_REGISTRY
)

_Dots_except_apostrophe = r".?!\"“”\"«»…\[\]\(\)„“"
_SpaceNormalizer = re.compile(r"(\s+)")


class OccMemorizingTokenizer(MemorizingTokenizer):
    _sentence_boundaries = re.compile(
        r"([" + _Dots_except_apostrophe + r"]+\s*)+"
    )
    re_add_space_around_punct = re.compile(r"(\s*)([^\w\s])(\s*)")

    # Define a pattern that matches any punctuation or symbol, with exceptions
    re_in_non_amb = re.compile(r"(?!['\-,.<>])[\p{P}\p{S}]")

    # Define a pattern that matches (XML/HTML...) tags  # TODO check that this change is ok
    re_tags = re.compile(r'(<\\?[^\d\s].*>)')

    # Ellipsis (two or more dots), unambiguous punctuation/symbol, or tag
    re_split_match = re.compile(rf"(\.{{2,}})|({re_in_non_amb.pattern})|{re_tags.pattern}")

    def __init__(self):
        super(OccMemorizingTokenizer, self).__init__()
        self.tokens = []
        self.char_registry = DEFAULT_CHAR_REGISTRY
        self.normalizers: Tuple[ReferenceExcluder] = (
            ReferenceExcluder(char_registry=self.char_registry),
        )
        self.re_ref = re.compile(
            rf"{self.char_registry['[']}REF[^{self.char_registry[']']}]+{self.char_registry[']']}"
        )
        self.re_split_step_one = re.compile(
            rf"(?:{self.normalizers[0].re.pattern})"
            rf"|({self.re_in_non_amb.pattern}|\s|\.{{2,}}|{self.re_tags.pattern})"
        )

    @staticmethod
    def _sentence_tokenizer_merge_matches(match):
        """Best way we found to deal with repeating groups"""
        start, end = match.span()
        return match.string[start:end] + "<SPLIT>"

    def _real_sentence_tokenizer(self, string: str) -> List[str]:
        string = _SpaceNormalizer.sub(" ", string.strip())
        string = self._sentence_boundaries.sub(self._sentence_tokenizer_merge_matches, string)

        for normalizer in self.normalizers:
            string = normalizer.after_sentence_tokenizer(string)

        return string.split("<SPLIT>")

    def _real_word_tokenizer(self, text: str, lower: bool = False) -> List[str]:
        """
        Segments a string into a list of tokens by applying Occitan-specific regular expressions.

        :param text: string, ideally one single segment.
        :returns: list of segmented tokens
        """
        text = text.replace("qu'", "qu' ")  # TODO Is this not already done by the regexes afterwards?
        text = text.replace("d'", "d' ")  # TODO Is this not already done by the regexes afterwards?
        res = []
        # Protect the space between digits (e.g. "10 000") so it is not treated as a boundary
        text = re.sub(r'(\d)\s(\d)', r'\1<PPLesp>\2', text)
        for m in self.re_split_step_one.split(text):
            if not m:
                continue
            elif self.normalizers[0].re.match(m):
                res.append(m)
            elif self.re_split_match.match(m):
                res.append(m)
            elif not re.match(r'^\s*$', m):
                m = re.sub(r"(-[nz]-)(\P{L}*)", r"\t\1\t\2", m, flags=re.IGNORECASE)  # no space
                m = re.sub(r"(\P{L}|^)([dlmnst]\')", r"\1\t\2\t", m, flags=re.IGNORECASE)  # space before
                m = re.sub(r"(\P{L}|^)(\p{L}*[qnv][us]\')", r"\1\t\2\t", m, flags=re.IGNORECASE)  # space before
                m = re.sub(r"(\P{L}|^)(\p{L}*qu\')", r"\1\t\2\t", m, flags=re.IGNORECASE)  # space before  # TODO Duplicate of [qnv][us]' ?
                m = re.sub(r"(\P{L}|^)(\p{L}*ent\')", r"\1\t\2\t", m, flags=re.IGNORECASE)  # space before
                m = re.sub(r"(\P{L}|^)(\p{L}*[çcbzu]\')", r"\1\t\2\t", m, flags=re.IGNORECASE)  # space before  # TODO Merge with [dlmnst]?
                m = re.sub(r"([\p{L}\d]+(\.[\p{L}\d]+)+)", r"\t\1\t", m)  # space before and after
                m = re.sub(r"\.($|\P{L})", r"\t.\1", m)
                m = re.sub(r"(\D|^),", r"\1\t,\t", m)
                m = re.sub(r",($|\D)", r"\t,\t\1", m)
                m = re.sub(r"-(vos|ne|[st][eu]?'?|l[aoi']s?|me|d'|en|[nv]os|u)($|\P{L})", r"\t-\1\t\2", m, flags=re.IGNORECASE)  # space after  # TODO Try to simplify?
                m = re.sub(r"\'([unv]\p{L}*)($|\P{L})", r"\t'\1\t\2", m, flags=re.IGNORECASE)  # rule for 'u 'us 'n 'v 'ns 'vs...  # space after
                m = re.sub(r"\'([dlmnsti])($|\P{L})", r"\t'\1\t\2", m, flags=re.IGNORECASE)  # rule for 'm 't 'i 's 'ac ...  # space after
                m = re.sub(r"(\p{P})(\p{P})", r"\t\1\t\2\t", m)
                m = re.sub(r"<PPLesp>", ' ', m)
                m = re.sub(r"([<>])", r"\t\1\t", m)
                res.extend(m.split('\t'))

        # Remove empty tokens
        res = [item for item in res if item.strip()]
        return res

    def normalizer(self, data: str) -> str:
        for excluder in self.normalizers:
            data = excluder.before_sentence_tokenizer(data)
        return data

    def sentence_tokenizer(self, text: str, lower: bool = False) -> Generator[List[str], None, None]:
        sentences = list()
        data = self.normalizer(text)
        for sent in self._real_sentence_tokenizer(data):
            sent = sent.strip()
            if sent:
                sentences.append(self.word_tokenizer(sent))
        yield from sentences

    def replacer(self, inp: str):
        for excluder in self.normalizers:
            if excluder.exclude_regexp.match(inp):
                if excluder.can_be_replaced:
                    return inp

        return unicodedata.normalize("NFKC", inp)
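
A rough usage sketch of the tokenizer in isolation (the sample sentence and the behaviour described are illustrative assumptions, not asserted by this commit):

tokenizer = OccMemorizingTokenizer()
for sentence in tokenizer.sentence_tokenizer("L'ostal es polit. Qu'es aquò?"):
    print(sentence)
# Expected shape: elided forms such as "L'" and "Qu'" are split from the
# following word, and sentence-final punctuation becomes its own token.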
