diff --git a/pie_extended/models/__init__.py b/pie_extended/models/__init__.py
index 609edd5..6937cb9 100644
--- a/pie_extended/models/__init__.py
+++ b/pie_extended/models/__init__.py
@@ -4,5 +4,6 @@
     "fr",
     "freem",
     "grc",
-    "dum"
+    "dum",
+    "occ_cont"
 ]
diff --git a/pie_extended/models/occ_cont/__init__.py b/pie_extended/models/occ_cont/__init__.py
new file mode 100644
index 0000000..79a942e
--- /dev/null
+++ b/pie_extended/models/occ_cont/__init__.py
@@ -0,0 +1,29 @@
+"""Metadata, download list and model specification for the "occ_cont" models."""
+from ...utils import Metadata, File, get_path
+
+
+DESC = Metadata(
+    "OccitanContemporain",
+    "occ_cont",
+    ["Oriane Nédey", "Juliette Janès"],
+    "Model trained on ...",
+    "https://github.com/DEFI-COLaF/modeles-papie"
+)
+
+VERSION = "0.0.1"
+# Each archive needs its own local file name: `Models` below looks up
+# "lemma.tar" and "pos.tar" separately.
+DOWNLOADS = [
+    File("https://github.com/DEFI-COLaF/modeles-papie/releases/download/" + VERSION +
+         "/occ-cont-lemma.tar",
+         "lemma.tar"),
+    File("https://github.com/DEFI-COLaF/modeles-papie/releases/download/" + VERSION +
+         "/occ-cont-POS.tar",
+         "pos.tar"),
+]
+
+# Model specification string: one "<path,task>" entry per archive.
+Models = "".join([
+    "<{},lemma>".format(get_path("occ_cont", "lemma.tar")),
+    "<{},pos>".format(get_path("occ_cont", "pos.tar"))
+])
diff --git a/pie_extended/models/occ_cont/imports.py b/pie_extended/models/occ_cont/imports.py
new file mode 100644
index 0000000..7b45045
--- /dev/null
+++ b/pie_extended/models/occ_cont/imports.py
@@ -0,0 +1,28 @@
+from pie_extended.pipeline.iterators.proto import DataIterator, GenericExcludePatterns
+from pie_extended.pipeline.postprocessor.memory import MemoryzingProcessor
+from pie_extended.models.occ_cont.tokenizer import OccMemorizingTokenizer
+from pie_extended.pipeline.postprocessor.proto import ProcessorPrototype
+
+
+def get_iterator_and_processor(max_tokens=256):
+    """Build a data iterator and a post-processor that share one Occitan tokenizer.
+
+    :param max_tokens: forwarded to ``DataIterator`` as ``max_tokens``
+    :returns: tuple ``(iterator, processor)``
+    """
+    tokenizer = OccMemorizingTokenizer()
+    processor = MemoryzingProcessor(
+        tokenizer_memory=tokenizer,
+        head_processor=ProcessorPrototype()
+    )
+    iterator = DataIterator(
+        tokenizer=tokenizer,
+        max_tokens=max_tokens,
+        exclude_patterns=[
+            excl.exclude_regexp
+            for excl in tokenizer.normalizers
+            if excl.exclude_regexp
+        ]
+    )
+    return iterator, processor
+
diff --git a/pie_extended/models/occ_cont/processor.py b/pie_extended/models/occ_cont/processor.py
new file mode 100644
index 0000000..e69de29
diff --git a/pie_extended/models/occ_cont/tokenizer.py b/pie_extended/models/occ_cont/tokenizer.py
new file mode 100644
index 0000000..e474e10
--- /dev/null
+++ b/pie_extended/models/occ_cont/tokenizer.py
@@ -0,0 +1,146 @@
+import regex as re
+from pie_extended.pipeline.tokenizers.memorizing import MemorizingTokenizer
+from typing import List, Generator, Tuple
+import unicodedata
+from pie_extended.pipeline.tokenizers.utils.excluder import (
+    ReferenceExcluder,
+    DEFAULT_CHAR_REGISTRY
+)
+
+_Dots_except_apostrophe = r".?!\"“”\"«»…\[\]\(\)„“"
+_SpaceNormalizer = re.compile(r"(\s+)")
+# Marker appended after each sentence boundary, then used to cut the text.
+# It must be non-empty: str.split("") raises ValueError.
+# NOTE(review): the value "<SPLIT>" is an assumption - confirm it suits the excluders.
+_SentenceSplitMarker = "<SPLIT>"
+
+
+class OccMemorizingTokenizer(MemorizingTokenizer):
+    """ Sentence and word tokenizer built on Occitan-specific regular expressions."""
+
+    _sentence_boundaries = re.compile(
+        r"([" + _Dots_except_apostrophe + r"]+\s*)+"
+    )
+    re_add_space_around_punct = re.compile(r"(\s*)([^\w\s])(\s*)")
+
+    # Define a pattern that matches any punctuation or symbol, with exceptions
+    re_in_non_amb = re.compile(r"(?!['\-,.<>])[\p{P}\p{S}]")
+
+    # Define a pattern that matches (XML/HTML...) tags # TODO check that this change is ok
+    # The ungrouped form is kept so it can be embedded in a larger pattern
+    # without adding a second capture group.
+    _tags_pattern = r'<\\?[^\d\s].*>'
+    re_tags = re.compile("(" + _tags_pattern + ")")
+
+    # Quantifier braces are doubled because this is an f-string: a bare {2,}
+    # would be interpolated as the tuple (2,) and no longer mean "two or more dots".
+    re_split_match = re.compile(rf"(\.{{2,}})|({re_in_non_amb.pattern})|{re_tags.pattern}")
+
+    def __init__(self):
+        super(OccMemorizingTokenizer, self).__init__()
+        self.tokens = []
+        self.char_registry = DEFAULT_CHAR_REGISTRY
+        self.normalizers: Tuple[ReferenceExcluder] = (
+            ReferenceExcluder(char_registry=self.char_registry),
+        )
+        self.re_ref = re.compile(rf"{self.char_registry['[']}REF[^{self.char_registry[']']}]+{self.char_registry[']']}")
+        # Only the outer group captures here: split() returns every capture group,
+        # so a nested group around the tag pattern would emit each tag twice.
+        self.re_split_step_one = re.compile(
+            rf"(?:{self.normalizers[0].re.pattern})|({self.re_in_non_amb.pattern}|\s|\.{{2,}}|{self._tags_pattern})"
+        )
+
+    @staticmethod
+    def _sentence_tokenizer_merge_matches(match):
+        """ Return the matched sentence boundary followed by the split marker"""
+        start, end = match.span()
+        return match.string[start:end] + _SentenceSplitMarker
+
+    def _real_sentence_tokenizer(self, string: str) -> List[str]:
+        """ Cut a string into sentences after each run of boundary punctuation.
+
+        :param string: text to segment
+        :returns: list of sentence strings; items may be empty or blank
+        """
+        string = _SpaceNormalizer.sub(" ", string.strip())
+        string = self._sentence_boundaries.sub(self._sentence_tokenizer_merge_matches, string)
+
+        for normalizer in self.normalizers:
+            string = normalizer.after_sentence_tokenizer(string)
+
+        return string.split(_SentenceSplitMarker)
+
+    def _real_word_tokenizer(self, text: str, lower: bool = False) -> List[str]:
+        """
+        Segments a string into a list of tokens by applying Occitan-specific regular expressions.
+
+        :param text: string, ideally one single segment.
+        :param lower: not used by this implementation
+        :returns: list of segmented tokens
+        """
+        text = text.replace("qu'", "qu' ")  # TODO Is this not already done by the regexes afterwards ?
+        text = text.replace("d'", "d' ")  # TODO Is this not already done by the regexes afterwards ?
+        res = []
+        # Drop a single whitespace character standing between two digits
+        text = re.sub(r'(\d)\s(\d)', r'\1\2', text)
+        for m in self.re_split_step_one.split(text):
+            if not m:
+                continue
+            elif self.normalizers[0].re.match(m):
+                res.append(m)
+            elif self.re_split_match.match(m):
+                res.append(m)
+            elif not re.match(r'^\s*$', m):
+                # Tabs are inserted as token boundaries; the chunk is split on them below
+                m = re.sub(r"(-[nz]-)(\P{L}*)", r"\t\1\t\2", m, flags=re.IGNORECASE)  # no space
+                m = re.sub(r"(\P{L}|^)([dlmnst]\')", r"\1\t\2\t", m, flags=re.IGNORECASE)  # space before
+                m = re.sub(r"(\P{L}|^)(\p{L}*[qnv][us]\')", r"\1\t\2\t", m, flags=re.IGNORECASE)  # space before
+                m = re.sub(r"(\P{L}|^)(\p{L}*qu\')", r"\1\t\2\t", m, flags=re.IGNORECASE)  # space before # TODO Duplicate of [qnv][us]' ?
+                m = re.sub(r"(\P{L}|^)(\p{L}*ent\')", r"\1\t\2\t", m, flags=re.IGNORECASE)  # space before
+                m = re.sub(r"(\P{L}|^)(\p{L}*[çcbzu]\')", r"\1\t\2\t", m, flags=re.IGNORECASE)  # space before # TODO Merge with [dlmnst] ?
+                m = re.sub(r"([\p{L}\d]+(\.[\p{L}\d]+)+)", r"\t\1\t", m)  # space before and after
+                m = re.sub(r"\.($|\P{L})", r"\t.\1", m)
+                m = re.sub(r"(\D|^),", r"\1\t,\t", m)
+                m = re.sub(r",($|\D)", r"\t,\t\1", m)
+                m = re.sub(r"-(vos|ne|[st][eu]?'?|l[aoi']s?|me|d'|en|[nv]os|u)($|\P{L})", r"\t-\1\t\2", m, flags=re.IGNORECASE)  # space after # TODO Try to simplify ?
+                m = re.sub(r"\'([unv]\p{L}*)($|\P{L})", r"\t'\1\t\2", m, flags=re.IGNORECASE)  # rule for 'u 'us 'n 'v 'ns 'vs... # space after
+                m = re.sub(r"\'([dlmnsti])($|\P{L})", r"\t'\1\t\2", m, flags=re.IGNORECASE)  # rule for 'm 't 'i 's 'ac ... # space after
+                m = re.sub(r"(\p{P})(\p{P})", r"\t\1\t\2\t", m)
+                # NOTE(review): a substitution with an empty pattern used to follow; it put a
+                # space between every character of the chunk, so it was removed. If it was
+                # meant to restore a placeholder, reinstate it with its real pattern.
+                m = re.sub(r"([<>])", r"\t\1\t", m)
+                res.extend(m.split('\t'))
+
+        # Remove empty tokens
+        res = [item for item in res if item.strip()]
+        return res
+
+    def normalizer(self, data: str) -> str:
+        """ Apply every excluder's ``before_sentence_tokenizer`` hook to the text"""
+        for excluder in self.normalizers:
+            data = excluder.before_sentence_tokenizer(data)
+        return data
+
+    def sentence_tokenizer(self, text: str, lower: bool = False) -> Generator[List[str], None, None]:
+        """ Yield one list of tokens per non-blank sentence of the text
+
+        :param text: raw text to tokenize
+        :param lower: not used by this implementation
+        """
+        sentences = list()
+        data = self.normalizer(text)
+        for sent in self._real_sentence_tokenizer(data):
+            sent = sent.strip()
+            if sent:
+                sentences.append(self.word_tokenizer(sent))
+        yield from sentences
+
+    def replacer(self, inp: str):
+        """ Return excluded, replaceable inputs as they are and NFKC-normalize any other input"""
+        for excluder in self.normalizers:
+            if excluder.exclude_regexp.match(inp):
+                if excluder.can_be_replaced:
+                    return inp
+
+        return unicodedata.normalize("NFKC", inp)