diff --git a/changelog.md b/changelog.md
index 4e2f7f5f5..2d1998193 100644
--- a/changelog.md
+++ b/changelog.md
@@ -9,6 +9,7 @@
 - Log the training config at the beginning of the trainings
 - Support a specific model output dir path for trainings (`output_model_dir`), and whether to save the model or not (`save_model`)
 - Specify whether to log the validation results or not (`logger=False`)
+- Added support for the CoNLL format with `edsnlp.data.read_conll` and a dedicated `eds.conll_dict2doc` converter
 
 ### Fixed
 
diff --git a/docs/assets/stylesheets/extra.css b/docs/assets/stylesheets/extra.css
index 41fbec3ed..ef6c443e4 100644
--- a/docs/assets/stylesheets/extra.css
+++ b/docs/assets/stylesheets/extra.css
@@ -188,4 +188,5 @@ a.discrete-link {
 
 .sourced-heading > a {
   font-size: 1rem;
+  align-content: center;
 }
diff --git a/docs/data/conll.md b/docs/data/conll.md
new file mode 100644
index 000000000..063943c26
--- /dev/null
+++ b/docs/data/conll.md
@@ -0,0 +1,52 @@
+# CoNLL
+
+??? abstract "TLDR"
+
+    ```{ .python .no-check }
+    import edsnlp
+
+    stream = edsnlp.data.read_conll(path)
+    stream = stream.map_pipeline(nlp)
+    ```
+
+You can easily integrate CoNLL-formatted files into your project by using EDS-NLP's CoNLL reader.
+
+There are many CoNLL formats corresponding to different shared tasks, but one of the most common is the CoNLL-U format, which is used for dependency parsing. In CoNLL files, each line corresponds to a token and contains various columns with information about the token, such as its index, form, lemma, POS tag, and dependency relation.
+
+EDS-NLP lets you specify the names of the columns with the `columns` parameter if they differ from the default CoNLL-U format. If the `columns` parameter is unset, the reader looks for a comment containing `# global.columns` to infer the column names. If no such comment is found, the default CoNLL-U columns are used:
+
+```
+ID, FORM, LEMMA, UPOS, XPOS, FEATS, HEAD, DEPREL, DEPS, MISC
+```
+
+A typical CoNLL file looks like this:
+
+```{ title="sample.conllu" }
+1	euh	euh	INTJ	_	_	5	discourse	_	SpaceAfter=No
+2	,	,	PUNCT	_	_	1	punct	_	_
+3	il	lui	PRON	_	Gender=Masc|Number=Sing|Person=3|PronType=Prs	5	expl:subj	_	_
+...
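+```
+
+If your files do not follow the CoNLL-U layout, you can pass the column names explicitly. Below is a minimal sketch, assuming a hypothetical file that only contains an index, a form and a POS tag on each line:
+
+```{ .python .no-check }
+import edsnlp
+
+stream = edsnlp.data.read_conll(
+    "path/to/custom.conll",  # hypothetical path
+    columns=["ID", "FORM", "UPOS"],
+)
+docs = list(stream)  # each token gets its .pos_ attribute from the UPOS column
+```
+
+You can also pass `converter=None` to get the raw word dictionaries produced by the reader instead of `Doc` objects.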
+
+## Reading CoNLL files {: #edsnlp.data.conll.read_conll }
+
+::: edsnlp.data.conll.read_conll
+    options:
+        heading_level: 3
+        show_source: false
+        show_toc: false
+        show_bases: false
diff --git a/edsnlp/data/__init__.py b/edsnlp/data/__init__.py
index 44eb620fd..c6cbc0593 100644
--- a/edsnlp/data/__init__.py
+++ b/edsnlp/data/__init__.py
@@ -7,6 +7,7 @@
     from .base import from_iterable, to_iterable
     from .standoff import read_standoff, write_standoff
     from .brat import read_brat, write_brat
+    from .conll import read_conll
     from .json import read_json, write_json
     from .parquet import read_parquet, write_parquet
     from .spark import from_spark, to_spark
diff --git a/edsnlp/data/conll.py b/edsnlp/data/conll.py
new file mode 100644
index 000000000..3cf5b8d90
--- /dev/null
+++ b/edsnlp/data/conll.py
@@ -0,0 +1,256 @@
+import os
+import random
+import warnings
+from pathlib import Path
+from typing import Any, Callable, Dict, Iterable, List, Optional, Union
+
+from fsspec import filesystem as fsspec
+from loguru import logger
+from typing_extensions import Literal
+
+from edsnlp import registry
+from edsnlp.core.stream import Stream
+from edsnlp.data.base import FileBasedReader
+from edsnlp.data.converters import FILENAME, get_dict2doc_converter
+from edsnlp.utils.collections import shuffle
+from edsnlp.utils.file_system import FileSystem, normalize_fs_path, walk_match
+from edsnlp.utils.stream_sentinels import DatasetEndSentinel
+from edsnlp.utils.typing import AsList
+
+LOCAL_FS = fsspec("file")
+
+DEFAULT_COLUMNS = [
+    "ID",
+    "FORM",
+    "LEMMA",
+    "UPOS",
+    "XPOS",
+    "FEATS",
+    "HEAD",
+    "DEPREL",
+    "DEPS",
+    "MISC",
+]
+
+
+def parse_conll(
+    path: str,
+    cols: Optional[List[str]] = None,
+    fs: FileSystem = LOCAL_FS,
+) -> Iterable[Dict]:
+    """
+    Load a CoNLL file and yield one dictionary per sentence, containing its words.
+    The file is expected to contain one token per line, with sentences separated
+    by empty lines.
+
+    If possible, looks for a `#global.columns` comment at the start of the file to
+    extract the column names.
+
+    Examples:
+
+    ```text
+    ...
+    11	jeune	jeune	ADJ	_	Number=Sing	12	amod	_	_
+    12	fille	fille	NOUN	_	Gender=Fem|Number=Sing	5	obj	_	_
+    13	qui	qui	PRON	_	PronType=Rel	14	nsubj	_	_
+    ...
+    ```
+
+    Parameters
+    ----------
+    path: str
+        Path or glob path of the CoNLL file
+    cols: Optional[List[str]]
+        List of column names to use. If None, the `# global.columns` comment at the
+        start of the file will be used, falling back to the default CoNLL-U columns
+    fs: FileSystem
+        Filesystem to use
+
+    Returns
+    -------
+    Iterable[Dict]
+    """
+    with fs.open(path, "r", encoding="utf-8") as f:
+        lines = f.readlines()
+
+    if cols is None:
+        try:
+            cols = next(
+                line.split("=")[1].strip().split()
+                for line in lines
+                if line.strip("# ").startswith("global.columns")
+            )
+        except StopIteration:
+            cols = DEFAULT_COLUMNS
+            warnings.warn(
+                f"No #global.columns comment found in the CoNLL file. "
" + f"Using default {cols}" + ) + + doc = {"words": []} + for line in lines: + line = line.strip() + if not line: + if doc["words"]: + yield doc + doc = {"words": []} + continue + if line.startswith("#"): + continue + parts = line.split("\t") + word = {k: v for k, v in zip(cols, parts) if v != "_"} + doc["words"].append(word) + + if doc["words"]: + yield doc + + +class ConllReader(FileBasedReader): + DATA_FIELDS = () + + def __init__( + self, + path: Union[str, Path], + *, + columns: Optional[List[str]] = None, + filesystem: Optional[FileSystem] = None, + loop: bool = False, + shuffle: Literal["dataset", False] = False, + seed: Optional[int] = None, + ): + super().__init__() + self.shuffle = shuffle + self.emitted_sentinels = {"dataset"} + self.rng = random.Random(seed) + self.loop = loop + self.fs, self.path = normalize_fs_path(filesystem, path) + self.columns = columns + + files = walk_match(self.fs, self.path, ".*[.]conllu?") + self.files = sorted(files) + assert len(self.files), f"No .conll files found in the directory {self.path}" + logger.info(f"The directory contains {len(self.files)} .conll files.") + + def read_records(self) -> Iterable[Any]: + while True: + files = self.files + if self.shuffle: + files = shuffle(files, self.rng) + for item in files: + for anns in parse_conll(item, cols=self.columns, fs=self.fs): + anns[FILENAME] = os.path.relpath(item, self.path).rsplit(".", 1)[0] + anns["doc_id"] = anns[FILENAME] + yield anns + yield DatasetEndSentinel() + if not self.loop: + break + + def __repr__(self): + return ( + f"{self.__class__.__name__}(" + f"path={self.path!r}, " + f"shuffle={self.shuffle}, " + f"loop={self.loop})" + ) + + +# No writer for CoNLL format yet + + +@registry.readers.register("conll") +def read_conll( + path: Union[str, Path], + *, + columns: Optional[List[str]] = None, + converter: Optional[AsList[Union[str, Callable]]] = ["conll"], + filesystem: Optional[FileSystem] = None, + shuffle: Literal["dataset", False] = False, + seed: Optional[int] = None, + loop: bool = False, + **kwargs, +) -> Stream: + """ + The ConllReader (or `edsnlp.data.read_conll`) reads a file or directory of CoNLL + files and yields documents. + + The raw output (i.e., by setting `converter=None`) will be in the following form + for a single doc: + + ``` + { + "words": [ + {"ID": "1", "FORM": ...}, + ... + ], + } + ``` + + Example + ------- + ```{ .python .no-check } + + import edsnlp + + nlp = edsnlp.blank("eds") + nlp.add_pipe(...) + doc_iterator = edsnlp.data.read_conll("path/to/conll/file/or/directory") + annotated_docs = nlp.pipe(doc_iterator) + ``` + + !!! note "Generator vs list" + + `edsnlp.data.read_conll` returns a + [Stream][edsnlp.core.stream.Stream]. + To iterate over the documents multiple times efficiently or to access them by + index, you must convert it to a list : + + ```{ .python .no-check } + docs = list(edsnlp.data.read_conll("path/to/conll/file/or/directory")) + ``` + + Parameters + ---------- + path : Union[str, Path] + Path to the directory containing the CoNLL files (will recursively look for + files in subdirectories). + columns: Optional[List[str]] + List of column names to use. If None, will try to extract to look for a + `#global.columns` comment at the start of the file to extract the column names. + shuffle: Literal["dataset", False] + Whether to shuffle the data. If "dataset", the whole dataset will be shuffled + before starting iterating on it (at the start of every epoch if looping). + seed: Optional[int] + The seed to use for shuffling. 
+    loop: bool
+        Whether to loop over the data indefinitely.
+    nlp : Optional[PipelineProtocol]
+        The pipeline object (optional and likely not needed, prefer to use the
+        `tokenizer` argument directly instead).
+    tokenizer : Optional[spacy.tokenizer.Tokenizer]
+        The tokenizer instance used to tokenize the documents. Likely not needed since
+        by default it uses the current context tokenizer:
+
+        - the tokenizer of the next pipeline run by `.map_pipeline` in a
+          [Stream][edsnlp.core.stream.Stream].
+        - or the `eds` tokenizer by default.
+    converter : Optional[AsList[Union[str, Callable]]]
+        Converter to use to convert the documents to dictionary objects.
+    filesystem: Optional[FileSystem] = None,
+        The filesystem to use to read the files. If None, the filesystem will be
+        inferred from the path (e.g. `s3://` will use S3).
+    """
+
+    data = Stream(
+        reader=ConllReader(
+            path,
+            columns=columns,
+            filesystem=filesystem,
+            loop=loop,
+            shuffle=shuffle,
+            seed=seed,
+        )
+    )
+    if converter:
+        for conv in converter:
+            conv, kwargs = get_dict2doc_converter(conv, kwargs)
+            data = data.map(conv, kwargs=kwargs)
+    return data
diff --git a/edsnlp/data/converters.py b/edsnlp/data/converters.py
index 1bf1e6d2b..2059d3c8c 100644
--- a/edsnlp/data/converters.py
+++ b/edsnlp/data/converters.py
@@ -5,6 +5,7 @@
 """
 
 import inspect
+import warnings
 from copy import copy
 from types import FunctionType
 from typing import (
@@ -19,6 +20,7 @@
 )
 
 import pydantic
+import spacy
 from confit.registry import ValidatedFunction
 from spacy.tokenizer import Tokenizer
 from spacy.tokens import Doc, Span
@@ -379,6 +381,56 @@
         return obj
 
 
+@registry.factory.register("eds.conll_dict2doc", spacy_compatible=False)
+class ConllDict2DocConverter:
+    """
+    Convert a CoNLL-like dictionary, as produced by `edsnlp.data.read_conll`, into
+    a spaCy Doc, setting the lemma, POS tag, morphological features, head and
+    dependency relation of each token from the corresponding CoNLL columns.
+    """
+
+    def __init__(
+        self,
+        *,
+        tokenizer: Optional[Tokenizer] = None,
+    ):
+        self.tokenizer = tokenizer
+
+    def __call__(self, obj, tokenizer=None):
+        tok = get_current_tokenizer() if self.tokenizer is None else self.tokenizer
+        vocab = tok.vocab
+        words_data = [word for word in obj["words"] if "-" not in word["ID"]]
+        words = [word["FORM"] for word in words_data]
+        spaces = ["SpaceAfter=No" not in w.get("MISC", "") for w in words_data]
+        doc = Doc(vocab, words=words, spaces=spaces)
+
+        id_to_word = {word["ID"]: i for i, word in enumerate(words_data)}
+        for word_data, word in zip(words_data, doc):
+            for key, value in word_data.items():
+                if key in ("ID", "FORM", "MISC"):
+                    pass
+                elif key == "LEMMA":
+                    word.lemma_ = value
+                elif key == "UPOS":
+                    word.pos_ = value
+                elif key == "XPOS":
+                    word.tag_ = value
+                elif key == "FEATS":
+                    word.morph = spacy.tokens.morphanalysis.MorphAnalysis(
+                        tok.vocab,
+                        dict(feat.split("=") for feat in value.split("|")),
+                    )
+                elif key == "HEAD":
+                    if value != "0":
+                        word.head = doc[id_to_word[value]]
+                elif key == "DEPREL":
+                    word.dep_ = value
+                else:
+                    warnings.warn(f"Unused key {key} in CoNLL dict, ignoring it.")
+
+        return doc
+
+
 @registry.factory.register("eds.omop_dict2doc", spacy_compatible=False)
 class OmopDict2DocConverter:
     """
diff --git a/edsnlp/utils/file_system.py b/edsnlp/utils/file_system.py
index b93911e8e..29179cf2f 100644
--- a/edsnlp/utils/file_system.py
+++ b/edsnlp/utils/file_system.py
@@ -24,6 +24,8 @@ def walk_match(
     root: str,
     file_pattern: str,
 ) -> list:
+    if fs.isfile(root):
+        return [root]
     return [
         os.path.join(dirpath, f)
         for dirpath, dirnames, files in fs.walk(root)
diff --git a/mkdocs.yml b/mkdocs.yml
index 023a92c4f..786e26d55 100644
--- a/mkdocs.yml
+++ b/mkdocs.yml
@@ -132,6 +132,7 @@ nav:
 - Data Connectors:
   - data/index.md
   - data/standoff.md
+  - data/conll.md
   - data/json.md
   - data/parquet.md
   - data/pandas.md
diff --git a/pyproject.toml b/pyproject.toml
index eb8e7dfb7..43043b865 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -209,6 +209,7 @@ where = ["."]
 "eds.split" = "edsnlp.pipes.misc.split.split:Split"
 "eds.standoff_dict2doc" = "edsnlp.data.converters:StandoffDict2DocConverter"
 "eds.standoff_doc2dict" = "edsnlp.data.converters:StandoffDoc2DictConverter"
+"eds.conll_dict2doc" = "edsnlp.data.converters:ConllDict2DocConverter"
 "eds.omop_dict2doc" = "edsnlp.data.converters:OmopDict2DocConverter"
 "eds.omop_doc2dict" = "edsnlp.data.converters:OmopDoc2DictConverter"
 "eds.ents_doc2dict" = "edsnlp.data.converters:EntsDoc2DictConverter"
@@ -295,6 +296,7 @@ where = ["."]
 "parquet" = "edsnlp.data:read_parquet"
 "standoff" = "edsnlp.data:read_standoff"
 "brat" = "edsnlp.data:read_brat" # alias for standoff
+"conll" = "edsnlp.data:read_conll"
 
 [project.entry-points."edsnlp_writers"]
 "spark" = "edsnlp.data:to_spark"
diff --git a/tests/data/test_conll.py b/tests/data/test_conll.py
new file mode 100644
index 000000000..049806a18
--- /dev/null
+++ b/tests/data/test_conll.py
@@ -0,0 +1,38 @@
+from itertools import islice
+from pathlib import Path
+
+import pytest
+from typing_extensions import Literal
+
+import edsnlp
+
+
+@pytest.mark.parametrize("num_cpu_workers", [0, 2])
+@pytest.mark.parametrize("shuffle", ["dataset"])
+def test_read_shuffle_loop(
+    num_cpu_workers: int,
+    shuffle: Literal["dataset"],
+):
+    input_file = (
+        Path(__file__).parent.parent.resolve() / "training" / "rhapsodie_sample.conllu"
+    )
+    notes = edsnlp.data.read_conll(
+        input_file,
+        shuffle=shuffle,
+        seed=42,
+        loop=True,
+    ).set_processing(num_cpu_workers=num_cpu_workers)
+    notes = list(islice(notes, 6))
+    assert len(notes) == 6
+    # 32	ce	ce	PRON	_	Gender=Masc|Number=Sing|Person=3|PronType=Dem	30	obl:arg	_	_  # noqa: E501
+    word_attrs = {
+        "text": "ce",
+        "lemma_": "ce",
+        "pos_": "PRON",
+        "dep_": "obl:arg",
+        "morph": "Gender=Masc|Number=Sing|Person=3|PronType=Dem",
+        "head": "profité",
+    }
+    word = notes[0][31]
+    for attr, val in word_attrs.items():
+        assert str(getattr(word, attr)) == val
diff --git a/tests/training/rhapsodie_sample.conllu b/tests/training/rhapsodie_sample.conllu
new file mode 100644
index 000000000..76b0e42d2
--- /dev/null
+++ b/tests/training/rhapsodie_sample.conllu
@@ -0,0 +1,67 @@
+# global.columns = ID FORM LEMMA UPOS XPOS FEATS HEAD DEPREL DEPS MISC
+# macrosyntax = "euh" il y avait ( donc ) une "euh" jeune fille { qui regardait dans { une boutique | apparemment une pâtisserie } | qui semblait avoir faim | qui { a profité de ce que le livreur s'éloigne pour "euh" voler { un | une } baguette "euh" | a rencontré ( donc ) Charlot à ce moment-là | lui est rentrée dedans } } //
+# sent_id = Rhap_M0024-1
+# text = euh, il y avait donc une, euh, jeune fille qui regardait dans une boutique, apparemment une pâtisserie, qui semblait avoir faim, qui a profité de ce que le livreur s'éloigne pour, euh, voler un, une baguette, euh, a rencontré donc Charlot à ce moment-là, lui est rentrée dedans.
+1	euh	euh	INTJ	_	_	5	discourse	_	SpaceAfter=No
+2	,	,	PUNCT	_	_	1	punct	_	_
+3	il	lui	PRON	_	Gender=Masc|Number=Sing|Person=3|PronType=Prs	5	expl:subj	_	_
+4	y	y	PRON	_	Person=3|PronType=Prs	5	expl:comp	_	_
+5	avait	avoir	VERB	_	Mood=Ind|Number=Sing|Person=3|Tense=Imp|VerbForm=Fin	0	root	_	_
+6	donc	donc	ADV	_	_	5	discourse	_	_
+7	une	un	DET	_	Definite=Ind|Gender=Fem|Number=Sing|PronType=Art	12	det	_	SpaceAfter=No
+8	,	,	PUNCT	_	_	9	punct	_	_
+9	euh	euh	INTJ	_	_	7	discourse	_	SpaceAfter=No
+10	,	,	PUNCT	_	_	7	punct	_	_
+11	jeune	jeune	ADJ	_	Number=Sing	12	amod	_	_
+12	fille	fille	NOUN	_	Gender=Fem|Number=Sing	5	obj	_	_
+13	qui	qui	PRON	_	PronType=Rel	14	nsubj	_	_
+14	regardait	regarder	VERB	_	Mood=Ind|Number=Sing|Person=3|Tense=Imp|VerbForm=Fin	12	acl:relcl	_	_
+15	dans	dans	ADP	_	_	17	case	_	_
+16	une	un	DET	_	Definite=Ind|Gender=Fem|Number=Sing|PronType=Art	17	det	_	_
+17	boutique	boutique	NOUN	_	Gender=Fem|Number=Sing	14	obl:arg	_	SpaceAfter=No
+18	,	,	PUNCT	_	_	21	punct	_	_
+19	apparemment	apparemment	ADV	_	_	21	advmod	_	_
+20	une	un	DET	_	Definite=Ind|Gender=Fem|Number=Sing|PronType=Art	21	det	_	_
+21	pâtisserie	pâtisserie	NOUN	_	Gender=Fem|Number=Sing	17	appos	_	SpaceAfter=No
+22	,	,	PUNCT	_	_	24	punct	_	_
+23	qui	qui	PRON	_	PronType=Rel	24	nsubj	_	_
+24	semblait	sembler	VERB	_	Mood=Ind|Number=Sing|Person=3|Tense=Imp|VerbForm=Fin	14	conj	_	_
+25	avoir	avoir	VERB	_	VerbForm=Inf	24	xcomp	_	Subject=SubjRaising
+26	faim	faim	NOUN	_	Gender=Fem|Number=Sing	25	obj	_	SpaceAfter=No
+27	,	,	PUNCT	_	_	30	punct	_	_
+28	qui	qui	PRON	_	PronType=Rel	30	nsubj	_	_
+29	a	avoir	AUX	_	Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin	30	aux:tense	_	_
+30	profité	profiter	VERB	_	Gender=Masc|Number=Sing|Tense=Past|VerbForm=Part	14	conj	_	_
+31	de	de	ADP	_	_	32	case	_	_
+32	ce	ce	PRON	_	Gender=Masc|Number=Sing|Person=3|PronType=Dem	30	obl:arg	_	_
+33	que	que	SCONJ	_	_	37	mark	_	_
+34	le	le	DET	_	Definite=Def|Gender=Masc|Number=Sing|PronType=Art	35	det	_	_
+35	livreur	livreur	NOUN	_	Gender=Masc|Number=Sing	37	nsubj	_	_
+36	s'	soi	PRON	_	Person=3|PronType=Prs	37	obj	_	SpaceAfter=No
+37	éloigne	éloigner	VERB	_	Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin	32	acl	_	_
+38	pour	pour	ADP	_	_	42	mark	_	SpaceAfter=No
+39	,	,	PUNCT	_	_	42	punct	_	_
+40	euh	euh	INTJ	_	_	42	discourse	_	SpaceAfter=No
+41	,	,	PUNCT	_	_	40	punct	_	_
+42	voler	voler	VERB	_	VerbForm=Inf	30	advcl	_	Subject=NoRaising
+43	un	un	DET	_	Definite=Ind|Gender=Masc|Number=Sing|PronType=Art	45	reparandum	_	SpaceAfter=No
+44	,	,	PUNCT	_	_	43	punct	_	_
+45	une	un	DET	_	Definite=Ind|Gender=Fem|Number=Sing|PronType=Art	46	det	_	_
+46	baguette	baguette	NOUN	_	Gender=Fem|Number=Sing	42	obj	_	SpaceAfter=No
+47	,	,	PUNCT	_	_	48	punct	_	_
+48	euh	euh	INTJ	_	_	46	discourse	_	SpaceAfter=No
+49	,	,	PUNCT	_	_	51	punct	_	_
+50	a	avoir	AUX	_	Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin	51	aux:tense	_	_
+51	rencontré	rencontrer	VERB	_	Gender=Masc|Number=Sing|Tense=Past|VerbForm=Part	30	conj	_	_
+52	donc	donc	ADV	_	_	53	discourse	_	_
+53	Charlot	Charlot	PROPN	_	_	51	obj	_	_
+54	à	à	ADP	_	_	56	case	_	_
+55	ce	ce	DET	_	Gender=Masc|Number=Sing|PronType=Dem	56	det	_	_
+56	moment	moment	NOUN	_	Gender=Masc|Number=Sing	51	obl:mod	_	SpaceAfter=No
+57	-là	là	ADV	_	_	56	advmod	_	SpaceAfter=No
+58	,	,	PUNCT	_	_	61	punct	_	_
+59	lui	lui	PRON	_	Gender=Masc|Number=Sing|Person=3|PronType=Prs	61	iobj	_	_
+60	est	être	AUX	_	Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin	61	aux:tense	_	_
+61	rentrée	rentrer	VERB	_	Gender=Fem|Number=Sing|Tense=Past|VerbForm=Part	51	conj	_	_
+62	dedans	dedans	ADV	_	_	61	obj	_	SpaceAfter=No
+63	.	.	PUNCT	_	_	5	punct	_	_