feat: added conll format
percevalw committed Dec 12, 2024
1 parent 5d651e3 commit a4c1a8a
Showing 11 changed files with 457 additions and 0 deletions.
1 change: 1 addition & 0 deletions changelog.md
@@ -9,6 +9,7 @@
- Log the training config at the beginning of the trainings
- Support a specific model output dir path for trainings (`output_model_dir`), and whether to save the model or not (`save_model`)
- Specify whether to log the validation results or not (`logger=False`)
- Added support for the CoNLL format via `edsnlp.data.read_conll` and a dedicated `eds.conll_dict2doc` converter

### Fixed

1 change: 1 addition & 0 deletions docs/assets/stylesheets/extra.css
@@ -188,4 +188,5 @@ a.discrete-link {

.sourced-heading > a {
font-size: 1rem;
align-content: center;
}
38 changes: 38 additions & 0 deletions docs/data/conll.md
@@ -0,0 +1,38 @@
# CoNLL

??? abstract "TLDR"

```{ .python .no-check }
import edsnlp

stream = edsnlp.data.read_conll(path)
stream = stream.map_pipeline(nlp)
```

You can easily integrate CoNLL-formatted files into your project using EDS-NLP's CoNLL reader.

There are many CoNLL formats corresponding to different shared tasks, but one of the most common is the CoNLL-U format, which is used for dependency parsing. In CoNLL files, each line corresponds to a token and contains various columns with information about the token, such as its index, form, lemma, POS tag, and dependency relation.

EDS-NLP lets you specify the `columns` names if they differ from the default CoNLL-U format. If the `columns` parameter is unset, the reader looks for a comment containing `# global.columns` to infer the column names. If no such comment is found, the default columns are:

```
ID, FORM, LEMMA, UPOS, XPOS, FEATS, HEAD, DEPREL, DEPS, MISC
```
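
For instance, if your files use a different or reduced set of columns, you can pass them explicitly. The snippet below is a sketch, with a hypothetical path and column subset:

```{ .python .no-check }
import edsnlp

stream = edsnlp.data.read_conll(
    "path/to/corpus",  # hypothetical path
    columns=["ID", "FORM", "UPOS", "HEAD", "DEPREL"],  # hypothetical column subset
)
```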

A typical CoNLL file looks like this:

```{ title="sample.conllu" }
1 euh euh INTJ _ _ 5 discourse _ SpaceAfter=No
2 , , PUNCT _ _ 1 punct _ _
3 il lui PRON _ Gender=Masc|Number=Sing|Person=3|PronType=Prs 5 expl:subj _ _
...
```
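
As a minimal sketch (assuming the sample above is saved as `sample.conllu`), the default converter turns each sentence into a spaCy `Doc` whose tokens carry the parsed attributes:

```{ .python .no-check }
import edsnlp

# Each sentence of the file becomes one Doc
docs = list(edsnlp.data.read_conll("sample.conllu"))
doc = docs[0]
print([(token.text, token.pos_, token.dep_, token.head.text) for token in doc])
```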

## Reading CoNLL files {: #edsnlp.data.conll.read_conll }

::: edsnlp.data.conll.read_conll
options:
heading_level: 3
show_source: false
show_toc: false
show_bases: false
1 change: 1 addition & 0 deletions edsnlp/data/__init__.py
@@ -7,6 +7,7 @@
from .base import from_iterable, to_iterable
from .standoff import read_standoff, write_standoff
from .brat import read_brat, write_brat
from .conll import read_conll
from .json import read_json, write_json
from .parquet import read_parquet, write_parquet
from .spark import from_spark, to_spark
256 changes: 256 additions & 0 deletions edsnlp/data/conll.py
@@ -0,0 +1,256 @@
import os
import random
import warnings
from pathlib import Path
from typing import Any, Callable, Dict, Iterable, List, Optional, Union

from fsspec import filesystem as fsspec
from loguru import logger
from typing_extensions import Literal

from edsnlp import registry
from edsnlp.core.stream import Stream
from edsnlp.data.base import FileBasedReader
from edsnlp.data.converters import FILENAME, get_dict2doc_converter
from edsnlp.utils.collections import shuffle
from edsnlp.utils.file_system import FileSystem, normalize_fs_path, walk_match
from edsnlp.utils.stream_sentinels import DatasetEndSentinel
from edsnlp.utils.typing import AsList

LOCAL_FS = fsspec("file")

DEFAULT_COLUMNS = [
"ID",
"FORM",
"LEMMA",
"UPOS",
"XPOS",
"FEATS",
"HEAD",
"DEPREL",
"DEPS",
"MISC",
]


def parse_conll(
path: str,
cols: Optional[List[str]] = None,
fs: FileSystem = LOCAL_FS,
) -> Iterable[Dict]:
"""
Load a .conll file and yield a dictionary of words for each sentence.
The file is expected to contain multiple sentences, one token per line, with
sentences separated by an empty line.
If possible, a `#global.columns` comment at the start of the file is used to
extract the column names.
Examples:
```text
...
11 jeune jeune ADJ _ Number=Sing 12 amod _ _
12 fille fille NOUN _ Gender=Fem|Number=Sing 5 obj _ _
13 qui qui PRON _ PronType=Rel 14 nsubj _ _
...
```
Parameters
----------
path: str
Path or glob path of the CoNLL file
cols: Optional[List[str]]
List of column names to use. If None, the reader looks for a `#global.columns` comment at the start of the file, and falls back to the default CoNLL-U columns otherwise
fs: FileSystem
Filesystem to use
Returns
-------
Iterator[Dict]
"""
with fs.open(path, "r", encoding="utf-8") as f:
lines = f.readlines()

if cols is None:
try:
cols = next(
line.split("=")[1].strip().split()
for line in lines
if line.strip("# ").startswith("global.columns")
)
except StopIteration:
cols = DEFAULT_COLUMNS
warnings.warn(
f"No #global.columns comment found in the CoNLL file. "
f"Using default {cols}"
)

doc = {"words": []}
for line in lines:
line = line.strip()
if not line:
if doc["words"]:
yield doc
doc = {"words": []}
continue
if line.startswith("#"):
continue
parts = line.split("\t")
word = {k: v for k, v in zip(cols, parts) if v != "_"}
doc["words"].append(word)

if doc["words"]:
yield doc


class ConllReader(FileBasedReader):
DATA_FIELDS = ()

def __init__(
self,
path: Union[str, Path],
*,
columns: Optional[List[str]] = None,
filesystem: Optional[FileSystem] = None,
loop: bool = False,
shuffle: Literal["dataset", False] = False,
seed: Optional[int] = None,
):
super().__init__()
self.shuffle = shuffle
self.emitted_sentinels = {"dataset"}
self.rng = random.Random(seed)
self.loop = loop
self.fs, self.path = normalize_fs_path(filesystem, path)
self.columns = columns

files = walk_match(self.fs, self.path, ".*[.]conllu?")
self.files = sorted(files)
assert len(self.files), f"No .conll files found in the directory {self.path}"
logger.info(f"The directory contains {len(self.files)} .conll files.")

def read_records(self) -> Iterable[Any]:
while True:
files = self.files
if self.shuffle:
files = shuffle(files, self.rng)
for item in files:
for anns in parse_conll(item, cols=self.columns, fs=self.fs):
anns[FILENAME] = os.path.relpath(item, self.path).rsplit(".", 1)[0]
anns["doc_id"] = anns[FILENAME]
yield anns
yield DatasetEndSentinel()
if not self.loop:
break

def __repr__(self):
return (
f"{self.__class__.__name__}("
f"path={self.path!r}, "
f"shuffle={self.shuffle}, "
f"loop={self.loop})"
)


# No writer for CoNLL format yet


@registry.readers.register("conll")
def read_conll(
path: Union[str, Path],
*,
columns: Optional[List[str]] = None,
converter: Optional[AsList[Union[str, Callable]]] = ["conll"],
filesystem: Optional[FileSystem] = None,
shuffle: Literal["dataset", False] = False,
seed: Optional[int] = None,
loop: bool = False,
**kwargs,
) -> Stream:
"""
The ConllReader (or `edsnlp.data.read_conll`) reads a file or directory of CoNLL
files and yields documents.
The raw output (i.e., by setting `converter=None`) will be in the following form
for a single doc:
```
{
"words": [
{"ID": "1", "FORM": ...},
...
],
}
```
Example
-------
```{ .python .no-check }
import edsnlp
nlp = edsnlp.blank("eds")
nlp.add_pipe(...)
doc_iterator = edsnlp.data.read_conll("path/to/conll/file/or/directory")
annotated_docs = nlp.pipe(doc_iterator)
```
!!! note "Generator vs list"
`edsnlp.data.read_conll` returns a
[Stream][edsnlp.core.stream.Stream].
To iterate over the documents multiple times efficiently or to access them by
index, you must convert it to a list:
```{ .python .no-check }
docs = list(edsnlp.data.read_conll("path/to/conll/file/or/directory"))
```
Parameters
----------
path : Union[str, Path]
Path to a CoNLL file or to a directory containing CoNLL files (will recursively
look for files in subdirectories).
columns: Optional[List[str]]
List of column names to use. If None, the reader looks for a `#global.columns`
comment at the start of the file to infer the column names.
shuffle: Literal["dataset", False]
Whether to shuffle the data. If "dataset", the whole dataset will be shuffled
before iterating over it (at the start of every epoch if looping).
seed: Optional[int]
The seed to use for shuffling.
loop: bool
Whether to loop over the data indefinitely.
nlp : Optional[PipelineProtocol]
The pipeline object (optional and likely not needed, prefer to pass the
`tokenizer` argument directly instead).
tokenizer : Optional[spacy.tokenizer.Tokenizer]
The tokenizer instance used to tokenize the documents. Likely not needed since
by default it uses the current context tokenizer:
- the tokenizer of the next pipeline run by `.map_pipeline` in a
[Stream][edsnlp.core.stream.Stream].
- or the `eds` tokenizer by default.
converter : Optional[AsList[Union[str, Callable]]]
Converter to use to convert the documents to dictionary objects.
filesystem: Optional[FileSystem]
The filesystem to use to read the files. If None, the filesystem will be
inferred from the path (e.g. `s3://` will use S3).
"""

data = Stream(
reader=ConllReader(
path,
columns=columns,
filesystem=filesystem,
loop=loop,
shuffle=shuffle,
seed=seed,
)
)
if converter:
for conv in converter:
conv, kwargs = get_dict2doc_converter(conv, kwargs)
data = data.map(conv, kwargs=kwargs)
return data
50 changes: 50 additions & 0 deletions edsnlp/data/converters.py
@@ -5,6 +5,7 @@
"""

import inspect
import warnings
from copy import copy
from types import FunctionType
from typing import (
@@ -19,6 +20,7 @@
)

import pydantic
import spacy
from confit.registry import ValidatedFunction
from spacy.tokenizer import Tokenizer
from spacy.tokens import Doc, Span
@@ -379,6 +381,54 @@ def __call__(self, doc):
return obj


@registry.factory.register("eds.conll_dict2doc", spacy_compatible=False)
class ConllDict2DocConverter:
"""
Convert a CoNLL word dict (as produced by `edsnlp.data.read_conll` with
`converter=None`) into a spaCy Doc, setting the lemma, POS tags, morphological
features and dependency relations of each token.
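
Example
-------
A minimal usage sketch (the path is illustrative), relying on the fact that
this converter is the default one for `edsnlp.data.read_conll`:

```{ .python .no-check }
import edsnlp

docs = edsnlp.data.read_conll(
    "path/to/file.conllu",
    converter="conll",  # i.e., eds.conll_dict2doc
)
```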
"""

def __init__(
self,
*,
tokenizer: Optional[Tokenizer] = None,
):
self.tokenizer = tokenizer

def __call__(self, obj, tokenizer=None):
tok = get_current_tokenizer() if self.tokenizer is None else self.tokenizer
vocab = tok.vocab
words_data = [word for word in obj["words"] if "-" not in word["ID"]]
words = [word["FORM"] for word in words_data]
spaces = ["SpaceAfter=No" not in w.get("MISC", "") for w in words_data]
doc = Doc(vocab, words=words, spaces=spaces)

id_to_word = {word["ID"]: i for i, word in enumerate(words_data)}
for word_data, word in zip(words_data, doc):
for key, value in word_data.items():
if key in ("ID", "FORM", "MISC"):
pass
elif key == "LEMMA":
word.lemma_ = value
elif key == "UPOS":
word.pos_ = value
elif key == "XPOS":
word.tag_ = value
elif key == "FEATS":
word.morph = spacy.tokens.morphanalysis.MorphAnalysis(
tok.vocab,
dict(feat.split("=") for feat in value.split("|")),
)
elif key == "HEAD":
if value != "0":
word.head = doc[id_to_word[value]]
elif key == "DEPREL":
word.dep_ = value
else:
warnings.warn(f"Unused key {key} in CoNLL dict, ignoring it.")

return doc


@registry.factory.register("eds.omop_dict2doc", spacy_compatible=False)
class OmopDict2DocConverter:
"""
2 changes: 2 additions & 0 deletions edsnlp/utils/file_system.py
@@ -24,6 +24,8 @@ def walk_match(
root: str,
file_pattern: str,
) -> list:
if fs.isfile(root):
return [root]
return [
os.path.join(dirpath, f)
for dirpath, dirnames, files in fs.walk(root)
(4 more changed files not shown)