Skip to content

Commit

Permalink
fix: update spacy patch & remove ineffective and buggy registry override
Browse files Browse the repository at this point in the history
  • Loading branch information
percevalw committed Dec 13, 2023
1 parent d14d4d0 commit cbca60c
Showing 1 changed file with 12 additions and 10 deletions.
22 changes: 12 additions & 10 deletions edsnlp/patch_spacy.py
Original file line number Diff line number Diff line change
Expand Up @@ -114,6 +114,7 @@ def __init__(
max_length: int = 10**6,
meta: Dict[str, Any] = {},
create_tokenizer: Optional[Callable[["Language"], Callable[[str], Doc]]] = None,
create_vectors: Optional[Callable[["Vocab"], Any]] = None,
batch_size: int = 1000,
**kwargs,
) -> None:
Expand Down Expand Up @@ -142,8 +143,7 @@ def __init__(

# EDS-NLP: disable spaCy's default call that eagerly loads every factory,
# since some of them may have missing dependencies (like torch)
# util.registry._entry_point_factories.get_all()
util.registry.factories = util.registry._entry_point_factories
util.registry._entry_point_factories.get_all()

self._config = DEFAULT_CONFIG.merge(self.default_config)
self._meta = dict(meta)
Expand All @@ -158,21 +158,23 @@ def __init__(
if vocab is True:
vectors_name = meta.get("vectors", {}).get("name")
vocab = create_vocab(self.lang, self.Defaults, vectors_name=vectors_name)
if (self.lang and vocab.lang) and (self.lang != vocab.lang):
raise ValueError(Errors.E150.format(nlp=self.lang, vocab=vocab.lang))
if not create_vectors and "@vectors" in self._config["nlp"]["vectors"]:
vectors_cfg = {"vectors": self._config["nlp"]["vectors"]}
create_vectors = registry.resolve(vectors_cfg)["vectors"]

Check warning on line 163 in edsnlp/patch_spacy.py

View check run for this annotation

Codecov / codecov/patch

edsnlp/patch_spacy.py#L162-L163

Added lines #L162 - L163 were not covered by tests
vocab.vectors = create_vectors(vocab)
else:
if (self.lang and vocab.lang) and (self.lang != vocab.lang):

Check warning on line 166 in edsnlp/patch_spacy.py

View check run for this annotation

Codecov / codecov/patch

edsnlp/patch_spacy.py#L166

Added line #L166 was not covered by tests
raise ValueError(Errors.E150.format(nlp=self.lang, vocab=vocab.lang))
self.vocab: Vocab = vocab
if self.lang is None:
self.lang = self.vocab.lang
self._components: List[Tuple[str, Callable[[Doc], Doc]]] = []
self._disabled: Set[str] = set()
self.max_length = max_length
# Create the default tokenizer from the default config
create_tokenizer = (
create_tokenizer
or registry.resolve({"tokenizer": self._config["nlp"]["tokenizer"]})[
"tokenizer"
]
)
if not create_tokenizer:
tokenizer_cfg = {"tokenizer": self._config["nlp"]["tokenizer"]}
create_tokenizer = registry.resolve(tokenizer_cfg)["tokenizer"]

Check warning on line 177 in edsnlp/patch_spacy.py

View check run for this annotation

Codecov / codecov/patch

edsnlp/patch_spacy.py#L176-L177

Added lines #L176 - L177 were not covered by tests
self.tokenizer = create_tokenizer(self)
self.batch_size = batch_size
self.default_error_handler = raise_error
Expand Down

0 comments on commit cbca60c

Please sign in to comment.