Skip to content

Commit

Permalink
refactor: deprecate batch_size argument of Pipeline
Browse files Browse the repository at this point in the history
  • Loading branch information
percevalw committed Nov 14, 2024
1 parent 7c58309 commit 989b70c
Show file tree
Hide file tree
Showing 2 changed files with 8 additions and 11 deletions.
1 change: 1 addition & 0 deletions changelog.md
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@
- `eds.span_context_getter`'s parameter `context_sents` is no longer optional and must be explicitly set to 0 to disable sentence context
- In multi-GPU setups, streams that contain torch components are now stripped of their parameter tensors when sent to CPU Workers since these workers only perform preprocessing and postprocessing and should therefore not need the model parameters.
- The `batch_size` argument of `Pipeline` is deprecated and no longer used. Use the `batch_size` argument of `stream.map_pipeline` instead.
### Fixed
Expand Down
18 changes: 7 additions & 11 deletions edsnlp/core/pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -101,7 +101,7 @@ def __init__(
lang: str,
create_tokenizer: Optional[Callable[[Self], Tokenizer]] = None,
vocab: Union[bool, Vocab] = True,
batch_size: Optional[int] = 128,
batch_size: Optional[int] = None,
vocab_config: Type[BaseDefaults] = None,
meta: Dict[str, Any] = None,
pipeline: Optional[Sequence[str]] = None,
Expand All @@ -119,8 +119,6 @@ def __init__(
Function that creates a tokenizer for the pipeline
vocab: Union[bool, Vocab]
Whether to create a new vocab or use an existing one
batch_size: Optional[int]
Batch size to use in the `.pipe()` method
vocab_config: Type[BaseDefaults]
Configuration for the vocab
meta: Dict[str, Any]
Expand All @@ -129,6 +127,12 @@ def __init__(
spacy_blank_cls = get_lang_class(lang)

self.Defaults = spacy_blank_cls.Defaults
if batch_size is not None:
warnings.warn(
"The 'batch_size' argument is deprecated. Use the 'batch_size' "
"argument in `stream.map_pipeline` instead.",
DeprecationWarning,
)
self.batch_size = batch_size
if (vocab is not True) and (vocab_config is not None):
raise ValueError(
Expand Down Expand Up @@ -397,7 +401,6 @@ def __call__(self, text: Union[str, Doc]) -> Doc:
def pipe(
self,
inputs: Union[Iterable, Stream],
batch_size: Optional[int] = None,
n_process: int = None,
**kwargs,
) -> Stream:
Expand All @@ -409,9 +412,6 @@ def pipe(
----------
inputs: Iterable[Union[str, Doc]]
The inputs to create the Docs from, or Docs directly.
batch_size: Optional[int]
The batch size to use. If not provided, the batch size of the pipeline
object will be used.
n_process: int
Deprecated. Use the ".set(num_cpu_workers=n_process)" method on the returned
data stream instead.
Expand All @@ -423,10 +423,6 @@ def pipe(
Stream
"""

if batch_size is None:
batch_size = self.batch_size
kwargs = {"batch_size": batch_size, **kwargs}

stream = edsnlp.data.from_iterable(inputs)
stream = stream.map_pipeline(self, **kwargs)
if n_process is not None:
Expand Down

0 comments on commit 989b70c

Please sign in to comment.