From e4df766f56598e824d09910dc19a932916c1d06b Mon Sep 17 00:00:00 2001 From: Sidhant Kohli Date: Thu, 4 Jan 2024 11:42:26 -0800 Subject: [PATCH 01/78] refactor init Signed-off-by: Sidhant Kohli --- refactor/__init__.py | 36 +++++ refactor/_base.py | 87 ++++++++++++ refactor/_constants.py | 41 ++++++ refactor/mapper/__init__.py | 21 +++ refactor/mapper/_dtypes.py | 192 ++++++++++++++++++++++++++ refactor/mapper/async_server.py | 129 +++++++++++++++++ refactor/mapper/example.py | 40 ++++++ refactor/mapper/map.py | 92 ++++++++++++ refactor/mapper/proto/__init__.py | 0 refactor/mapper/proto/map.proto | 43 ++++++ refactor/mapper/proto/map_pb2.py | 38 +++++ refactor/mapper/proto/map_pb2_grpc.py | 123 +++++++++++++++++ refactor/mapper/server.py | 112 +++++++++++++++ refactor/shared/server.py | 81 +++++++++++ refactor/types.py | 7 + 15 files changed, 1042 insertions(+) create mode 100644 refactor/__init__.py create mode 100644 refactor/_base.py create mode 100644 refactor/_constants.py create mode 100644 refactor/mapper/__init__.py create mode 100644 refactor/mapper/_dtypes.py create mode 100644 refactor/mapper/async_server.py create mode 100644 refactor/mapper/example.py create mode 100644 refactor/mapper/map.py create mode 100644 refactor/mapper/proto/__init__.py create mode 100644 refactor/mapper/proto/map.proto create mode 100644 refactor/mapper/proto/map_pb2.py create mode 100644 refactor/mapper/proto/map_pb2_grpc.py create mode 100644 refactor/mapper/server.py create mode 100644 refactor/shared/server.py create mode 100644 refactor/types.py diff --git a/refactor/__init__.py b/refactor/__init__.py new file mode 100644 index 00000000..bd60d0b6 --- /dev/null +++ b/refactor/__init__.py @@ -0,0 +1,36 @@ +import logging +import os +import sys + +if os.getenv("PYTHONDEBUG"): + os.environ["PYTHONASYNCIODEBUG"] = "1" + + +class StdoutFilter(logging.Filter): + """ + Filter logs with level less than logging.ERROR so they will go to stdout instead + of default stderr + """ + + def filter(self, record: logging.LogRecord) -> bool: + return record.levelno < logging.ERROR + + +def setup_logging(name): + formatter = logging.Formatter( + fmt="%(asctime)s %(levelname)-8s %(message)s", datefmt="%Y-%m-%d %H:%M:%S" + ) + logger = logging.getLogger(name) + + stdout_handler = logging.StreamHandler(sys.stdout) + stdout_handler.setFormatter(formatter) + stdout_handler.addFilter(StdoutFilter()) + stdout_handler.setLevel(logging.INFO) + logger.addHandler(stdout_handler) + + stderr_handler = logging.StreamHandler(sys.stderr) + stderr_handler.setFormatter(formatter) + stderr_handler.setLevel(logging.ERROR) + logger.addHandler(stderr_handler) + + return logger diff --git a/refactor/_base.py b/refactor/_base.py new file mode 100644 index 00000000..4920d55c --- /dev/null +++ b/refactor/_base.py @@ -0,0 +1,87 @@ +from abc import ABCMeta + + +class NumaflowPythonUDF(metaclass=ABCMeta): + """ + Base class for all Numaflow Python SDK based UDFs. + + Args: + is_async: If True, the UDF is executed in an asynchronous manner. 
+ pl_conf: PipelineConf object + _vtx: Vertex/UDF name + """ + + __slots__ = ("is_async", "pl_conf", "_vtx") + + def __init__( + self, + is_async: bool = False, + pl_conf: Optional[PipelineConf] = None, + _vtx: Optional[str] = "numalogic-udf", + ): + self._vtx = _vtx + self.is_async = is_async + self.pl_conf = pl_conf or PipelineConf() + + def __call__( + self, keys: list[str], datum: Datum + ) -> Union[Coroutine[None, None, Messages], Messages]: + return self.aexec(keys, datum) if self.is_async else self.exec(keys, datum) + # + # # TODO: remove, and have an update config method + # def register_conf(self, config_id: str, conf: StreamConf) -> None: + # """ + # Register config with the UDF. + # + # Args: + # config_id: Config ID + # conf: StreamConf object + # """ + # self.pl_conf.stream_confs[config_id] = conf + # + # def _get_default_stream_conf(self, config_id) -> StreamConf: + # """Get the default config.""" + # try: + # return self.pl_conf.stream_confs[_DEFAULT_CONF_ID] + # except KeyError: + # err_msg = f"Config with ID {config_id} or {_DEFAULT_CONF_ID} not found!" + # raise ConfigNotFoundError(err_msg) from None + # + # def _get_default_ml_pipeline_conf(self, config_id, pipeline_id) -> MLPipelineConf: + # """Get the default pipeline config.""" + # try: + # return self.pl_conf.stream_confs[_DEFAULT_CONF_ID].ml_pipelines[_DEFAULT_CONF_ID] + # except KeyError: + # err_msg = ( + # f"Pipeline with ID {pipeline_id} or {_DEFAULT_CONF_ID} " + # f"not found for config ID {config_id}!" + # ) + # raise ConfigNotFoundError(err_msg) from None + + def exec(self, keys: list[str], datum: Datum) -> Messages: + """ + Called when the UDF is executed in a synchronous manner. + + Args: + keys: list of keys. + datum: Datum object. + + Returns + ------- + Messages instance + """ + raise NotImplementedError("exec method not implemented") + + async def aexec(self, keys: list[str], datum: Datum) -> Messages: + """ + Called when the UDF is executed in an asynchronous manner. + + Args: + keys: list of keys. + datum: Datum object. + + Returns + ------- + Messages instance + """ + raise NotImplementedError("aexec method not implemented") \ No newline at end of file diff --git a/refactor/_constants.py b/refactor/_constants.py new file mode 100644 index 00000000..399ce57a --- /dev/null +++ b/refactor/_constants.py @@ -0,0 +1,41 @@ +import logging +import multiprocessing +import os +from enum import Enum + +from refactor import setup_logging + +MAP_SOCK_PATH = "/var/run/numaflow/map.sock" +MAP_STREAM_SOCK_PATH = "/var/run/numaflow/mapstream.sock" +REDUCE_SOCK_PATH = "/var/run/numaflow/reduce.sock" +SOURCE_TRANSFORMER_SOCK_PATH = "/var/run/numaflow/sourcetransform.sock" +SINK_SOCK_PATH = "/var/run/numaflow/sink.sock" +MULTIPROC_MAP_SOCK_PORT = 55551 +MULTIPROC_MAP_SOCK_ADDR = "0.0.0.0" +SIDE_INPUT_SOCK_PATH = "/var/run/numaflow/sideinput.sock" +SOURCE_SOCK_PATH = "/var/run/numaflow/source.sock" + +# TODO: need to make sure the DATUM_KEY value is the same as +# https://github.com/numaproj/numaflow-go/blob/main/pkg/function/configs.go#L6 +WIN_START_TIME = "x-numaflow-win-start-time" +WIN_END_TIME = "x-numaflow-win-end-time" +MAX_MESSAGE_SIZE = 1024 * 1024 * 64 +# TODO: None instead of "EOF" ? 
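# Keeping the sentinel as a string (rather than None) keeps EOF checks explicit
# and serialization-safe across the gRPC stream boundary; switching to None
# would require updating every consumer that compares against STREAM_EOF.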
+STREAM_EOF = "EOF" +DELIMITER = ":" +DROP = "U+005C__DROP__" + +_PROCESS_COUNT = multiprocessing.cpu_count() +MAX_THREADS = int(os.getenv("MAX_THREADS", 0)) or (_PROCESS_COUNT * 4) + +_LOGGER = setup_logging(__name__) +if os.getenv("PYTHONDEBUG"): + _LOGGER.setLevel(logging.DEBUG) + + +class ServerType(str, Enum): + """ + Enumerate grpc server connection protocol. + """ + Sync = "sync" + Async = "async" diff --git a/refactor/mapper/__init__.py b/refactor/mapper/__init__.py new file mode 100644 index 00000000..afea1a65 --- /dev/null +++ b/refactor/mapper/__init__.py @@ -0,0 +1,21 @@ +from refactor.mapper._dtypes import ( + Message, + Messages, + Datum, + DROP, +) +from refactor.mapper.async_server import AsyncMapper +# from pynumaflow.mapper.multiproc_server import MultiProcMapper +from refactor.mapper.server import Mapper +from refactor._constants import ServerType + +__all__ = [ + "Message", + "Messages", + "Datum", + "DROP", + "Mapper", + "AsyncMapper", + "ServerType", + # "MultiProcMapper", +] diff --git a/refactor/mapper/_dtypes.py b/refactor/mapper/_dtypes.py new file mode 100644 index 00000000..a39d68f2 --- /dev/null +++ b/refactor/mapper/_dtypes.py @@ -0,0 +1,192 @@ +from collections.abc import Iterator, Sequence, Awaitable +from dataclasses import dataclass +from datetime import datetime +from typing import TypeVar, Callable, Union +from warnings import warn + +from pynumaflow._constants import DROP + +M = TypeVar("M", bound="Message") +Ms = TypeVar("Ms", bound="Messages") + + +@dataclass(init=False) +class Message: + """ + Basic datatype for data passing to the next vertex/vertices. + + Args: + value: data in bytes + keys: []string keys for vertex (optional) + tags: []string tags for conditional forwarding (optional) + """ + + __slots__ = ("_value", "_keys", "_tags") + + _value: bytes + _keys: list[str] + _tags: list[str] + + def __init__(self, value: bytes, keys: list[str] = None, tags: list[str] = None): + """ + Creates a Message object to send value to a vertex. + """ + self._keys = keys or [] + self._tags = tags or [] + self._value = value or b"" + + # returns the Message Object which will be dropped + @classmethod + def to_drop(cls: type[M]) -> M: + return cls(b"", None, [DROP]) + + @property + def value(self) -> bytes: + return self._value + + @property + def keys(self) -> list[str]: + return self._keys + + @property + def tags(self) -> list[str]: + return self._tags + + +class Messages(Sequence[M]): + """ + Class to define a list of Message objects. + + Args: + messages: list of Message objects. + """ + + __slots__ = ("_messages",) + + def __init__(self, *messages: M): + self._messages = list(messages) or [] + + def __str__(self) -> str: + return str(self._messages) + + def __repr__(self) -> str: + return str(self) + + def __len__(self) -> int: + return len(self._messages) + + def __iter__(self) -> Iterator[M]: + return iter(self._messages) + + def __getitem__(self, index: int) -> M: + if isinstance(index, slice): + raise TypeError("Slicing is not supported for Messages") + return self._messages[index] + + def append(self, message: Message) -> None: + self._messages.append(message) + + def items(self) -> list[Message]: + warn( + "Using items is deprecated and will be removed in v0.5. " + "Iterate or index the Messages object instead.", + DeprecationWarning, + stacklevel=2, + ) + return self._messages + + +@dataclass(init=False) +class Datum: + """ + Class to define the important information for the event. + Args: + keys: the keys of the event. 
+ value: the payload of the event. + event_time: the event time of the event. + watermark: the watermark of the event. + >>> # Example usage + >>> from pynumaflow.mapper import Datum + >>> from datetime import datetime, timezone + >>> payload = bytes("test_mock_message", encoding="utf-8") + >>> t1 = datetime.fromtimestamp(1662998400, timezone.utc) + >>> t2 = datetime.fromtimestamp(1662998460, timezone.utc) + >>> d = Datum( + ... keys=["test_key"], + ... value=payload, + ... event_time=t1, + ... watermark=t2, + ... ) + """ + + __slots__ = ("_keys", "_value", "_event_time", "_watermark") + + _keys: list[str] + _value: bytes + _event_time: datetime + _watermark: datetime + + def __init__( + self, + keys: list[str], + value: bytes, + event_time: datetime, + watermark: datetime, + ): + self._keys = keys or list() + self._value = value or b"" + if not isinstance(event_time, datetime): + raise TypeError(f"Wrong data type: {type(event_time)} for Datum.event_time") + self._event_time = event_time + if not isinstance(watermark, datetime): + raise TypeError(f"Wrong data type: {type(watermark)} for Datum.watermark") + self._watermark = watermark + + def keys(self) -> list[str]: + """Returns the keys of the event""" + return self._keys + + @property + def value(self) -> bytes: + """Returns the value of the event.""" + return self._value + + @property + def event_time(self) -> datetime: + """Returns the event time of the event.""" + return self._event_time + + @property + def watermark(self) -> datetime: + """Returns the watermark of the event.""" + return self._watermark + + +MapSyncCallable = Callable[[list[str], Datum], Messages] +MapAsyncCallable = Callable[[list[str], Datum], Awaitable[Messages]] + + +class MapperClass: + """ + Provides an interface to write a Mapper + which will be exposed over a Synchronous gRPC server. + + Args: + + """ + + def __call__(self, *args, **kwargs): + """ + Allow to call handler function directly if class instance is sent + """ + return self.handler(*args, **kwargs) + + def handler(self, keys: list[str], datum: Datum) -> Messages: + """ + Write a handler function which implements the MapCallable interface. + """ + raise NotImplementedError + + +MapCallable = Union[MapperClass, MapSyncCallable, MapAsyncCallable] + diff --git a/refactor/mapper/async_server.py b/refactor/mapper/async_server.py new file mode 100644 index 00000000..6a15d7e7 --- /dev/null +++ b/refactor/mapper/async_server.py @@ -0,0 +1,129 @@ +import logging +import multiprocessing +import os + + +import grpc +from google.protobuf import empty_pb2 as _empty_pb2 + +from pynumaflow import setup_logging +from pynumaflow._constants import ( + MAX_MESSAGE_SIZE, + MAP_SOCK_PATH, +) +from pynumaflow.mapper import Datum +from pynumaflow.mapper._dtypes import MapAsyncCallable +from pynumaflow.mapper.proto import map_pb2 +from pynumaflow.mapper.proto import map_pb2_grpc +from pynumaflow.types import NumaflowServicerContext +from pynumaflow.info.server import get_sdk_version, write as info_server_write +from pynumaflow.info.types import ServerInfo, Protocol, Language, SERVER_INFO_FILE_PATH + +_LOGGER = setup_logging(__name__) +if os.getenv("PYTHONDEBUG"): + _LOGGER.setLevel(logging.DEBUG) + +_PROCESS_COUNT = multiprocessing.cpu_count() +MAX_THREADS = int(os.getenv("MAX_THREADS", 0)) or (_PROCESS_COUNT * 4) + + +class AsyncMapper(map_pb2_grpc.MapServicer): + """ + Provides an interface to write an Async Mapper + which will be exposed over gRPC. 
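    The handler must be a coroutine function: MapFn awaits it for every request
    on the server's event loop, so blocking work inside it will stall all
    in-flight requests.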
    Args:
        handler: Function callable following the type signature of MapCallable
        sock_path: Path to the UNIX Domain Socket
        max_message_size: The max message size in bytes the server can receive and send
        max_threads: The max number of threads to be spawned;
                     defaults to number of processors x4

    Example invocation:
    >>> from typing import Iterator
    >>> from pynumaflow.mapper import Messages, Message, \
    ...     Datum, AsyncMapper
    >>> import aiorun
    ...
    >>> async def map_handler(keys: list[str], datum: Datum) -> Messages:
    ...     val = datum.value
    ...     _ = datum.event_time
    ...     _ = datum.watermark
    ...     messages = Messages(Message(val, keys=keys))
    ...     return messages
    ...
    >>> grpc_server = AsyncMapper(handler=map_handler)
    >>> aiorun.run(grpc_server.start())
    """

    def __init__(
        self,
        handler: MapAsyncCallable,
        sock_path=MAP_SOCK_PATH,
        max_message_size=MAX_MESSAGE_SIZE,
        max_threads=MAX_THREADS,
    ):
        self.__map_handler: MapAsyncCallable = handler
        self.sock_path = f"unix://{sock_path}"
        self._max_message_size = max_message_size
        self._max_threads = max_threads
        self.cleanup_coroutines = []
        # Collection for storing strong references to all running tasks.
        # Event loop only keeps a weak reference, which can cause it to
        # get lost during execution.
        self.background_tasks = set()

        self._server_options = [
            ("grpc.max_send_message_length", self._max_message_size),
            ("grpc.max_receive_message_length", self._max_message_size),
        ]

    async def MapFn(
        self, request: map_pb2.MapRequest, context: NumaflowServicerContext
    ) -> map_pb2.MapResponse:
        """
        Applies a function to each datum element.
        The pascal case function name comes from the proto map_pb2_grpc.py file.
        """
        # proto repeated field(keys) is of type google._upb._message.RepeatedScalarContainer
        # we need to explicitly convert it to list
        try:
            res = await self.__invoke_map(
                list(request.keys),
                Datum(
                    keys=list(request.keys),
                    value=request.value,
                    event_time=request.event_time.ToDatetime(),
                    watermark=request.watermark.ToDatetime(),
                ),
            )
        except Exception as e:
            context.set_code(grpc.StatusCode.UNKNOWN)
            context.set_details(str(e))
            return map_pb2.MapResponse(results=[])

        return map_pb2.MapResponse(results=res)

    async def __invoke_map(self, keys: list[str], req: Datum):
        """
        Invokes the user defined function.
        """
        try:
            msgs = await self.__map_handler(keys, req)
        except Exception as err:
            _LOGGER.critical("UDFError, re-raising the error", exc_info=True)
            raise err
        datums = []
        for msg in msgs:
            datums.append(map_pb2.MapResponse.Result(keys=msg.keys, value=msg.value, tags=msg.tags))

        return datums

    async def IsReady(
        self, request: _empty_pb2.Empty, context: NumaflowServicerContext
    ) -> map_pb2.ReadyResponse:
        """
        IsReady is the heartbeat endpoint for gRPC.
        The pascal case function name comes from the proto map_pb2_grpc.py file.
        """
        return map_pb2.ReadyResponse(ready=True)
diff --git a/refactor/mapper/example.py b/refactor/mapper/example.py
new file mode 100644
index 00000000..e736c9d8
--- /dev/null
+++ b/refactor/mapper/example.py
@@ -0,0 +1,40 @@
"""
Write a class which implements the MapperClass interface.
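Both handler styles shown below are accepted: the ExampleMapperClass subclass
and the plain handler_new function each satisfy the MapCallable union type
that MapServer expects.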
+""" +from refactor.mapper import ServerType +from refactor.mapper import Datum, Messages, Message +from refactor.mapper.map import MapServer +from refactor.mapper._dtypes import MapperClass + + +class ExampleMapperClass(MapperClass): + """ + Provides an interface to write a Mapper + """ + def handler(self, keys: [str], datum: Datum) -> Messages: + """ + Write a handler function which implements the MapCallable interface. + """ + val = datum.value + _ = datum.event_time + _ = datum.watermark + messages = Messages(Message(val, keys=keys)) + return messages + + +def handler_new(keys: [str], datum: Datum) -> Messages: + """ + Write a handler function which implements the MapCallable interface. + """ + val = datum.value + _ = datum.event_time + _ = datum.watermark + messages = Messages(Message(val, keys=keys)) + return messages + + +# Write a main function to create a new MapServer instance. +if __name__ == "__main__": + map_instance = ExampleMapperClass() + grpc_server = MapServer(mapper_instance=map_instance, server_type=ServerType.Async) + grpc_server.start() diff --git a/refactor/mapper/map.py b/refactor/mapper/map.py new file mode 100644 index 00000000..dae339d3 --- /dev/null +++ b/refactor/mapper/map.py @@ -0,0 +1,92 @@ +import aiorun + +from pynumaflow.info.types import Protocol +from refactor._constants import MAX_THREADS, MAX_MESSAGE_SIZE, _LOGGER, MAP_SOCK_PATH, ServerType +from refactor.mapper import Mapper, AsyncMapper +from refactor.mapper._dtypes import MapCallable +from refactor.mapper.proto import map_pb2_grpc +from refactor.shared.server import prepare_server, write_info_file, NumaflowServer + + +class MapServer(NumaflowServer): + """ + Create a new grpc Server instance. + """ + + def __init__(self, mapper_instance: MapCallable, + sock_path=MAP_SOCK_PATH, + max_message_size=MAX_MESSAGE_SIZE, + max_threads=MAX_THREADS, + server_type=ServerType.Sync): + """ + Create a new grpc Server instance. + A new servicer instance is created and attached to the server. + The server instance is returned. + + max_message_size: The max message size in bytes the server can receive and send + max_threads: The max number of threads to be spawned; + defaults to number of processors x4 + """ + self.sock_path = f"unix://{sock_path}" + self.max_threads = max_threads + self.max_message_size = max_message_size + self.server_type = server_type + self.server = self.get_server(server_type=server_type, mapper_instance=mapper_instance) + + def start(self) -> None: + """ + Starts the gRPC server on the given UNIX socket with given max threads. + """ + if self.server_type == ServerType.Sync: + self.exec() + elif self.server_type == ServerType.Async: + aiorun.run(self.aexec()) + else: + raise NotImplementedError + + def exec(self): + """ + Starts the gRPC server on the given UNIX socket with given max threads.s + """ + self.server.start() + write_info_file(Protocol.UDS) + _LOGGER.info( + "Sync GRPC Server listening on: %s with max threads: %s", self.sock_path, self.max_threads + ) + self.server.wait_for_termination() + + async def aexec(self): + """ + Starts the gRPC server on the given UNIX socket with given max threads.s + """ + await self.server.start() + write_info_file(Protocol.UDS) + _LOGGER.info( + "Async GRPC Server listening on: %s with max threads: %s", self.sock_path, self.max_threads + ) + + async def server_graceful_shutdown(): + """ + Shuts down the server with 5 seconds of grace period. 
During the + grace period, the server won't accept new connections and allow + existing RPCs to continue within the grace period. + """ + _LOGGER.info("Starting graceful shutdown...") + await self.server.stop(5) + + self.server.cleanup_coroutines.append(server_graceful_shutdown()) + await self.server.wait_for_termination() + + def get_server(self, server_type, mapper_instance: MapCallable): + if server_type == ServerType.Sync: + map_servicer = Mapper(handler=mapper_instance) + elif server_type == ServerType.Async: + map_servicer = AsyncMapper(handler=mapper_instance) + else: + raise NotImplementedError + + server = prepare_server(sock_path=self.sock_path, + max_threads=self.max_threads, + max_message_size=self.max_message_size) + map_pb2_grpc.add_MapServicer_to_server(map_servicer, server) + return server diff --git a/refactor/mapper/proto/__init__.py b/refactor/mapper/proto/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/refactor/mapper/proto/map.proto b/refactor/mapper/proto/map.proto new file mode 100644 index 00000000..a8ab49be --- /dev/null +++ b/refactor/mapper/proto/map.proto @@ -0,0 +1,43 @@ +syntax = "proto3"; + +import "google/protobuf/empty.proto"; +import "google/protobuf/timestamp.proto"; + +package map.v1; + +service Map { + // MapFn applies a function to each map request element. + rpc MapFn(MapRequest) returns (MapResponse); + + // IsReady is the heartbeat endpoint for gRPC. + rpc IsReady(google.protobuf.Empty) returns (ReadyResponse); +} + +/** + * MapRequest represents a request element. + */ +message MapRequest { + repeated string keys = 1; + bytes value = 2; + google.protobuf.Timestamp event_time = 3; + google.protobuf.Timestamp watermark = 4; +} + +/** + * MapResponse represents a response element. + */ +message MapResponse { + message Result { + repeated string keys = 1; + bytes value = 2; + repeated string tags = 3; + } + repeated Result results = 1; +} + +/** + * ReadyResponse is the health check result. + */ +message ReadyResponse { + bool ready = 1; +} \ No newline at end of file diff --git a/refactor/mapper/proto/map_pb2.py b/refactor/mapper/proto/map_pb2.py new file mode 100644 index 00000000..ddb812df --- /dev/null +++ b/refactor/mapper/proto/map_pb2.py @@ -0,0 +1,38 @@ +# -*- coding: utf-8 -*- +# Generated by the protocol buffer compiler. DO NOT EDIT! 
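# To regenerate these bindings after changing map.proto, a typical grpc_tools
# invocation (assumed tooling; not pinned by this patch) is:
#   python -m grpc_tools.protoc -I refactor/mapper/proto \
#       --python_out=refactor/mapper/proto --grpc_python_out=refactor/mapper/proto \
#       refactor/mapper/proto/map.proto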
+# source: map.proto +"""Generated protocol buffer code.""" +from google.protobuf import descriptor as _descriptor +from google.protobuf import descriptor_pool as _descriptor_pool +from google.protobuf import symbol_database as _symbol_database +from google.protobuf.internal import builder as _builder + +# @@protoc_insertion_point(imports) + +_sym_db = _symbol_database.Default() + + +from google.protobuf import empty_pb2 as google_dot_protobuf_dot_empty__pb2 +from google.protobuf import timestamp_pb2 as google_dot_protobuf_dot_timestamp__pb2 + + +DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile( + b'\n\tmap.proto\x12\x06map.v1\x1a\x1bgoogle/protobuf/empty.proto\x1a\x1fgoogle/protobuf/timestamp.proto"\x88\x01\n\nMapRequest\x12\x0c\n\x04keys\x18\x01 \x03(\t\x12\r\n\x05value\x18\x02 \x01(\x0c\x12.\n\nevent_time\x18\x03 \x01(\x0b\x32\x1a.google.protobuf.Timestamp\x12-\n\twatermark\x18\x04 \x01(\x0b\x32\x1a.google.protobuf.Timestamp"o\n\x0bMapResponse\x12+\n\x07results\x18\x01 \x03(\x0b\x32\x1a.map.v1.MapResponse.Result\x1a\x33\n\x06Result\x12\x0c\n\x04keys\x18\x01 \x03(\t\x12\r\n\x05value\x18\x02 \x01(\x0c\x12\x0c\n\x04tags\x18\x03 \x03(\t"\x1e\n\rReadyResponse\x12\r\n\x05ready\x18\x01 \x01(\x08\x32q\n\x03Map\x12\x30\n\x05MapFn\x12\x12.map.v1.MapRequest\x1a\x13.map.v1.MapResponse\x12\x38\n\x07IsReady\x12\x16.google.protobuf.Empty\x1a\x15.map.v1.ReadyResponseb\x06proto3' +) + +_globals = globals() +_builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, _globals) +_builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, "map_pb2", _globals) +if _descriptor._USE_C_DESCRIPTORS == False: + DESCRIPTOR._options = None + _globals["_MAPREQUEST"]._serialized_start = 84 + _globals["_MAPREQUEST"]._serialized_end = 220 + _globals["_MAPRESPONSE"]._serialized_start = 222 + _globals["_MAPRESPONSE"]._serialized_end = 333 + _globals["_MAPRESPONSE_RESULT"]._serialized_start = 282 + _globals["_MAPRESPONSE_RESULT"]._serialized_end = 333 + _globals["_READYRESPONSE"]._serialized_start = 335 + _globals["_READYRESPONSE"]._serialized_end = 365 + _globals["_MAP"]._serialized_start = 367 + _globals["_MAP"]._serialized_end = 480 +# @@protoc_insertion_point(module_scope) diff --git a/refactor/mapper/proto/map_pb2_grpc.py b/refactor/mapper/proto/map_pb2_grpc.py new file mode 100644 index 00000000..da8edc68 --- /dev/null +++ b/refactor/mapper/proto/map_pb2_grpc.py @@ -0,0 +1,123 @@ +# Generated by the gRPC Python protocol compiler plugin. DO NOT EDIT! +"""Client and server classes corresponding to protobuf-defined services.""" +import grpc + +from google.protobuf import empty_pb2 as google_dot_protobuf_dot_empty__pb2 +from . import map_pb2 as map__pb2 + + +class MapStub(object): + """Missing associated documentation comment in .proto file.""" + + def __init__(self, channel): + """Constructor. + + Args: + channel: A grpc.Channel. 
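                For a Numaflow UDF bound to the default UNIX domain socket
                (see MAP_SOCK_PATH), a suitable channel would be, e.g.,
                grpc.insecure_channel("unix:///var/run/numaflow/map.sock").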
+ """ + self.MapFn = channel.unary_unary( + "/map.v1.Map/MapFn", + request_serializer=map__pb2.MapRequest.SerializeToString, + response_deserializer=map__pb2.MapResponse.FromString, + ) + self.IsReady = channel.unary_unary( + "/map.v1.Map/IsReady", + request_serializer=google_dot_protobuf_dot_empty__pb2.Empty.SerializeToString, + response_deserializer=map__pb2.ReadyResponse.FromString, + ) + + +class MapServicer(object): + """Missing associated documentation comment in .proto file.""" + + def MapFn(self, request, context): + """MapFn applies a function to each map request element.""" + context.set_code(grpc.StatusCode.UNIMPLEMENTED) + context.set_details("Method not implemented!") + raise NotImplementedError("Method not implemented!") + + def IsReady(self, request, context): + """IsReady is the heartbeat endpoint for gRPC.""" + context.set_code(grpc.StatusCode.UNIMPLEMENTED) + context.set_details("Method not implemented!") + raise NotImplementedError("Method not implemented!") + + +def add_MapServicer_to_server(servicer, server): + rpc_method_handlers = { + "MapFn": grpc.unary_unary_rpc_method_handler( + servicer.MapFn, + request_deserializer=map__pb2.MapRequest.FromString, + response_serializer=map__pb2.MapResponse.SerializeToString, + ), + "IsReady": grpc.unary_unary_rpc_method_handler( + servicer.IsReady, + request_deserializer=google_dot_protobuf_dot_empty__pb2.Empty.FromString, + response_serializer=map__pb2.ReadyResponse.SerializeToString, + ), + } + generic_handler = grpc.method_handlers_generic_handler("map.v1.Map", rpc_method_handlers) + server.add_generic_rpc_handlers((generic_handler,)) + + +# This class is part of an EXPERIMENTAL API. +class Map(object): + """Missing associated documentation comment in .proto file.""" + + @staticmethod + def MapFn( + request, + target, + options=(), + channel_credentials=None, + call_credentials=None, + insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None, + ): + return grpc.experimental.unary_unary( + request, + target, + "/map.v1.Map/MapFn", + map__pb2.MapRequest.SerializeToString, + map__pb2.MapResponse.FromString, + options, + channel_credentials, + insecure, + call_credentials, + compression, + wait_for_ready, + timeout, + metadata, + ) + + @staticmethod + def IsReady( + request, + target, + options=(), + channel_credentials=None, + call_credentials=None, + insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None, + ): + return grpc.experimental.unary_unary( + request, + target, + "/map.v1.Map/IsReady", + google_dot_protobuf_dot_empty__pb2.Empty.SerializeToString, + map__pb2.ReadyResponse.FromString, + options, + channel_credentials, + insecure, + call_credentials, + compression, + wait_for_ready, + timeout, + metadata, + ) diff --git a/refactor/mapper/server.py b/refactor/mapper/server.py new file mode 100644 index 00000000..8566a9d9 --- /dev/null +++ b/refactor/mapper/server.py @@ -0,0 +1,112 @@ +import logging +import multiprocessing +import os +from concurrent.futures import ThreadPoolExecutor + +import grpc +from google.protobuf import empty_pb2 as _empty_pb2 +from pynumaflow.info.server import get_sdk_version, write as info_server_write +from pynumaflow.info.types import ServerInfo, Protocol, Language, SERVER_INFO_FILE_PATH + +from refactor import setup_logging +from refactor._constants import ( + MAX_MESSAGE_SIZE, + MAP_SOCK_PATH, +) +from refactor.mapper import Datum +from refactor.mapper._dtypes import MapCallable +from refactor.mapper.proto import map_pb2 
from refactor.mapper.proto import map_pb2_grpc
from refactor.types import NumaflowServicerContext
from refactor._constants import MAX_THREADS

_LOGGER = setup_logging(__name__)
if os.getenv("PYTHONDEBUG"):
    _LOGGER.setLevel(logging.DEBUG)


class Mapper(map_pb2_grpc.MapServicer):
    """
    Provides an interface to write a Mapper
    which will be exposed over a Synchronous gRPC server.

    Args:
        handler: Function callable following the type signature of MapCallable
        max_message_size: The max message size in bytes the server can receive and send
        max_threads: The max number of threads to be spawned;
                     defaults to number of processors x4

    Example invocation:
    >>> from typing import Iterator
    >>> from pynumaflow.mapper import Messages, Message, \
    ...     Datum, Mapper
    ...
    >>> def map_handler(keys: list[str], datum: Datum) -> Messages:
    ...     val = datum.value
    ...     _ = datum.event_time
    ...     _ = datum.watermark
    ...     messages = Messages(Message(val, keys=keys))
    ...     return messages
    ...
    >>> grpc_server = Mapper(handler=map_handler)
    >>> grpc_server.start()
    """

    def __init__(
        self,
        handler: MapCallable,
        sock_path=MAP_SOCK_PATH,
        max_message_size=MAX_MESSAGE_SIZE,
        max_threads=MAX_THREADS,
    ):
        self.__map_handler: MapCallable = handler
        self.sock_path = f"unix://{sock_path}"
        self._max_message_size = max_message_size
        self._max_threads = max_threads
        self.cleanup_coroutines = []

        self._server_options = [
            ("grpc.max_send_message_length", self._max_message_size),
            ("grpc.max_receive_message_length", self._max_message_size),
        ]

    def MapFn(
        self, request: map_pb2.MapRequest, context: NumaflowServicerContext
    ) -> map_pb2.MapResponse:
        """
        Applies a function to each datum element.
        The pascal case function name comes from the proto map_pb2_grpc.py file.
        """
        # proto repeated field(keys) is of type google._upb._message.RepeatedScalarContainer
        # we need to explicitly convert it to list
        try:
            msgs = self.__map_handler(
                list(request.keys),
                Datum(
                    keys=list(request.keys),
                    value=request.value,
                    event_time=request.event_time.ToDatetime(),
                    watermark=request.watermark.ToDatetime(),
                ),
            )
        except Exception as err:
            _LOGGER.critical("UDFError, re-raising the error", exc_info=True)
            context.set_code(grpc.StatusCode.UNKNOWN)
            context.set_details(str(err))
            return map_pb2.MapResponse(results=[])

        datums = []

        for msg in msgs:
            datums.append(map_pb2.MapResponse.Result(keys=msg.keys, value=msg.value, tags=msg.tags))

        return map_pb2.MapResponse(results=datums)

    def IsReady(
        self, request: _empty_pb2.Empty, context: NumaflowServicerContext
    ) -> map_pb2.ReadyResponse:
        """
        IsReady is the heartbeat endpoint for gRPC.
        The pascal case function name comes from the proto map_pb2_grpc.py file.
        """
        return map_pb2.ReadyResponse(ready=True)
diff --git a/refactor/shared/server.py b/refactor/shared/server.py
new file mode 100644
index 00000000..433bd498
--- /dev/null
+++ b/refactor/shared/server.py
@@ -0,0 +1,81 @@
from abc import abstractmethod
from concurrent.futures import ThreadPoolExecutor
import grpc
from refactor._constants import MAX_MESSAGE_SIZE, MAX_THREADS, ServerType, _LOGGER
from pynumaflow.info.server import get_sdk_version, write as info_server_write
from pynumaflow.info.types import ServerInfo, Protocol, Language, SERVER_INFO_FILE_PATH


class NumaflowServer:
    """
    Provides an interface to write a Numaflow Server
    which will be exposed over gRPC.
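    Concrete servers (e.g. MapServer) construct a servicer, attach it with
    prepare_server below, and implement start().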
+ + Members: + sock_path: Path to the UNIX Domain Socket + max_message_size: The max message size in bytes the server can receive and send + max_threads: The max number of threads to be spawned; + defaults to number of processors x4 + """ + + @abstractmethod + def start(self): + """ + Start the server + """ + raise NotImplementedError + + +def prepare_server(sock_path: str, + server_type: ServerType, + max_message_size=MAX_MESSAGE_SIZE, + max_threads=MAX_THREADS, + ): + """ + Create a new grpc Server instance. + A new servicer instance is created and attached to the server. + The server instance is returned. + + """ + _server_options = [ + ("grpc.max_send_message_length", max_message_size), + ("grpc.max_receive_message_length", max_message_size), + ] + server = grpc.server( + ThreadPoolExecutor(max_workers=max_threads), options=_server_options) + if server_type == ServerType.Async: + server = grpc.aio.server(options=_server_options) + server.add_insecure_port(sock_path) + return server + + +def write_info_file(protocol: Protocol) -> None: + """ + Write the server info file to the given path. + """ + serv_info = ServerInfo( + protocol=protocol, + language=Language.PYTHON, + version=get_sdk_version(), + ) + info_server_write(server_info=serv_info, info_file=SERVER_INFO_FILE_PATH) + + +async def __serve_async(self, server) -> None: + + async def server_graceful_shutdown(): + """ + Shuts down the server with 5 seconds of grace period. During the + grace period, the server won't accept new connections and allow + existing RPCs to continue within the grace period. + """ + _LOGGER.info("Starting graceful shutdown...") + await server.stop(5) + + self.cleanup_coroutines.append(server_graceful_shutdown()) + await server.wait_for_termination() + +async def start(self) -> None: + """Starts the Async gRPC mapper on the given UNIX socket.""" + server = grpc.aio.server(options=self._server_options) + await self.__serve_async(server) \ No newline at end of file diff --git a/refactor/types.py b/refactor/types.py new file mode 100644 index 00000000..e028ea54 --- /dev/null +++ b/refactor/types.py @@ -0,0 +1,7 @@ +from typing import Union, NewType + +import grpc + +NumaflowServicerContext = NewType( + "NumaflowServicerContext", Union[grpc.aio.ServicerContext, grpc.ServicerContext] +) From 382bdf24b13279307478dbac6e4c81d67ab51539 Mon Sep 17 00:00:00 2001 From: Sidhant Kohli Date: Thu, 4 Jan 2024 11:53:41 -0800 Subject: [PATCH 02/78] init Signed-off-by: Sidhant Kohli --- {refactor => pynumaflow-old}/__init__.py | 0 {refactor => pynumaflow-old}/_constants.py | 22 -------- {pynumaflow => pynumaflow-old}/exceptions.py | 0 .../info/__init__.py | 0 {pynumaflow => pynumaflow-old}/info/server.py | 0 {pynumaflow => pynumaflow-old}/info/types.py | 0 pynumaflow-old/mapper/__init__.py | 19 +++++++ .../mapper/_dtypes.py | 40 +++----------- .../mapper/async_server.py | 32 +++++++++++ .../mapper/multiproc_server.py | 0 .../mapper}/proto/__init__.py | 0 .../mapper/proto/map.proto | 0 .../mapper/proto/map_pb2.py | 0 .../mapper/proto/map_pb2_grpc.py | 0 {refactor => pynumaflow-old}/mapper/server.py | 54 +++++++++++++------ .../mapstreamer/__init__.py | 0 .../mapstreamer/_dtypes.py | 0 .../mapstreamer/async_server.py | 0 .../mapstreamer}/proto/__init__.py | 0 .../mapstreamer/proto/mapstream.proto | 0 .../mapstreamer/proto/mapstream_pb2.py | 0 .../mapstreamer/proto/mapstream_pb2_grpc.py | 0 .../reducer/__init__.py | 0 .../reducer/_dtypes.py | 0 .../reducer/async_server.py | 0 .../reducer/asynciter.py | 0 
.../reducer}/proto/__init__.py | 0 .../reducer/proto/reduce.proto | 0 .../reducer/proto/reduce_pb2.py | 0 .../reducer/proto/reduce_pb2_grpc.py | 0 .../sideinput/__init__.py | 0 .../sideinput/_dtypes.py | 0 .../sideinput}/proto/__init__.py | 0 .../sideinput/proto/sideinput.proto | 0 .../sideinput/proto/sideinput_pb2.py | 0 .../sideinput/proto/sideinput_pb2_grpc.py | 0 .../sideinput/server.py | 0 .../sinker/__init__.py | 0 .../sinker/_dtypes.py | 0 .../sinker/async_sink.py | 0 .../sinker}/proto/__init__.py | 0 .../sinker/proto/sink.proto | 0 .../sinker/proto/sink_pb2.py | 0 .../sinker/proto/sink_pb2_grpc.py | 0 .../sinker/server.py | 0 .../sourcer/__init__.py | 0 .../sourcer/_dtypes.py | 0 .../sourcer/async_server.py | 0 .../sourcer}/proto/__init__.py | 0 .../sourcer/proto/source.proto | 0 .../sourcer/proto/source_pb2.py | 0 .../sourcer/proto/source_pb2_grpc.py | 0 .../sourcer/server.py | 0 .../sourcetransformer/__init__.py | 0 .../sourcetransformer/_dtypes.py | 0 .../sourcetransformer/multiproc_server.py | 0 .../sourcetransformer}/proto/__init__.py | 0 .../sourcetransformer/proto/transform.proto | 0 .../sourcetransformer/proto/transform_pb2.py | 0 .../proto/transform_pb2_grpc.py | 0 .../sourcetransformer/server.py | 0 {refactor => pynumaflow-old}/types.py | 0 {refactor => pynumaflow}/_base.py | 0 pynumaflow/_constants.py | 22 ++++++++ pynumaflow/mapper/__init__.py | 6 ++- pynumaflow/mapper/_dtypes.py | 40 +++++++++++--- pynumaflow/mapper/async_server.py | 32 ----------- {refactor => pynumaflow}/mapper/example.py | 8 +-- {refactor => pynumaflow}/mapper/map.py | 10 ++-- pynumaflow/mapper/server.py | 40 +++----------- {refactor => pynumaflow}/shared/server.py | 2 +- refactor/mapper/__init__.py | 21 -------- 72 files changed, 174 insertions(+), 174 deletions(-) rename {refactor => pynumaflow-old}/__init__.py (100%) rename {refactor => pynumaflow-old}/_constants.py (63%) rename {pynumaflow => pynumaflow-old}/exceptions.py (100%) rename {pynumaflow => pynumaflow-old}/info/__init__.py (100%) rename {pynumaflow => pynumaflow-old}/info/server.py (100%) rename {pynumaflow => pynumaflow-old}/info/types.py (100%) create mode 100644 pynumaflow-old/mapper/__init__.py rename {refactor => pynumaflow-old}/mapper/_dtypes.py (83%) rename {refactor => pynumaflow-old}/mapper/async_server.py (77%) rename {pynumaflow => pynumaflow-old}/mapper/multiproc_server.py (100%) rename {pynumaflow/mapstreamer => pynumaflow-old/mapper}/proto/__init__.py (100%) rename {refactor => pynumaflow-old}/mapper/proto/map.proto (100%) rename {refactor => pynumaflow-old}/mapper/proto/map_pb2.py (100%) rename {refactor => pynumaflow-old}/mapper/proto/map_pb2_grpc.py (100%) rename {refactor => pynumaflow-old}/mapper/server.py (68%) rename {pynumaflow => pynumaflow-old}/mapstreamer/__init__.py (100%) rename {pynumaflow => pynumaflow-old}/mapstreamer/_dtypes.py (100%) rename {pynumaflow => pynumaflow-old}/mapstreamer/async_server.py (100%) rename {pynumaflow/reducer => pynumaflow-old/mapstreamer}/proto/__init__.py (100%) rename {pynumaflow => pynumaflow-old}/mapstreamer/proto/mapstream.proto (100%) rename {pynumaflow => pynumaflow-old}/mapstreamer/proto/mapstream_pb2.py (100%) rename {pynumaflow => pynumaflow-old}/mapstreamer/proto/mapstream_pb2_grpc.py (100%) rename {pynumaflow => pynumaflow-old}/reducer/__init__.py (100%) rename {pynumaflow => pynumaflow-old}/reducer/_dtypes.py (100%) rename {pynumaflow => pynumaflow-old}/reducer/async_server.py (100%) rename {pynumaflow => pynumaflow-old}/reducer/asynciter.py (100%) rename 
{pynumaflow/sideinput => pynumaflow-old/reducer}/proto/__init__.py (100%) rename {pynumaflow => pynumaflow-old}/reducer/proto/reduce.proto (100%) rename {pynumaflow => pynumaflow-old}/reducer/proto/reduce_pb2.py (100%) rename {pynumaflow => pynumaflow-old}/reducer/proto/reduce_pb2_grpc.py (100%) rename {pynumaflow => pynumaflow-old}/sideinput/__init__.py (100%) rename {pynumaflow => pynumaflow-old}/sideinput/_dtypes.py (100%) rename {pynumaflow/sinker => pynumaflow-old/sideinput}/proto/__init__.py (100%) rename {pynumaflow => pynumaflow-old}/sideinput/proto/sideinput.proto (100%) rename {pynumaflow => pynumaflow-old}/sideinput/proto/sideinput_pb2.py (100%) rename {pynumaflow => pynumaflow-old}/sideinput/proto/sideinput_pb2_grpc.py (100%) rename {pynumaflow => pynumaflow-old}/sideinput/server.py (100%) rename {pynumaflow => pynumaflow-old}/sinker/__init__.py (100%) rename {pynumaflow => pynumaflow-old}/sinker/_dtypes.py (100%) rename {pynumaflow => pynumaflow-old}/sinker/async_sink.py (100%) rename {pynumaflow/sourcer => pynumaflow-old/sinker}/proto/__init__.py (100%) rename {pynumaflow => pynumaflow-old}/sinker/proto/sink.proto (100%) rename {pynumaflow => pynumaflow-old}/sinker/proto/sink_pb2.py (100%) rename {pynumaflow => pynumaflow-old}/sinker/proto/sink_pb2_grpc.py (100%) rename {pynumaflow => pynumaflow-old}/sinker/server.py (100%) rename {pynumaflow => pynumaflow-old}/sourcer/__init__.py (100%) rename {pynumaflow => pynumaflow-old}/sourcer/_dtypes.py (100%) rename {pynumaflow => pynumaflow-old}/sourcer/async_server.py (100%) rename {pynumaflow/sourcetransformer => pynumaflow-old/sourcer}/proto/__init__.py (100%) rename {pynumaflow => pynumaflow-old}/sourcer/proto/source.proto (100%) rename {pynumaflow => pynumaflow-old}/sourcer/proto/source_pb2.py (100%) rename {pynumaflow => pynumaflow-old}/sourcer/proto/source_pb2_grpc.py (100%) rename {pynumaflow => pynumaflow-old}/sourcer/server.py (100%) rename {pynumaflow => pynumaflow-old}/sourcetransformer/__init__.py (100%) rename {pynumaflow => pynumaflow-old}/sourcetransformer/_dtypes.py (100%) rename {pynumaflow => pynumaflow-old}/sourcetransformer/multiproc_server.py (100%) rename {refactor/mapper => pynumaflow-old/sourcetransformer}/proto/__init__.py (100%) rename {pynumaflow => pynumaflow-old}/sourcetransformer/proto/transform.proto (100%) rename {pynumaflow => pynumaflow-old}/sourcetransformer/proto/transform_pb2.py (100%) rename {pynumaflow => pynumaflow-old}/sourcetransformer/proto/transform_pb2_grpc.py (100%) rename {pynumaflow => pynumaflow-old}/sourcetransformer/server.py (100%) rename {refactor => pynumaflow-old}/types.py (100%) rename {refactor => pynumaflow}/_base.py (100%) rename {refactor => pynumaflow}/mapper/example.py (84%) rename {refactor => pynumaflow}/mapper/map.py (90%) rename {refactor => pynumaflow}/shared/server.py (96%) delete mode 100644 refactor/mapper/__init__.py diff --git a/refactor/__init__.py b/pynumaflow-old/__init__.py similarity index 100% rename from refactor/__init__.py rename to pynumaflow-old/__init__.py diff --git a/refactor/_constants.py b/pynumaflow-old/_constants.py similarity index 63% rename from refactor/_constants.py rename to pynumaflow-old/_constants.py index 399ce57a..253d0401 100644 --- a/refactor/_constants.py +++ b/pynumaflow-old/_constants.py @@ -1,10 +1,3 @@ -import logging -import multiprocessing -import os -from enum import Enum - -from refactor import setup_logging - MAP_SOCK_PATH = "/var/run/numaflow/map.sock" MAP_STREAM_SOCK_PATH = "/var/run/numaflow/mapstream.sock" 
REDUCE_SOCK_PATH = "/var/run/numaflow/reduce.sock" @@ -24,18 +17,3 @@ STREAM_EOF = "EOF" DELIMITER = ":" DROP = "U+005C__DROP__" - -_PROCESS_COUNT = multiprocessing.cpu_count() -MAX_THREADS = int(os.getenv("MAX_THREADS", 0)) or (_PROCESS_COUNT * 4) - -_LOGGER = setup_logging(__name__) -if os.getenv("PYTHONDEBUG"): - _LOGGER.setLevel(logging.DEBUG) - - -class ServerType(str, Enum): - """ - Enumerate grpc server connection protocol. - """ - Sync = "sync" - Async = "async" diff --git a/pynumaflow/exceptions.py b/pynumaflow-old/exceptions.py similarity index 100% rename from pynumaflow/exceptions.py rename to pynumaflow-old/exceptions.py diff --git a/pynumaflow/info/__init__.py b/pynumaflow-old/info/__init__.py similarity index 100% rename from pynumaflow/info/__init__.py rename to pynumaflow-old/info/__init__.py diff --git a/pynumaflow/info/server.py b/pynumaflow-old/info/server.py similarity index 100% rename from pynumaflow/info/server.py rename to pynumaflow-old/info/server.py diff --git a/pynumaflow/info/types.py b/pynumaflow-old/info/types.py similarity index 100% rename from pynumaflow/info/types.py rename to pynumaflow-old/info/types.py diff --git a/pynumaflow-old/mapper/__init__.py b/pynumaflow-old/mapper/__init__.py new file mode 100644 index 00000000..374b123f --- /dev/null +++ b/pynumaflow-old/mapper/__init__.py @@ -0,0 +1,19 @@ +from pynumaflow.mapper._dtypes import ( + Message, + Messages, + Datum, + DROP, +) +from pynumaflow.mapper.async_server import AsyncMapper +from pynumaflow.mapper.multiproc_server import MultiProcMapper +from pynumaflow.mapper.server import Mapper + +__all__ = [ + "Message", + "Messages", + "Datum", + "DROP", + "Mapper", + "AsyncMapper", + "MultiProcMapper", +] diff --git a/refactor/mapper/_dtypes.py b/pynumaflow-old/mapper/_dtypes.py similarity index 83% rename from refactor/mapper/_dtypes.py rename to pynumaflow-old/mapper/_dtypes.py index a39d68f2..92556a9d 100644 --- a/refactor/mapper/_dtypes.py +++ b/pynumaflow-old/mapper/_dtypes.py @@ -1,7 +1,7 @@ from collections.abc import Iterator, Sequence, Awaitable from dataclasses import dataclass from datetime import datetime -from typing import TypeVar, Callable, Union +from typing import TypeVar, Callable from warnings import warn from pynumaflow._constants import DROP @@ -127,11 +127,11 @@ class Datum: _watermark: datetime def __init__( - self, - keys: list[str], - value: bytes, - event_time: datetime, - watermark: datetime, + self, + keys: list[str], + value: bytes, + event_time: datetime, + watermark: datetime, ): self._keys = keys or list() self._value = value or b"" @@ -162,31 +162,5 @@ def watermark(self) -> datetime: return self._watermark -MapSyncCallable = Callable[[list[str], Datum], Messages] +MapCallable = Callable[[list[str], Datum], Messages] MapAsyncCallable = Callable[[list[str], Datum], Awaitable[Messages]] - - -class MapperClass: - """ - Provides an interface to write a Mapper - which will be exposed over a Synchronous gRPC server. - - Args: - - """ - - def __call__(self, *args, **kwargs): - """ - Allow to call handler function directly if class instance is sent - """ - return self.handler(*args, **kwargs) - - def handler(self, keys: list[str], datum: Datum) -> Messages: - """ - Write a handler function which implements the MapCallable interface. 
- """ - raise NotImplementedError - - -MapCallable = Union[MapperClass, MapSyncCallable, MapAsyncCallable] - diff --git a/refactor/mapper/async_server.py b/pynumaflow-old/mapper/async_server.py similarity index 77% rename from refactor/mapper/async_server.py rename to pynumaflow-old/mapper/async_server.py index 6a15d7e7..2c479848 100644 --- a/refactor/mapper/async_server.py +++ b/pynumaflow-old/mapper/async_server.py @@ -127,3 +127,35 @@ async def IsReady( The pascal case function name comes from the proto map_pb2_grpc.py file. """ return map_pb2.ReadyResponse(ready=True) + + async def __serve_async(self, server) -> None: + map_pb2_grpc.add_MapServicer_to_server( + AsyncMapper(handler=self.__map_handler), + server, + ) + server.add_insecure_port(self.sock_path) + _LOGGER.info("gRPC Async Map Server listening on: %s", self.sock_path) + await server.start() + serv_info = ServerInfo( + protocol=Protocol.UDS, + language=Language.PYTHON, + version=get_sdk_version(), + ) + info_server_write(server_info=serv_info, info_file=SERVER_INFO_FILE_PATH) + + async def server_graceful_shutdown(): + """ + Shuts down the server with 5 seconds of grace period. During the + grace period, the server won't accept new connections and allow + existing RPCs to continue within the grace period. + """ + _LOGGER.info("Starting graceful shutdown...") + await server.stop(5) + + self.cleanup_coroutines.append(server_graceful_shutdown()) + await server.wait_for_termination() + + async def start(self) -> None: + """Starts the Async gRPC mapper on the given UNIX socket.""" + server = grpc.aio.server(options=self._server_options) + await self.__serve_async(server) diff --git a/pynumaflow/mapper/multiproc_server.py b/pynumaflow-old/mapper/multiproc_server.py similarity index 100% rename from pynumaflow/mapper/multiproc_server.py rename to pynumaflow-old/mapper/multiproc_server.py diff --git a/pynumaflow/mapstreamer/proto/__init__.py b/pynumaflow-old/mapper/proto/__init__.py similarity index 100% rename from pynumaflow/mapstreamer/proto/__init__.py rename to pynumaflow-old/mapper/proto/__init__.py diff --git a/refactor/mapper/proto/map.proto b/pynumaflow-old/mapper/proto/map.proto similarity index 100% rename from refactor/mapper/proto/map.proto rename to pynumaflow-old/mapper/proto/map.proto diff --git a/refactor/mapper/proto/map_pb2.py b/pynumaflow-old/mapper/proto/map_pb2.py similarity index 100% rename from refactor/mapper/proto/map_pb2.py rename to pynumaflow-old/mapper/proto/map_pb2.py diff --git a/refactor/mapper/proto/map_pb2_grpc.py b/pynumaflow-old/mapper/proto/map_pb2_grpc.py similarity index 100% rename from refactor/mapper/proto/map_pb2_grpc.py rename to pynumaflow-old/mapper/proto/map_pb2_grpc.py diff --git a/refactor/mapper/server.py b/pynumaflow-old/mapper/server.py similarity index 68% rename from refactor/mapper/server.py rename to pynumaflow-old/mapper/server.py index 8566a9d9..0ae779ee 100644 --- a/refactor/mapper/server.py +++ b/pynumaflow-old/mapper/server.py @@ -8,23 +8,26 @@ from pynumaflow.info.server import get_sdk_version, write as info_server_write from pynumaflow.info.types import ServerInfo, Protocol, Language, SERVER_INFO_FILE_PATH -from refactor import setup_logging -from refactor._constants import ( +from pynumaflow import setup_logging +from pynumaflow._constants import ( MAX_MESSAGE_SIZE, MAP_SOCK_PATH, ) -from refactor.mapper import Datum -from refactor.mapper._dtypes import MapCallable -from refactor.mapper.proto import map_pb2 -from refactor.mapper.proto import map_pb2_grpc -from 
refactor.types import NumaflowServicerContext -from refactor._constants import MAX_THREADS +from pynumaflow.mapper import Datum +from pynumaflow.mapper._dtypes import MapCallable +from pynumaflow.mapper.proto import map_pb2 +from pynumaflow.mapper.proto import map_pb2_grpc +from pynumaflow.types import NumaflowServicerContext _LOGGER = setup_logging(__name__) if os.getenv("PYTHONDEBUG"): _LOGGER.setLevel(logging.DEBUG) +_PROCESS_COUNT = multiprocessing.cpu_count() +MAX_THREADS = int(os.getenv("MAX_THREADS", 0)) or (_PROCESS_COUNT * 4) + + class Mapper(map_pb2_grpc.MapServicer): """ Provides an interface to write a Mapper @@ -53,11 +56,11 @@ class Mapper(map_pb2_grpc.MapServicer): """ def __init__( - self, - handler: MapCallable, - sock_path=MAP_SOCK_PATH, - max_message_size=MAX_MESSAGE_SIZE, - max_threads=MAX_THREADS, + self, + handler: MapCallable, + sock_path=MAP_SOCK_PATH, + max_message_size=MAX_MESSAGE_SIZE, + max_threads=MAX_THREADS, ): self.__map_handler: MapCallable = handler self.sock_path = f"unix://{sock_path}" @@ -71,7 +74,7 @@ def __init__( ] def MapFn( - self, request: map_pb2.MapRequest, context: NumaflowServicerContext + self, request: map_pb2.MapRequest, context: NumaflowServicerContext ) -> map_pb2.MapResponse: """ Applies a function to each datum element. @@ -103,10 +106,31 @@ def MapFn( return map_pb2.MapResponse(results=datums) def IsReady( - self, request: _empty_pb2.Empty, context: NumaflowServicerContext + self, request: _empty_pb2.Empty, context: NumaflowServicerContext ) -> map_pb2.ReadyResponse: """ IsReady is the heartbeat endpoint for gRPC. The pascal case function name comes from the proto map_pb2_grpc.py file. """ return map_pb2.ReadyResponse(ready=True) + + def start(self) -> None: + """ + Starts the gRPC server on the given UNIX socket with given max threads. 
+ """ + server = grpc.server( + ThreadPoolExecutor(max_workers=self._max_threads), options=self._server_options + ) + map_pb2_grpc.add_MapServicer_to_server(self, server) + server.add_insecure_port(self.sock_path) + server.start() + serv_info = ServerInfo( + protocol=Protocol.UDS, + language=Language.PYTHON, + version=get_sdk_version(), + ) + info_server_write(server_info=serv_info, info_file=SERVER_INFO_FILE_PATH) + _LOGGER.info( + "GRPC Server listening on: %s with max threads: %s", self.sock_path, self._max_threads + ) + server.wait_for_termination() diff --git a/pynumaflow/mapstreamer/__init__.py b/pynumaflow-old/mapstreamer/__init__.py similarity index 100% rename from pynumaflow/mapstreamer/__init__.py rename to pynumaflow-old/mapstreamer/__init__.py diff --git a/pynumaflow/mapstreamer/_dtypes.py b/pynumaflow-old/mapstreamer/_dtypes.py similarity index 100% rename from pynumaflow/mapstreamer/_dtypes.py rename to pynumaflow-old/mapstreamer/_dtypes.py diff --git a/pynumaflow/mapstreamer/async_server.py b/pynumaflow-old/mapstreamer/async_server.py similarity index 100% rename from pynumaflow/mapstreamer/async_server.py rename to pynumaflow-old/mapstreamer/async_server.py diff --git a/pynumaflow/reducer/proto/__init__.py b/pynumaflow-old/mapstreamer/proto/__init__.py similarity index 100% rename from pynumaflow/reducer/proto/__init__.py rename to pynumaflow-old/mapstreamer/proto/__init__.py diff --git a/pynumaflow/mapstreamer/proto/mapstream.proto b/pynumaflow-old/mapstreamer/proto/mapstream.proto similarity index 100% rename from pynumaflow/mapstreamer/proto/mapstream.proto rename to pynumaflow-old/mapstreamer/proto/mapstream.proto diff --git a/pynumaflow/mapstreamer/proto/mapstream_pb2.py b/pynumaflow-old/mapstreamer/proto/mapstream_pb2.py similarity index 100% rename from pynumaflow/mapstreamer/proto/mapstream_pb2.py rename to pynumaflow-old/mapstreamer/proto/mapstream_pb2.py diff --git a/pynumaflow/mapstreamer/proto/mapstream_pb2_grpc.py b/pynumaflow-old/mapstreamer/proto/mapstream_pb2_grpc.py similarity index 100% rename from pynumaflow/mapstreamer/proto/mapstream_pb2_grpc.py rename to pynumaflow-old/mapstreamer/proto/mapstream_pb2_grpc.py diff --git a/pynumaflow/reducer/__init__.py b/pynumaflow-old/reducer/__init__.py similarity index 100% rename from pynumaflow/reducer/__init__.py rename to pynumaflow-old/reducer/__init__.py diff --git a/pynumaflow/reducer/_dtypes.py b/pynumaflow-old/reducer/_dtypes.py similarity index 100% rename from pynumaflow/reducer/_dtypes.py rename to pynumaflow-old/reducer/_dtypes.py diff --git a/pynumaflow/reducer/async_server.py b/pynumaflow-old/reducer/async_server.py similarity index 100% rename from pynumaflow/reducer/async_server.py rename to pynumaflow-old/reducer/async_server.py diff --git a/pynumaflow/reducer/asynciter.py b/pynumaflow-old/reducer/asynciter.py similarity index 100% rename from pynumaflow/reducer/asynciter.py rename to pynumaflow-old/reducer/asynciter.py diff --git a/pynumaflow/sideinput/proto/__init__.py b/pynumaflow-old/reducer/proto/__init__.py similarity index 100% rename from pynumaflow/sideinput/proto/__init__.py rename to pynumaflow-old/reducer/proto/__init__.py diff --git a/pynumaflow/reducer/proto/reduce.proto b/pynumaflow-old/reducer/proto/reduce.proto similarity index 100% rename from pynumaflow/reducer/proto/reduce.proto rename to pynumaflow-old/reducer/proto/reduce.proto diff --git a/pynumaflow/reducer/proto/reduce_pb2.py b/pynumaflow-old/reducer/proto/reduce_pb2.py similarity index 100% rename from 
pynumaflow/reducer/proto/reduce_pb2.py rename to pynumaflow-old/reducer/proto/reduce_pb2.py diff --git a/pynumaflow/reducer/proto/reduce_pb2_grpc.py b/pynumaflow-old/reducer/proto/reduce_pb2_grpc.py similarity index 100% rename from pynumaflow/reducer/proto/reduce_pb2_grpc.py rename to pynumaflow-old/reducer/proto/reduce_pb2_grpc.py diff --git a/pynumaflow/sideinput/__init__.py b/pynumaflow-old/sideinput/__init__.py similarity index 100% rename from pynumaflow/sideinput/__init__.py rename to pynumaflow-old/sideinput/__init__.py diff --git a/pynumaflow/sideinput/_dtypes.py b/pynumaflow-old/sideinput/_dtypes.py similarity index 100% rename from pynumaflow/sideinput/_dtypes.py rename to pynumaflow-old/sideinput/_dtypes.py diff --git a/pynumaflow/sinker/proto/__init__.py b/pynumaflow-old/sideinput/proto/__init__.py similarity index 100% rename from pynumaflow/sinker/proto/__init__.py rename to pynumaflow-old/sideinput/proto/__init__.py diff --git a/pynumaflow/sideinput/proto/sideinput.proto b/pynumaflow-old/sideinput/proto/sideinput.proto similarity index 100% rename from pynumaflow/sideinput/proto/sideinput.proto rename to pynumaflow-old/sideinput/proto/sideinput.proto diff --git a/pynumaflow/sideinput/proto/sideinput_pb2.py b/pynumaflow-old/sideinput/proto/sideinput_pb2.py similarity index 100% rename from pynumaflow/sideinput/proto/sideinput_pb2.py rename to pynumaflow-old/sideinput/proto/sideinput_pb2.py diff --git a/pynumaflow/sideinput/proto/sideinput_pb2_grpc.py b/pynumaflow-old/sideinput/proto/sideinput_pb2_grpc.py similarity index 100% rename from pynumaflow/sideinput/proto/sideinput_pb2_grpc.py rename to pynumaflow-old/sideinput/proto/sideinput_pb2_grpc.py diff --git a/pynumaflow/sideinput/server.py b/pynumaflow-old/sideinput/server.py similarity index 100% rename from pynumaflow/sideinput/server.py rename to pynumaflow-old/sideinput/server.py diff --git a/pynumaflow/sinker/__init__.py b/pynumaflow-old/sinker/__init__.py similarity index 100% rename from pynumaflow/sinker/__init__.py rename to pynumaflow-old/sinker/__init__.py diff --git a/pynumaflow/sinker/_dtypes.py b/pynumaflow-old/sinker/_dtypes.py similarity index 100% rename from pynumaflow/sinker/_dtypes.py rename to pynumaflow-old/sinker/_dtypes.py diff --git a/pynumaflow/sinker/async_sink.py b/pynumaflow-old/sinker/async_sink.py similarity index 100% rename from pynumaflow/sinker/async_sink.py rename to pynumaflow-old/sinker/async_sink.py diff --git a/pynumaflow/sourcer/proto/__init__.py b/pynumaflow-old/sinker/proto/__init__.py similarity index 100% rename from pynumaflow/sourcer/proto/__init__.py rename to pynumaflow-old/sinker/proto/__init__.py diff --git a/pynumaflow/sinker/proto/sink.proto b/pynumaflow-old/sinker/proto/sink.proto similarity index 100% rename from pynumaflow/sinker/proto/sink.proto rename to pynumaflow-old/sinker/proto/sink.proto diff --git a/pynumaflow/sinker/proto/sink_pb2.py b/pynumaflow-old/sinker/proto/sink_pb2.py similarity index 100% rename from pynumaflow/sinker/proto/sink_pb2.py rename to pynumaflow-old/sinker/proto/sink_pb2.py diff --git a/pynumaflow/sinker/proto/sink_pb2_grpc.py b/pynumaflow-old/sinker/proto/sink_pb2_grpc.py similarity index 100% rename from pynumaflow/sinker/proto/sink_pb2_grpc.py rename to pynumaflow-old/sinker/proto/sink_pb2_grpc.py diff --git a/pynumaflow/sinker/server.py b/pynumaflow-old/sinker/server.py similarity index 100% rename from pynumaflow/sinker/server.py rename to pynumaflow-old/sinker/server.py diff --git a/pynumaflow/sourcer/__init__.py 
b/pynumaflow-old/sourcer/__init__.py similarity index 100% rename from pynumaflow/sourcer/__init__.py rename to pynumaflow-old/sourcer/__init__.py diff --git a/pynumaflow/sourcer/_dtypes.py b/pynumaflow-old/sourcer/_dtypes.py similarity index 100% rename from pynumaflow/sourcer/_dtypes.py rename to pynumaflow-old/sourcer/_dtypes.py diff --git a/pynumaflow/sourcer/async_server.py b/pynumaflow-old/sourcer/async_server.py similarity index 100% rename from pynumaflow/sourcer/async_server.py rename to pynumaflow-old/sourcer/async_server.py diff --git a/pynumaflow/sourcetransformer/proto/__init__.py b/pynumaflow-old/sourcer/proto/__init__.py similarity index 100% rename from pynumaflow/sourcetransformer/proto/__init__.py rename to pynumaflow-old/sourcer/proto/__init__.py diff --git a/pynumaflow/sourcer/proto/source.proto b/pynumaflow-old/sourcer/proto/source.proto similarity index 100% rename from pynumaflow/sourcer/proto/source.proto rename to pynumaflow-old/sourcer/proto/source.proto diff --git a/pynumaflow/sourcer/proto/source_pb2.py b/pynumaflow-old/sourcer/proto/source_pb2.py similarity index 100% rename from pynumaflow/sourcer/proto/source_pb2.py rename to pynumaflow-old/sourcer/proto/source_pb2.py diff --git a/pynumaflow/sourcer/proto/source_pb2_grpc.py b/pynumaflow-old/sourcer/proto/source_pb2_grpc.py similarity index 100% rename from pynumaflow/sourcer/proto/source_pb2_grpc.py rename to pynumaflow-old/sourcer/proto/source_pb2_grpc.py diff --git a/pynumaflow/sourcer/server.py b/pynumaflow-old/sourcer/server.py similarity index 100% rename from pynumaflow/sourcer/server.py rename to pynumaflow-old/sourcer/server.py diff --git a/pynumaflow/sourcetransformer/__init__.py b/pynumaflow-old/sourcetransformer/__init__.py similarity index 100% rename from pynumaflow/sourcetransformer/__init__.py rename to pynumaflow-old/sourcetransformer/__init__.py diff --git a/pynumaflow/sourcetransformer/_dtypes.py b/pynumaflow-old/sourcetransformer/_dtypes.py similarity index 100% rename from pynumaflow/sourcetransformer/_dtypes.py rename to pynumaflow-old/sourcetransformer/_dtypes.py diff --git a/pynumaflow/sourcetransformer/multiproc_server.py b/pynumaflow-old/sourcetransformer/multiproc_server.py similarity index 100% rename from pynumaflow/sourcetransformer/multiproc_server.py rename to pynumaflow-old/sourcetransformer/multiproc_server.py diff --git a/refactor/mapper/proto/__init__.py b/pynumaflow-old/sourcetransformer/proto/__init__.py similarity index 100% rename from refactor/mapper/proto/__init__.py rename to pynumaflow-old/sourcetransformer/proto/__init__.py diff --git a/pynumaflow/sourcetransformer/proto/transform.proto b/pynumaflow-old/sourcetransformer/proto/transform.proto similarity index 100% rename from pynumaflow/sourcetransformer/proto/transform.proto rename to pynumaflow-old/sourcetransformer/proto/transform.proto diff --git a/pynumaflow/sourcetransformer/proto/transform_pb2.py b/pynumaflow-old/sourcetransformer/proto/transform_pb2.py similarity index 100% rename from pynumaflow/sourcetransformer/proto/transform_pb2.py rename to pynumaflow-old/sourcetransformer/proto/transform_pb2.py diff --git a/pynumaflow/sourcetransformer/proto/transform_pb2_grpc.py b/pynumaflow-old/sourcetransformer/proto/transform_pb2_grpc.py similarity index 100% rename from pynumaflow/sourcetransformer/proto/transform_pb2_grpc.py rename to pynumaflow-old/sourcetransformer/proto/transform_pb2_grpc.py diff --git a/pynumaflow/sourcetransformer/server.py b/pynumaflow-old/sourcetransformer/server.py similarity index 100% 
diff --git a/refactor/types.py b/pynumaflow-old/types.py
similarity index 100%
rename from refactor/types.py
rename to pynumaflow-old/types.py
diff --git a/refactor/_base.py b/pynumaflow/_base.py
similarity index 100%
rename from refactor/_base.py
rename to pynumaflow/_base.py
diff --git a/pynumaflow/_constants.py b/pynumaflow/_constants.py
index 253d0401..5af5023c 100644
--- a/pynumaflow/_constants.py
+++ b/pynumaflow/_constants.py
@@ -1,3 +1,10 @@
+import logging
+import multiprocessing
+import os
+from enum import Enum
+
+from pynumaflow import setup_logging
+
 MAP_SOCK_PATH = "/var/run/numaflow/map.sock"
 MAP_STREAM_SOCK_PATH = "/var/run/numaflow/mapstream.sock"
 REDUCE_SOCK_PATH = "/var/run/numaflow/reduce.sock"
@@ -17,3 +24,18 @@
 STREAM_EOF = "EOF"
 DELIMITER = ":"
 DROP = "U+005C__DROP__"
+
+_PROCESS_COUNT = multiprocessing.cpu_count()
+MAX_THREADS = int(os.getenv("MAX_THREADS", 0)) or (_PROCESS_COUNT * 4)
+
+_LOGGER = setup_logging(__name__)
+if os.getenv("PYTHONDEBUG"):
+    _LOGGER.setLevel(logging.DEBUG)
+
+
+class ServerType(str, Enum):
+    """
+    Enumerate grpc server connection protocol.
+    """
+    Sync = "sync"
+    Async = "async"
diff --git a/pynumaflow/mapper/__init__.py b/pynumaflow/mapper/__init__.py
index 374b123f..74363153 100644
--- a/pynumaflow/mapper/__init__.py
+++ b/pynumaflow/mapper/__init__.py
@@ -5,8 +5,9 @@
     DROP,
 )
 from pynumaflow.mapper.async_server import AsyncMapper
-from pynumaflow.mapper.multiproc_server import MultiProcMapper
+# from pynumaflow.mapper.multiproc_server import MultiProcMapper
 from pynumaflow.mapper.server import Mapper
+from pynumaflow._constants import ServerType
 
 __all__ = [
     "Message",
@@ -15,5 +16,6 @@
     "DROP",
     "Mapper",
     "AsyncMapper",
-    "MultiProcMapper",
+    "ServerType",
+    # "MultiProcMapper",
 ]
diff --git a/pynumaflow/mapper/_dtypes.py b/pynumaflow/mapper/_dtypes.py
index 92556a9d..a39d68f2 100644
--- a/pynumaflow/mapper/_dtypes.py
+++ b/pynumaflow/mapper/_dtypes.py
@@ -1,7 +1,7 @@
 from collections.abc import Iterator, Sequence, Awaitable
 from dataclasses import dataclass
 from datetime import datetime
-from typing import TypeVar, Callable
+from typing import TypeVar, Callable, Union
 from warnings import warn
 
 from pynumaflow._constants import DROP
@@ -127,11 +127,11 @@ class Datum:
     _watermark: datetime
 
     def __init__(
-        self,
-        keys: list[str],
-        value: bytes,
-        event_time: datetime,
-        watermark: datetime,
+            self,
+            keys: list[str],
+            value: bytes,
+            event_time: datetime,
+            watermark: datetime,
     ):
         self._keys = keys or list()
         self._value = value or b""
@@ -162,5 +162,31 @@ def watermark(self) -> datetime:
         return self._watermark
 
 
-MapCallable = Callable[[list[str], Datum], Messages]
+MapSyncCallable = Callable[[list[str], Datum], Messages]
 MapAsyncCallable = Callable[[list[str], Datum], Awaitable[Messages]]
+
+
+class MapperClass:
+    """
+    Provides an interface to write a Mapper
+    which will be exposed over a Synchronous gRPC server.
+
+    Args:
+
+    """
+
+    def __call__(self, *args, **kwargs):
+        """
+            Allow the handler function to be called directly if a class instance is sent
+        """
+        return self.handler(*args, **kwargs)
+
+    def handler(self, keys: list[str], datum: Datum) -> Messages:
+        """
+        Write a handler function which implements the MapCallable interface.
+        """
+        raise NotImplementedError
+
+
+MapCallable = Union[MapperClass, MapSyncCallable, MapAsyncCallable]
+
diff --git a/pynumaflow/mapper/async_server.py b/pynumaflow/mapper/async_server.py
index 2c479848..6a15d7e7 100644
--- a/pynumaflow/mapper/async_server.py
+++ b/pynumaflow/mapper/async_server.py
@@ -127,35 +127,3 @@ async def IsReady(
         The pascal case function name comes from the proto map_pb2_grpc.py file.
         """
         return map_pb2.ReadyResponse(ready=True)
-
-    async def __serve_async(self, server) -> None:
-        map_pb2_grpc.add_MapServicer_to_server(
-            AsyncMapper(handler=self.__map_handler),
-            server,
-        )
-        server.add_insecure_port(self.sock_path)
-        _LOGGER.info("gRPC Async Map Server listening on: %s", self.sock_path)
-        await server.start()
-        serv_info = ServerInfo(
-            protocol=Protocol.UDS,
-            language=Language.PYTHON,
-            version=get_sdk_version(),
-        )
-        info_server_write(server_info=serv_info, info_file=SERVER_INFO_FILE_PATH)
-
-        async def server_graceful_shutdown():
-            """
-            Shuts down the server with 5 seconds of grace period. During the
-            grace period, the server won't accept new connections and allow
-            existing RPCs to continue within the grace period.
-            """
-            _LOGGER.info("Starting graceful shutdown...")
-            await server.stop(5)
-
-        self.cleanup_coroutines.append(server_graceful_shutdown())
-        await server.wait_for_termination()
-
-    async def start(self) -> None:
-        """Starts the Async gRPC mapper on the given UNIX socket."""
-        server = grpc.aio.server(options=self._server_options)
-        await self.__serve_async(server)
diff --git a/refactor/mapper/example.py b/pynumaflow/mapper/example.py
similarity index 84%
rename from refactor/mapper/example.py
rename to pynumaflow/mapper/example.py
index e736c9d8..5cb41588 100644
--- a/refactor/mapper/example.py
+++ b/pynumaflow/mapper/example.py
@@ -1,10 +1,10 @@
 """
 Write a class which implements the MapperClass interface.
 """
-from refactor.mapper import ServerType
-from refactor.mapper import Datum, Messages, Message
-from refactor.mapper.map import MapServer
-from refactor.mapper._dtypes import MapperClass
+from pynumaflow.mapper import ServerType
+from pynumaflow.mapper import Datum, Messages, Message
+from pynumaflow.mapper.map import MapServer
+from pynumaflow.mapper._dtypes import MapperClass
 
 
 class ExampleMapperClass(MapperClass):
diff --git a/refactor/mapper/map.py b/pynumaflow/mapper/map.py
similarity index 90%
rename from refactor/mapper/map.py
rename to pynumaflow/mapper/map.py
index dae339d3..907e30f2 100644
--- a/refactor/mapper/map.py
+++ b/pynumaflow/mapper/map.py
@@ -1,11 +1,11 @@
 import aiorun
 
 from pynumaflow.info.types import Protocol
-from refactor._constants import MAX_THREADS, MAX_MESSAGE_SIZE, _LOGGER, MAP_SOCK_PATH, ServerType
-from refactor.mapper import Mapper, AsyncMapper
-from refactor.mapper._dtypes import MapCallable
-from refactor.mapper.proto import map_pb2_grpc
-from refactor.shared.server import prepare_server, write_info_file, NumaflowServer
+from pynumaflow._constants import MAX_THREADS, MAX_MESSAGE_SIZE, _LOGGER, MAP_SOCK_PATH, ServerType
+from pynumaflow.mapper import Mapper, AsyncMapper
+from pynumaflow.mapper._dtypes import MapCallable
+from pynumaflow.mapper.proto import map_pb2_grpc
+from pynumaflow.shared.server import prepare_server, write_info_file, NumaflowServer
 
 
 class MapServer(NumaflowServer):
diff --git a/pynumaflow/mapper/server.py b/pynumaflow/mapper/server.py
index 0ae779ee..2cb9af58 100644
--- a/pynumaflow/mapper/server.py
+++ b/pynumaflow/mapper/server.py
@@ -18,16 +18,13 @@
 from pynumaflow.mapper.proto import map_pb2
 from pynumaflow.mapper.proto import map_pb2_grpc
 from pynumaflow.types import NumaflowServicerContext
+from pynumaflow._constants import MAX_THREADS
 
 _LOGGER = setup_logging(__name__)
 if os.getenv("PYTHONDEBUG"):
     _LOGGER.setLevel(logging.DEBUG)
 
-_PROCESS_COUNT = multiprocessing.cpu_count()
-MAX_THREADS = int(os.getenv("MAX_THREADS", 0)) or (_PROCESS_COUNT * 4)
-
-
 class Mapper(map_pb2_grpc.MapServicer):
     """
     Provides an interface to write a Mapper
@@ -56,11 +53,11 @@ class Mapper(map_pb2_grpc.MapServicer):
     """
 
     def __init__(
-        self,
-        handler: MapCallable,
-        sock_path=MAP_SOCK_PATH,
-        max_message_size=MAX_MESSAGE_SIZE,
-        max_threads=MAX_THREADS,
+            self,
+            handler: MapCallable,
+            sock_path=MAP_SOCK_PATH,
+            max_message_size=MAX_MESSAGE_SIZE,
+            max_threads=MAX_THREADS,
     ):
         self.__map_handler: MapCallable = handler
         self.sock_path = f"unix://{sock_path}"
@@ -74,7 +71,7 @@ def __init__(
         ]
 
     def MapFn(
-        self, request: map_pb2.MapRequest, context: NumaflowServicerContext
+            self, request: map_pb2.MapRequest, context: NumaflowServicerContext
     ) -> map_pb2.MapResponse:
         """
         Applies a function to each datum element.
@@ -106,31 +103,10 @@ def MapFn(
         return map_pb2.MapResponse(results=datums)
 
     def IsReady(
-        self, request: _empty_pb2.Empty, context: NumaflowServicerContext
+            self, request: _empty_pb2.Empty, context: NumaflowServicerContext
     ) -> map_pb2.ReadyResponse:
         """
         IsReady is the heartbeat endpoint for gRPC.
         The pascal case function name comes from the proto map_pb2_grpc.py file.
         """
         return map_pb2.ReadyResponse(ready=True)
-
-    def start(self) -> None:
-        """
-        Starts the gRPC server on the given UNIX socket with given max threads.
-        """
-        server = grpc.server(
-            ThreadPoolExecutor(max_workers=self._max_threads), options=self._server_options
-        )
-        map_pb2_grpc.add_MapServicer_to_server(self, server)
-        server.add_insecure_port(self.sock_path)
-        server.start()
-        serv_info = ServerInfo(
-            protocol=Protocol.UDS,
-            language=Language.PYTHON,
-            version=get_sdk_version(),
-        )
-        info_server_write(server_info=serv_info, info_file=SERVER_INFO_FILE_PATH)
-        _LOGGER.info(
-            "GRPC Server listening on: %s with max threads: %s", self.sock_path, self._max_threads
-        )
-        server.wait_for_termination()
diff --git a/refactor/shared/server.py b/pynumaflow/shared/server.py
similarity index 96%
rename from refactor/shared/server.py
rename to pynumaflow/shared/server.py
index 433bd498..79a4c65e 100644
--- a/refactor/shared/server.py
+++ b/pynumaflow/shared/server.py
@@ -1,7 +1,7 @@
 from abc import abstractmethod
 from concurrent.futures import ThreadPoolExecutor
 import grpc
-from refactor._constants import MAX_MESSAGE_SIZE, MAX_THREADS, ServerType, _LOGGER
+from pynumaflow._constants import MAX_MESSAGE_SIZE, MAX_THREADS, ServerType, _LOGGER
 from pynumaflow.info.server import get_sdk_version, write as info_server_write
 from pynumaflow.info.types import ServerInfo, Protocol, Language, SERVER_INFO_FILE_PATH
diff --git a/refactor/mapper/__init__.py b/refactor/mapper/__init__.py
deleted file mode 100644
index afea1a65..00000000
--- a/refactor/mapper/__init__.py
+++ /dev/null
@@ -1,21 +0,0 @@
-from refactor.mapper._dtypes import (
-    Message,
-    Messages,
-    Datum,
-    DROP,
-)
-from refactor.mapper.async_server import AsyncMapper
-# from pynumaflow.mapper.multiproc_server import MultiProcMapper
-from refactor.mapper.server import Mapper
-from refactor._constants import ServerType
-
-__all__ = [
-    "Message",
-    "Messages",
-    "Datum",
-    "DROP",
-    "Mapper",
-    "AsyncMapper",
-    "ServerType",
-    # "MultiProcMapper",
-]
- """ - server = grpc.server( - ThreadPoolExecutor(max_workers=self._max_threads), options=self._server_options - ) - map_pb2_grpc.add_MapServicer_to_server(self, server) - server.add_insecure_port(self.sock_path) - server.start() - serv_info = ServerInfo( - protocol=Protocol.UDS, - language=Language.PYTHON, - version=get_sdk_version(), - ) - info_server_write(server_info=serv_info, info_file=SERVER_INFO_FILE_PATH) - _LOGGER.info( - "GRPC Server listening on: %s with max threads: %s", self.sock_path, self._max_threads - ) - server.wait_for_termination() diff --git a/refactor/shared/server.py b/pynumaflow/shared/server.py similarity index 96% rename from refactor/shared/server.py rename to pynumaflow/shared/server.py index 433bd498..79a4c65e 100644 --- a/refactor/shared/server.py +++ b/pynumaflow/shared/server.py @@ -1,7 +1,7 @@ from abc import abstractmethod from concurrent.futures import ThreadPoolExecutor import grpc -from refactor._constants import MAX_MESSAGE_SIZE, MAX_THREADS, ServerType, _LOGGER +from pynumaflow._constants import MAX_MESSAGE_SIZE, MAX_THREADS, ServerType, _LOGGER from pynumaflow.info.server import get_sdk_version, write as info_server_write from pynumaflow.info.types import ServerInfo, Protocol, Language, SERVER_INFO_FILE_PATH diff --git a/refactor/mapper/__init__.py b/refactor/mapper/__init__.py deleted file mode 100644 index afea1a65..00000000 --- a/refactor/mapper/__init__.py +++ /dev/null @@ -1,21 +0,0 @@ -from refactor.mapper._dtypes import ( - Message, - Messages, - Datum, - DROP, -) -from refactor.mapper.async_server import AsyncMapper -# from pynumaflow.mapper.multiproc_server import MultiProcMapper -from refactor.mapper.server import Mapper -from refactor._constants import ServerType - -__all__ = [ - "Message", - "Messages", - "Datum", - "DROP", - "Mapper", - "AsyncMapper", - "ServerType", - # "MultiProcMapper", -] From 87420ca34d50fd1c860c7292a1eeba91380b74d1 Mon Sep 17 00:00:00 2001 From: Sidhant Kohli Date: Thu, 4 Jan 2024 12:03:13 -0800 Subject: [PATCH 03/78] move infor Signed-off-by: Sidhant Kohli --- pynumaflow/_base.py | 87 ------------------------------- pynumaflow/_constants.py | 1 + pynumaflow/info/__init__.py | 0 pynumaflow/info/server.py | 59 +++++++++++++++++++++ pynumaflow/info/types.py | 49 +++++++++++++++++ pynumaflow/mapper/__init__.py | 1 + pynumaflow/mapper/_dtypes.py | 13 +++-- pynumaflow/mapper/async_server.py | 2 - pynumaflow/mapper/example.py | 1 + pynumaflow/mapper/map.py | 41 +++++++++------ pynumaflow/mapper/server.py | 18 +++---- pynumaflow/shared/server.py | 24 ++++----- 12 files changed, 161 insertions(+), 135 deletions(-) delete mode 100644 pynumaflow/_base.py create mode 100644 pynumaflow/info/__init__.py create mode 100644 pynumaflow/info/server.py create mode 100644 pynumaflow/info/types.py diff --git a/pynumaflow/_base.py b/pynumaflow/_base.py deleted file mode 100644 index 4920d55c..00000000 --- a/pynumaflow/_base.py +++ /dev/null @@ -1,87 +0,0 @@ -from abc import ABCMeta - - -class NumaflowPythonUDF(metaclass=ABCMeta): - """ - Base class for all Numaflow Python SDK based UDFs. - - Args: - is_async: If True, the UDF is executed in an asynchronous manner. 
- pl_conf: PipelineConf object - _vtx: Vertex/UDF name - """ - - __slots__ = ("is_async", "pl_conf", "_vtx") - - def __init__( - self, - is_async: bool = False, - pl_conf: Optional[PipelineConf] = None, - _vtx: Optional[str] = "numalogic-udf", - ): - self._vtx = _vtx - self.is_async = is_async - self.pl_conf = pl_conf or PipelineConf() - - def __call__( - self, keys: list[str], datum: Datum - ) -> Union[Coroutine[None, None, Messages], Messages]: - return self.aexec(keys, datum) if self.is_async else self.exec(keys, datum) - # - # # TODO: remove, and have an update config method - # def register_conf(self, config_id: str, conf: StreamConf) -> None: - # """ - # Register config with the UDF. - # - # Args: - # config_id: Config ID - # conf: StreamConf object - # """ - # self.pl_conf.stream_confs[config_id] = conf - # - # def _get_default_stream_conf(self, config_id) -> StreamConf: - # """Get the default config.""" - # try: - # return self.pl_conf.stream_confs[_DEFAULT_CONF_ID] - # except KeyError: - # err_msg = f"Config with ID {config_id} or {_DEFAULT_CONF_ID} not found!" - # raise ConfigNotFoundError(err_msg) from None - # - # def _get_default_ml_pipeline_conf(self, config_id, pipeline_id) -> MLPipelineConf: - # """Get the default pipeline config.""" - # try: - # return self.pl_conf.stream_confs[_DEFAULT_CONF_ID].ml_pipelines[_DEFAULT_CONF_ID] - # except KeyError: - # err_msg = ( - # f"Pipeline with ID {pipeline_id} or {_DEFAULT_CONF_ID} " - # f"not found for config ID {config_id}!" - # ) - # raise ConfigNotFoundError(err_msg) from None - - def exec(self, keys: list[str], datum: Datum) -> Messages: - """ - Called when the UDF is executed in a synchronous manner. - - Args: - keys: list of keys. - datum: Datum object. - - Returns - ------- - Messages instance - """ - raise NotImplementedError("exec method not implemented") - - async def aexec(self, keys: list[str], datum: Datum) -> Messages: - """ - Called when the UDF is executed in an asynchronous manner. - - Args: - keys: list of keys. - datum: Datum object. - - Returns - ------- - Messages instance - """ - raise NotImplementedError("aexec method not implemented") \ No newline at end of file diff --git a/pynumaflow/_constants.py b/pynumaflow/_constants.py index 5af5023c..3fb97e48 100644 --- a/pynumaflow/_constants.py +++ b/pynumaflow/_constants.py @@ -37,5 +37,6 @@ class ServerType(str, Enum): """ Enumerate grpc server connection protocol. """ + Sync = "sync" Async = "async" diff --git a/pynumaflow/info/__init__.py b/pynumaflow/info/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/pynumaflow/info/server.py b/pynumaflow/info/server.py new file mode 100644 index 00000000..abbb7cad --- /dev/null +++ b/pynumaflow/info/server.py @@ -0,0 +1,59 @@ +import os +from importlib.metadata import version +from typing import Any + +from pynumaflow import setup_logging +from pynumaflow.info.types import ServerInfo, EOF +import json +import logging + +_LOGGER = setup_logging(__name__) +if os.getenv("PYTHONDEBUG"): + _LOGGER.setLevel(logging.DEBUG) + + +def get_sdk_version() -> str: + """ + Return the pynumaflow SDK version + """ + try: + return version("pynumaflow") + except Exception as e: + # Adding this to handle the case for local test/CI where pynumaflow + # will not be installed as a package + _LOGGER.error("Could not read SDK version %r", e, exc_info=True) + return "" + + +def write(server_info: ServerInfo, info_file: str): + """ + Write the ServerInfo to a file , shared with the client (numa container). 
+ + args: + serv: The ServerInfo object to be shared + info_file: the shared file path + """ + try: + data = server_info.__dict__ + with open(info_file, "w+") as f: + json.dump(data, f, ensure_ascii=False) + f.write(EOF) + except Exception as err: + _LOGGER.critical("Could not write data to Info-Server %r", err, exc_info=True) + raise err + + +def get_metadata_env(envs: list[tuple[str, str]]) -> dict[str, Any]: + """ + Extract the environment var value from the provided list, + and assign them to the given key in the metadata + + args: + envs: List of tuples (key, env_var) + """ + meta = {} + for key, val in envs: + res = os.getenv(val, None) + if res: + meta[key] = res + return meta diff --git a/pynumaflow/info/types.py b/pynumaflow/info/types.py new file mode 100644 index 00000000..0e640082 --- /dev/null +++ b/pynumaflow/info/types.py @@ -0,0 +1,49 @@ +from dataclasses import dataclass, field +from enum import Enum + +# Constants for using in the info-server +# Need to keep consistent with all SDKs and client +SERVER_INFO_FILE_PATH = "/var/run/numaflow/server-info" +EOF = "U+005C__END__" + +# Env variables to be passed in the info server metadata. +# These need to be accessed in the client using the same key. +# Format - (key, env_var) +METADATA_ENVS = [("CPU_LIMIT", "NUMAFLOW_CPU_LIMIT")] + + +class Protocol(str, Enum): + """ + Enumerate grpc server connection protocol. + """ + + UDS = "uds" + TCP = "tcp" + + +class Language(str, Enum): + """ + Enumerate Numaflow SDK language. + """ + + GO = "go" + PYTHON = "python" + JAVA = "java" + + +@dataclass +class ServerInfo: + """ + ServerInfo is used for the gRPC server to provide the information such as protocol, + sdk version, language, metadata to the client. + Args: + protocol: Protocol to use (UDS or TCP) + language: Language used by the server(Python, Golang, Java) + version: Numaflow sdk version used by the server + metadata: Any additional information to be provided (env vars) + """ + + protocol: Protocol + language: Language + version: str + metadata: dict = field(default_factory=dict) diff --git a/pynumaflow/mapper/__init__.py b/pynumaflow/mapper/__init__.py index 74363153..cff034ea 100644 --- a/pynumaflow/mapper/__init__.py +++ b/pynumaflow/mapper/__init__.py @@ -5,6 +5,7 @@ DROP, ) from pynumaflow.mapper.async_server import AsyncMapper + # from pynumaflow.mapper.multiproc_server import MultiProcMapper from pynumaflow.mapper.server import Mapper from pynumaflow._constants import ServerType diff --git a/pynumaflow/mapper/_dtypes.py b/pynumaflow/mapper/_dtypes.py index a39d68f2..ffc3b853 100644 --- a/pynumaflow/mapper/_dtypes.py +++ b/pynumaflow/mapper/_dtypes.py @@ -127,11 +127,11 @@ class Datum: _watermark: datetime def __init__( - self, - keys: list[str], - value: bytes, - event_time: datetime, - watermark: datetime, + self, + keys: list[str], + value: bytes, + event_time: datetime, + watermark: datetime, ): self._keys = keys or list() self._value = value or b"" @@ -177,7 +177,7 @@ class MapperClass: def __call__(self, *args, **kwargs): """ - Allow to call handler function directly if class instance is sent + Allow to call handler function directly if class instance is sent """ return self.handler(*args, **kwargs) @@ -189,4 +189,3 @@ def handler(self, keys: list[str], datum: Datum) -> Messages: MapCallable = Union[MapperClass, MapSyncCallable, MapAsyncCallable] - diff --git a/pynumaflow/mapper/async_server.py b/pynumaflow/mapper/async_server.py index 6a15d7e7..b016ad08 100644 --- a/pynumaflow/mapper/async_server.py +++ 
b/pynumaflow/mapper/async_server.py @@ -16,8 +16,6 @@ from pynumaflow.mapper.proto import map_pb2 from pynumaflow.mapper.proto import map_pb2_grpc from pynumaflow.types import NumaflowServicerContext -from pynumaflow.info.server import get_sdk_version, write as info_server_write -from pynumaflow.info.types import ServerInfo, Protocol, Language, SERVER_INFO_FILE_PATH _LOGGER = setup_logging(__name__) if os.getenv("PYTHONDEBUG"): diff --git a/pynumaflow/mapper/example.py b/pynumaflow/mapper/example.py index 5cb41588..c32d4ff7 100644 --- a/pynumaflow/mapper/example.py +++ b/pynumaflow/mapper/example.py @@ -11,6 +11,7 @@ class ExampleMapperClass(MapperClass): """ Provides an interface to write a Mapper """ + def handler(self, keys: [str], datum: Datum) -> Messages: """ Write a handler function which implements the MapCallable interface. diff --git a/pynumaflow/mapper/map.py b/pynumaflow/mapper/map.py index 907e30f2..e531fdf4 100644 --- a/pynumaflow/mapper/map.py +++ b/pynumaflow/mapper/map.py @@ -13,19 +13,22 @@ class MapServer(NumaflowServer): Create a new grpc Server instance. """ - def __init__(self, mapper_instance: MapCallable, - sock_path=MAP_SOCK_PATH, - max_message_size=MAX_MESSAGE_SIZE, - max_threads=MAX_THREADS, - server_type=ServerType.Sync): + def __init__( + self, + mapper_instance: MapCallable, + sock_path=MAP_SOCK_PATH, + max_message_size=MAX_MESSAGE_SIZE, + max_threads=MAX_THREADS, + server_type=ServerType.Sync, + ): """ - Create a new grpc Server instance. - A new servicer instance is created and attached to the server. - The server instance is returned. + Create a new grpc Server instance. + A new servicer instance is created and attached to the server. + The server instance is returned. - max_message_size: The max message size in bytes the server can receive and send - max_threads: The max number of threads to be spawned; - defaults to number of processors x4 + max_message_size: The max message size in bytes the server can receive and send + max_threads: The max number of threads to be spawned; + defaults to number of processors x4 """ self.sock_path = f"unix://{sock_path}" self.max_threads = max_threads @@ -51,7 +54,9 @@ def exec(self): self.server.start() write_info_file(Protocol.UDS) _LOGGER.info( - "Sync GRPC Server listening on: %s with max threads: %s", self.sock_path, self.max_threads + "Sync GRPC Server listening on: %s with max threads: %s", + self.sock_path, + self.max_threads, ) self.server.wait_for_termination() @@ -62,7 +67,9 @@ async def aexec(self): await self.server.start() write_info_file(Protocol.UDS) _LOGGER.info( - "Async GRPC Server listening on: %s with max threads: %s", self.sock_path, self.max_threads + "Async GRPC Server listening on: %s with max threads: %s", + self.sock_path, + self.max_threads, ) async def server_graceful_shutdown(): @@ -85,8 +92,10 @@ def get_server(self, server_type, mapper_instance: MapCallable): else: raise NotImplementedError - server = prepare_server(sock_path=self.sock_path, - max_threads=self.max_threads, - max_message_size=self.max_message_size) + server = prepare_server( + sock_path=self.sock_path, + max_threads=self.max_threads, + max_message_size=self.max_message_size, + ) map_pb2_grpc.add_MapServicer_to_server(map_servicer, server) return server diff --git a/pynumaflow/mapper/server.py b/pynumaflow/mapper/server.py index 2cb9af58..4b3cd522 100644 --- a/pynumaflow/mapper/server.py +++ b/pynumaflow/mapper/server.py @@ -1,12 +1,8 @@ import logging -import multiprocessing import os -from concurrent.futures import 
ThreadPoolExecutor import grpc from google.protobuf import empty_pb2 as _empty_pb2 -from pynumaflow.info.server import get_sdk_version, write as info_server_write -from pynumaflow.info.types import ServerInfo, Protocol, Language, SERVER_INFO_FILE_PATH from pynumaflow import setup_logging from pynumaflow._constants import ( @@ -53,11 +49,11 @@ class Mapper(map_pb2_grpc.MapServicer): """ def __init__( - self, - handler: MapCallable, - sock_path=MAP_SOCK_PATH, - max_message_size=MAX_MESSAGE_SIZE, - max_threads=MAX_THREADS, + self, + handler: MapCallable, + sock_path=MAP_SOCK_PATH, + max_message_size=MAX_MESSAGE_SIZE, + max_threads=MAX_THREADS, ): self.__map_handler: MapCallable = handler self.sock_path = f"unix://{sock_path}" @@ -71,7 +67,7 @@ def __init__( ] def MapFn( - self, request: map_pb2.MapRequest, context: NumaflowServicerContext + self, request: map_pb2.MapRequest, context: NumaflowServicerContext ) -> map_pb2.MapResponse: """ Applies a function to each datum element. @@ -103,7 +99,7 @@ def MapFn( return map_pb2.MapResponse(results=datums) def IsReady( - self, request: _empty_pb2.Empty, context: NumaflowServicerContext + self, request: _empty_pb2.Empty, context: NumaflowServicerContext ) -> map_pb2.ReadyResponse: """ IsReady is the heartbeat endpoint for gRPC. diff --git a/pynumaflow/shared/server.py b/pynumaflow/shared/server.py index 79a4c65e..66013def 100644 --- a/pynumaflow/shared/server.py +++ b/pynumaflow/shared/server.py @@ -26,23 +26,23 @@ def start(self): raise NotImplementedError -def prepare_server(sock_path: str, - server_type: ServerType, - max_message_size=MAX_MESSAGE_SIZE, - max_threads=MAX_THREADS, - ): +def prepare_server( + sock_path: str, + server_type: ServerType, + max_message_size=MAX_MESSAGE_SIZE, + max_threads=MAX_THREADS, +): """ - Create a new grpc Server instance. - A new servicer instance is created and attached to the server. - The server instance is returned. + Create a new grpc Server instance. + A new servicer instance is created and attached to the server. + The server instance is returned. """ _server_options = [ ("grpc.max_send_message_length", max_message_size), ("grpc.max_receive_message_length", max_message_size), ] - server = grpc.server( - ThreadPoolExecutor(max_workers=max_threads), options=_server_options) + server = grpc.server(ThreadPoolExecutor(max_workers=max_threads), options=_server_options) if server_type == ServerType.Async: server = grpc.aio.server(options=_server_options) server.add_insecure_port(sock_path) @@ -62,7 +62,6 @@ def write_info_file(protocol: Protocol) -> None: async def __serve_async(self, server) -> None: - async def server_graceful_shutdown(): """ Shuts down the server with 5 seconds of grace period. 
From 136da24e4c9334ef095eb1a6ab783cd540433de3 Mon Sep 17 00:00:00 2001
From: Sidhant Kohli
Date: Thu, 4 Jan 2024 12:14:39 -0800
Subject: [PATCH 04/78] move info

Signed-off-by: Sidhant Kohli
---
 pynumaflow/mapper/map.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/pynumaflow/mapper/map.py b/pynumaflow/mapper/map.py
index e531fdf4..7b710b3d 100644
--- a/pynumaflow/mapper/map.py
+++ b/pynumaflow/mapper/map.py
@@ -96,6 +96,7 @@ def get_server(self, server_type, mapper_instance: MapCallable):
             sock_path=self.sock_path,
             max_threads=self.max_threads,
             max_message_size=self.max_message_size,
+            server_type=server_type,
         )
         map_pb2_grpc.add_MapServicer_to_server(map_servicer, server)
         return server

From 58b6496da12f3c5a89bfc2298a25414c00cac928 Mon Sep 17 00:00:00 2001
From: Sidhant Kohli
Date: Thu, 4 Jan 2024 14:45:47 -0800
Subject: [PATCH 05/78] await change

Signed-off-by: Sidhant Kohli
---
 pynumaflow/mapper/async_server.py | 4 ++--
 pynumaflow/mapper/map.py          | 3 ++-
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/pynumaflow/mapper/async_server.py b/pynumaflow/mapper/async_server.py
index b016ad08..0ab1abed 100644
--- a/pynumaflow/mapper/async_server.py
+++ b/pynumaflow/mapper/async_server.py
@@ -12,7 +12,7 @@
     MAP_SOCK_PATH,
 )
 from pynumaflow.mapper import Datum
-from pynumaflow.mapper._dtypes import MapAsyncCallable
+from pynumaflow.mapper._dtypes import MapAsyncCallable, MapCallable
 from pynumaflow.mapper.proto import map_pb2
 from pynumaflow.mapper.proto import map_pb2_grpc
 from pynumaflow.types import NumaflowServicerContext
@@ -61,7 +61,7 @@ def __init__(
         max_message_size=MAX_MESSAGE_SIZE,
         max_threads=MAX_THREADS,
     ):
-        self.__map_handler: MapAsyncCallable = handler
+        self.__map_handler: MapCallable = handler
         self.sock_path = f"unix://{sock_path}"
         self._max_message_size = max_message_size
         self._max_threads = max_threads
diff --git a/pynumaflow/mapper/map.py b/pynumaflow/mapper/map.py
index 7b710b3d..019ec247 100644
--- a/pynumaflow/mapper/map.py
+++ b/pynumaflow/mapper/map.py
@@ -64,7 +64,8 @@ async def aexec(self):
         """
         Starts the gRPC server on the given UNIX socket with given max threads.s
         """
-        await self.server.start()
+        aiorun.run(self.server.start())
+        # await self.server.start()
         write_info_file(Protocol.UDS)
         _LOGGER.info(
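A note on the await/aiorun back-and-forth in the next two patches: start() is a plain def, so the `await self.aexec()` introduced below cannot work; `await` is only legal inside an `async def`, and CPython rejects it at compile time. A minimal standalone illustration, assuming nothing beyond the standard library:

    import asyncio

    async def aexec() -> None:
        await asyncio.sleep(0)  # stand-in for server startup

    def start() -> None:
        # Inside a sync function the coroutine must be driven by an
        # event loop; writing `await aexec()` here would be a SyntaxError.
        asyncio.run(aexec())

    start()

aiorun.run() plays the same role as asyncio.run() in this sketch, layering signal handling and graceful-shutdown plumbing on top.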
From 83d366a867ab89e4dc49d7fee2313a744e1e3b3e Mon Sep 17 00:00:00 2001
From: Sidhant Kohli
Date: Thu, 4 Jan 2024 15:11:57 -0800
Subject: [PATCH 06/78] await change

Signed-off-by: Sidhant Kohli
---
 pynumaflow/mapper/map.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pynumaflow/mapper/map.py b/pynumaflow/mapper/map.py
index 019ec247..fe677c85 100644
--- a/pynumaflow/mapper/map.py
+++ b/pynumaflow/mapper/map.py
@@ -43,7 +43,7 @@ def start(self) -> None:
         if self.server_type == ServerType.Sync:
             self.exec()
         elif self.server_type == ServerType.Async:
-            aiorun.run(self.aexec())
+            await self.aexec()
         else:
             raise NotImplementedError

From ceee882a9a25d606fe5e99b516678bcac95a1830 Mon Sep 17 00:00:00 2001
From: Sidhant Kohli
Date: Thu, 4 Jan 2024 15:24:28 -0800
Subject: [PATCH 07/78] change to create task

Signed-off-by: Sidhant Kohli
---
 pynumaflow/mapper/map.py | 19 ++++++++++++++---
 1 file changed, 16 insertions(+), 3 deletions(-)

diff --git a/pynumaflow/mapper/map.py b/pynumaflow/mapper/map.py
index fe677c85..a5a1ea14 100644
--- a/pynumaflow/mapper/map.py
+++ b/pynumaflow/mapper/map.py
@@ -1,3 +1,5 @@
+import asyncio
+
 import aiorun
 
 from pynumaflow.info.types import Protocol
@@ -34,6 +36,7 @@ def __init__(
         self.max_threads = max_threads
         self.max_message_size = max_message_size
         self.server_type = server_type
+        self.background_tasks = set()
         self.server = self.get_server(server_type=server_type, mapper_instance=mapper_instance)
 
     def start(self) -> None:
@@ -43,7 +46,7 @@ def start(self) -> None:
         if self.server_type == ServerType.Sync:
             self.exec()
         elif self.server_type == ServerType.Async:
-            await self.aexec()
+            aiorun.run(self.aexec())
         else:
             raise NotImplementedError
@@ -64,8 +67,18 @@ async def aexec(self):
         """
         Starts the gRPC server on the given UNIX socket with given max threads.s
         """
-        aiorun.run(self.server.start())
-        # await self.server.start()
+        # aiorun.run(self.server.start())
+        response_task = asyncio.create_task(
+            self.server.start(),
+        )
+
+        # Save a reference to the result of this function, to avoid a
+        # task disappearing mid-execution.
+        self.background_tasks.add(response_task)
+        response_task.add_done_callback(lambda t: self.background_tasks.remove(t))
+
+        await response_task
 
         write_info_file(Protocol.UDS)
         _LOGGER.info(
             "Async GRPC Server listening on: %s with max threads: %s",
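The reference-keeping pattern adopted above mirrors the asyncio documentation: the event loop holds only weak references to tasks, so a task created with asyncio.create_task() and not stored anywhere can be garbage-collected before it finishes. A standalone sketch of the same pattern, independent of the server code:

    import asyncio

    background_tasks: set[asyncio.Task] = set()

    async def main() -> None:
        task = asyncio.create_task(asyncio.sleep(1))
        # Keep a strong reference until the task completes,
        # then let the done-callback discard it.
        background_tasks.add(task)
        task.add_done_callback(background_tasks.discard)
        await task

    asyncio.run(main())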
_LOGGER.info("Starting Async GRPC Server...", loop) - aiorun.run(self.aexec(), loop=loop) + loop = asyncio.new_event_loop() + global _loop + _loop = loop + aiorun.run(self.aexec(), loop=_loop) else: raise NotImplementedError @@ -74,21 +76,18 @@ async def aexec(self): Starts the gRPC server on the given UNIX socket with given max threads.s """ # aiorun.run(self.server.start()) - try: - loop = events.get_running_loop() - except RuntimeError: - loop = None - _LOGGER.info("Loopsie...", loop) - response_task = asyncio.create_task( - self.server.start(), - ) - - # Save a reference to the result of this function, to avoid a - # task disappearing mid-execution. - self.background_tasks.add(response_task) - response_task.add_done_callback(lambda t: self.background_tasks.remove(t)) - - await response_task + global _loop + asyncio.run_coroutine_threadsafe(self.server.start(), _loop) + # response_task = asyncio.create_task( + # self.server.start(), + # ) + # + # # Save a reference to the result of this function, to avoid a + # # task disappearing mid-execution. + # self.background_tasks.add(response_task) + # response_task.add_done_callback(lambda t: self.background_tasks.remove(t)) + # + # await response_task write_info_file(Protocol.UDS) _LOGGER.info( From c4f03763d525c87fea4b6374d8fa1af89a2297a9 Mon Sep 17 00:00:00 2001 From: Sidhant Kohli Date: Thu, 4 Jan 2024 16:19:44 -0800 Subject: [PATCH 10/78] cleanup Signed-off-by: Sidhant Kohli --- pynumaflow/mapper/_dtypes.py | 6 ++++-- pynumaflow/mapper/map.py | 17 ++++++++++------- 2 files changed, 14 insertions(+), 9 deletions(-) diff --git a/pynumaflow/mapper/_dtypes.py b/pynumaflow/mapper/_dtypes.py index ffc3b853..cbddc3bc 100644 --- a/pynumaflow/mapper/_dtypes.py +++ b/pynumaflow/mapper/_dtypes.py @@ -1,3 +1,4 @@ +from abc import ABCMeta, abstractmethod from collections.abc import Iterator, Sequence, Awaitable from dataclasses import dataclass from datetime import datetime @@ -166,7 +167,7 @@ def watermark(self) -> datetime: MapAsyncCallable = Callable[[list[str], Datum], Awaitable[Messages]] -class MapperClass: +class MapperClass(metaclass=ABCMeta): """ Provides an interface to write a Mapper which will be exposed over a Synchronous gRPC server. @@ -181,11 +182,12 @@ def __call__(self, *args, **kwargs): """ return self.handler(*args, **kwargs) + @abstractmethod def handler(self, keys: list[str], datum: Datum) -> Messages: """ Write a handler function which implements the MapCallable interface. """ - raise NotImplementedError + pass MapCallable = Union[MapperClass, MapSyncCallable, MapAsyncCallable] diff --git a/pynumaflow/mapper/map.py b/pynumaflow/mapper/map.py index a7143a49..c9e28380 100644 --- a/pynumaflow/mapper/map.py +++ b/pynumaflow/mapper/map.py @@ -12,18 +12,19 @@ _loop = None + class MapServer(NumaflowServer): """ Create a new grpc Server instance. """ def __init__( - self, - mapper_instance: MapCallable, - sock_path=MAP_SOCK_PATH, - max_message_size=MAX_MESSAGE_SIZE, - max_threads=MAX_THREADS, - server_type=ServerType.Sync, + self, + mapper_instance: MapCallable, + sock_path=MAP_SOCK_PATH, + max_message_size=MAX_MESSAGE_SIZE, + max_threads=MAX_THREADS, + server_type=ServerType.Sync, ): """ Create a new grpc Server instance. 
From c4f03763d525c87fea4b6374d8fa1af89a2297a9 Mon Sep 17 00:00:00 2001
From: Sidhant Kohli
Date: Thu, 4 Jan 2024 16:19:44 -0800
Subject: [PATCH 10/78] cleanup

Signed-off-by: Sidhant Kohli
---
 pynumaflow/mapper/_dtypes.py |  6 ++++--
 pynumaflow/mapper/map.py     | 17 ++++++++++-------
 2 files changed, 14 insertions(+), 9 deletions(-)

diff --git a/pynumaflow/mapper/_dtypes.py b/pynumaflow/mapper/_dtypes.py
index ffc3b853..cbddc3bc 100644
--- a/pynumaflow/mapper/_dtypes.py
+++ b/pynumaflow/mapper/_dtypes.py
@@ -1,3 +1,4 @@
+from abc import ABCMeta, abstractmethod
 from collections.abc import Iterator, Sequence, Awaitable
 from dataclasses import dataclass
 from datetime import datetime
@@ -166,7 +167,7 @@ def watermark(self) -> datetime:
 MapAsyncCallable = Callable[[list[str], Datum], Awaitable[Messages]]
 
 
-class MapperClass:
+class MapperClass(metaclass=ABCMeta):
     """
     Provides an interface to write a Mapper
     which will be exposed over a Synchronous gRPC server.
@@ -181,11 +182,12 @@ def __call__(self, *args, **kwargs):
         """
         return self.handler(*args, **kwargs)
 
+    @abstractmethod
     def handler(self, keys: list[str], datum: Datum) -> Messages:
         """
         Write a handler function which implements the MapCallable interface.
         """
-        raise NotImplementedError
+        pass
 
 
 MapCallable = Union[MapperClass, MapSyncCallable, MapAsyncCallable]
diff --git a/pynumaflow/mapper/map.py b/pynumaflow/mapper/map.py
index a7143a49..c9e28380 100644
--- a/pynumaflow/mapper/map.py
+++ b/pynumaflow/mapper/map.py
@@ -12,18 +12,19 @@
 
 _loop = None
 
+
 class MapServer(NumaflowServer):
     """
     Create a new grpc Server instance.
     """
 
     def __init__(
-            self,
-            mapper_instance: MapCallable,
-            sock_path=MAP_SOCK_PATH,
-            max_message_size=MAX_MESSAGE_SIZE,
-            max_threads=MAX_THREADS,
-            server_type=ServerType.Sync,
+        self,
+        mapper_instance: MapCallable,
+        sock_path=MAP_SOCK_PATH,
+        max_message_size=MAX_MESSAGE_SIZE,
+        max_threads=MAX_THREADS,
+        server_type=ServerType.Sync,
     ):
         """
         Create a new grpc Server instance.
@@ -39,6 +40,7 @@ def __init__(
         self.max_message_size = max_message_size
         self.server_type = server_type
         self.background_tasks = set()
+        self.cleanup_coroutines = []
         self.server = self.get_server(server_type=server_type, mapper_instance=mapper_instance)
 
     def start(self) -> None:
@@ -88,6 +90,7 @@ async def aexec(self):
         # response_task.add_done_callback(lambda t: self.background_tasks.remove(t))
         #
         # await response_task
+        # await self.server.start()
 
         write_info_file(Protocol.UDS)
         _LOGGER.info(
@@ -105,7 +108,7 @@ async def server_graceful_shutdown():
             _LOGGER.info("Starting graceful shutdown...")
             await self.server.stop(5)
 
-        self.server.cleanup_coroutines.append(server_graceful_shutdown())
+        self.cleanup_coroutines.append(server_graceful_shutdown())
         await self.server.wait_for_termination()

From b385e253af063866593987d20e1a5055be6615ef Mon Sep 17 00:00:00 2001
From: Sidhant Kohli
Date: Thu, 4 Jan 2024 16:30:43 -0800
Subject: [PATCH 11/78] cleanup

Signed-off-by: Sidhant Kohli
---
 pynumaflow/mapper/map.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pynumaflow/mapper/map.py b/pynumaflow/mapper/map.py
index c9e28380..bb635d31 100644
--- a/pynumaflow/mapper/map.py
+++ b/pynumaflow/mapper/map.py
@@ -109,7 +109,7 @@ async def server_graceful_shutdown():
             await self.server.stop(5)
 
         self.cleanup_coroutines.append(server_graceful_shutdown())
-        await self.server.wait_for_termination()
+        asyncio.run_coroutine_threadsafe(self.server.wait_for_termination(), _loop)

From a22673bb2f43ee1daa07b03840c21245f6eaddf3 Mon Sep 17 00:00:00 2001
From: Sidhant Kohli
Date: Fri, 5 Jan 2024 10:16:19 -0800
Subject: [PATCH 12/78] cleanup

Signed-off-by: Sidhant Kohli
---
 pynumaflow/mapper/map.py | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/pynumaflow/mapper/map.py b/pynumaflow/mapper/map.py
index bb635d31..2202ddf8 100644
--- a/pynumaflow/mapper/map.py
+++ b/pynumaflow/mapper/map.py
@@ -78,8 +78,8 @@ async def aexec(self):
         Starts the gRPC server on the given UNIX socket with given max threads.s
         """
         # aiorun.run(self.server.start())
-        global _loop
-        asyncio.run_coroutine_threadsafe(self.server.start(), _loop)
+        # global _loop
+        # asyncio.run_coroutine_threadsafe(self.server.start(), _loop)
         # response_task = asyncio.create_task(
         #     self.server.start(),
         # )
@@ -90,11 +90,11 @@ async def aexec(self):
         # response_task.add_done_callback(lambda t: self.background_tasks.remove(t))
         #
         # await response_task
-        # await self.server.start()
+        await self.server.start()
 
         write_info_file(Protocol.UDS)
         _LOGGER.info(
-            "Async GRPC Server listening on: %s with max threads: %s",
+            "Async Map GRPC Server listening on: %s with max threads: %s",
             self.sock_path,
             self.max_threads,
         )
@@ -109,7 +109,8 @@ async def server_graceful_shutdown():
             await self.server.stop(5)
 
         self.cleanup_coroutines.append(server_graceful_shutdown())
-        asyncio.run_coroutine_threadsafe(self.server.wait_for_termination(), _loop)
+        # asyncio.run_coroutine_threadsafe(self.server.wait_for_termination(), _loop)
+        await self.server.wait_for_termination()

From fe52e9f02de5a6d12ac23019305dd0567bd538a7 Mon Sep 17 00:00:00 2001
From: Sidhant Kohli
Date: Fri, 5 Jan 2024 10:53:32 -0800
Subject: [PATCH 13/78] server on same thread

Signed-off-by: Sidhant Kohli
---
 pynumaflow/mapper/map.py | 17 ++++++++++++-----
 1 file changed, 12 insertions(+), 5 deletions(-)

diff --git a/pynumaflow/mapper/map.py b/pynumaflow/mapper/map.py
index 2202ddf8..87a23832 100644
--- a/pynumaflow/mapper/map.py
+++ b/pynumaflow/mapper/map.py
@@ -2,6 +2,7 @@
 from asyncio import events
 
 import aiorun
+import grpc
 
 from pynumaflow.info.types import Protocol
 from pynumaflow._constants import MAX_THREADS, MAX_MESSAGE_SIZE, _LOGGER, MAP_SOCK_PATH, ServerType
@@ -38,6 +39,7 @@ def __init__(
         self.sock_path = f"unix://{sock_path}"
         self.max_threads = max_threads
         self.max_message_size = max_message_size
+        self.mapper_instance = mapper_instance
         self.server_type = server_type
         self.background_tasks = set()
         self.cleanup_coroutines = []
@@ -54,9 +56,9 @@ def start(self) -> None:
                 loop = events.get_running_loop()
             except RuntimeError:
                 loop = asyncio.new_event_loop()
-            global _loop
-            _loop = loop
-            aiorun.run(self.aexec(), loop=_loop)
+            # global _loop
+            # _loop = loop
+            aiorun.run(self.aexec())
         else:
             raise NotImplementedError
@@ -73,10 +75,15 @@ def exec(self):
         )
         self.server.wait_for_termination()
 
-    async def aexec(self):
+    async def aexec(self) -> None:
         """
         Starts the gRPC server on the given UNIX socket with given max threads.s
         """
+        server = grpc.aio.server()
+        server.add_insecure_port(self.sock_path)
+        map_servicer = AsyncMapper(handler=self.mapper_instance)
+        map_pb2_grpc.add_MapServicer_to_server(map_servicer, server)
+
         # aiorun.run(self.server.start())
         # global _loop
         # asyncio.run_coroutine_threadsafe(self.server.start(), _loop)

From e2c315d418834e17651a8acec8fcf27f8786665b Mon Sep 17 00:00:00 2001
From: Sidhant Kohli
Date: Fri, 5 Jan 2024 11:00:11 -0800
Subject: [PATCH 14/78] server on same thread

Signed-off-by: Sidhant Kohli
---
 pynumaflow/mapper/map.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/pynumaflow/mapper/map.py b/pynumaflow/mapper/map.py
index 87a23832..ec34649e 100644
--- a/pynumaflow/mapper/map.py
+++ b/pynumaflow/mapper/map.py
@@ -79,10 +79,10 @@ async def aexec(self) -> None:
         """
         Starts the gRPC server on the given UNIX socket with given max threads.s
         """
-        server = grpc.aio.server()
-        server.add_insecure_port(self.sock_path)
+        server_new = grpc.aio.server()
+        server_new.add_insecure_port(self.sock_path)
         map_servicer = AsyncMapper(handler=self.mapper_instance)
-        map_pb2_grpc.add_MapServicer_to_server(map_servicer, server)
+        map_pb2_grpc.add_MapServicer_to_server(map_servicer, server_new)
@@ -97,7 +97,7 @@ async def aexec(self) -> None:
         # response_task.add_done_callback(lambda t: self.background_tasks.remove(t))
         #
         # await response_task
-        await self.server.start()
+        await server_new.start()
 
         write_info_file(Protocol.UDS)
         _LOGGER.info(
@@ -113,11 +113,11 @@ async def server_graceful_shutdown():
             existing RPCs to continue within the grace period.
             """
             _LOGGER.info("Starting graceful shutdown...")
-            await self.server.stop(5)
+            await server_new.stop(5)
 
         self.cleanup_coroutines.append(server_graceful_shutdown())
         # asyncio.run_coroutine_threadsafe(self.server.wait_for_termination(), _loop)
-        await self.server.wait_for_termination()
+        await server_new.wait_for_termination()

From 28b5c708c3c9417c80fe3fac49beef0a322d2656 Mon Sep 17 00:00:00 2001
From: Sidhant Kohli
Date: Fri, 5 Jan 2024 11:11:30 -0800
Subject: [PATCH 15/78] server on same thread

Signed-off-by: Sidhant Kohli
---
 pynumaflow/mapper/map.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/pynumaflow/mapper/map.py b/pynumaflow/mapper/map.py
index ec34649e..16f51a23 100644
--- a/pynumaflow/mapper/map.py
+++ b/pynumaflow/mapper/map.py
@@ -39,6 +39,10 @@ def __init__(
         self.sock_path = f"unix://{sock_path}"
         self.max_threads = max_threads
         self.max_message_size = max_message_size
+        self._server_options = [
+            ("grpc.max_send_message_length", max_message_size),
+            ("grpc.max_receive_message_length", max_message_size),
+        ]
         self.mapper_instance = mapper_instance
         self.server_type = server_type
         self.background_tasks = set()
@@ -56,8 +60,7 @@ def start(self) -> None:
                 loop = events.get_running_loop()
             except RuntimeError:
                 loop = asyncio.new_event_loop()
-            # global _loop
-            # _loop = loop
+            _LOGGER.info("Starting Async Map Server with aiorun...")
             aiorun.run(self.aexec())
         else:
             raise NotImplementedError

From da9d1dbcc5c91b9dcf7c5b099a6debc722521e01 Mon Sep 17 00:00:00 2001
From: Sidhant Kohli
Date: Fri, 5 Jan 2024 11:40:23 -0800
Subject: [PATCH 16/78] Export class

Signed-off-by: Sidhant Kohli
---
 pynumaflow/mapper/__init__.py |  3 +++
 pynumaflow/mapper/map.py      | 13 -------------
 2 files changed, 3 insertions(+), 13 deletions(-)

diff --git a/pynumaflow/mapper/__init__.py b/pynumaflow/mapper/__init__.py
index cff034ea..5df50133 100644
--- a/pynumaflow/mapper/__init__.py
+++ b/pynumaflow/mapper/__init__.py
@@ -3,6 +3,8 @@
     Messages,
     Datum,
     DROP,
+    MapperClass
+
 )
 from pynumaflow.mapper.async_server import AsyncMapper
 
@@ -18,5 +20,6 @@
     "Mapper",
     "AsyncMapper",
     "ServerType",
+    "MapperClass",
     # "MultiProcMapper",
 ]
diff --git a/pynumaflow/mapper/map.py b/pynumaflow/mapper/map.py
index 16f51a23..360857af 100644
--- a/pynumaflow/mapper/map.py
+++ b/pynumaflow/mapper/map.py
@@ -87,19 +87,6 @@ async def aexec(self) -> None:
         map_servicer = AsyncMapper(handler=self.mapper_instance)
         map_pb2_grpc.add_MapServicer_to_server(map_servicer, server_new)
 
-        # aiorun.run(self.server.start())
-        # global _loop
-        # asyncio.run_coroutine_threadsafe(self.server.start(), _loop)
-        # response_task = asyncio.create_task(
-        #     self.server.start(),
-        # )
-        #
-        # # Save a reference to the result of this function, to avoid a
-        # # task disappearing mid-execution.
-        # self.background_tasks.add(response_task)
-        # response_task.add_done_callback(lambda t: self.background_tasks.remove(t))
-        #
-        # await response_task
         await server_new.start()
- # self.background_tasks.add(response_task) - # response_task.add_done_callback(lambda t: self.background_tasks.remove(t)) - # - # await response_task await server_new.start() write_info_file(Protocol.UDS) From 27967bf992b7275dce0544fdf8071752816132d1 Mon Sep 17 00:00:00 2001 From: Sidhant Kohli Date: Fri, 5 Jan 2024 15:42:56 -0800 Subject: [PATCH 17/78] Class refactor Signed-off-by: Sidhant Kohli --- pynumaflow/_constants.py | 3 +- pynumaflow/exceptions.py | 6 + pynumaflow/mapper/__init__.py | 9 +- pynumaflow/mapper/map.py | 175 ++++++++++++++-------- pynumaflow/mapper/multiproc_server.py | 98 ++++++++++++ pynumaflow/mapper/server.py | 38 +---- pynumaflow/mapper/utils.py | 36 +++++ pynumaflow/shared/server.py | 207 +++++++++++++++++++++++--- 8 files changed, 445 insertions(+), 127 deletions(-) create mode 100644 pynumaflow/exceptions.py create mode 100644 pynumaflow/mapper/multiproc_server.py create mode 100644 pynumaflow/mapper/utils.py diff --git a/pynumaflow/_constants.py b/pynumaflow/_constants.py index 3fb97e48..e1ab5463 100644 --- a/pynumaflow/_constants.py +++ b/pynumaflow/_constants.py @@ -26,7 +26,7 @@ DROP = "U+005C__DROP__" _PROCESS_COUNT = multiprocessing.cpu_count() -MAX_THREADS = int(os.getenv("MAX_THREADS", 0)) or (_PROCESS_COUNT * 4) +MAX_THREADS = int(os.getenv("MAX_THREADS", "4")) _LOGGER = setup_logging(__name__) if os.getenv("PYTHONDEBUG"): @@ -40,3 +40,4 @@ class ServerType(str, Enum): Sync = "sync" Async = "async" + Multiproc = "multiproc" diff --git a/pynumaflow/exceptions.py b/pynumaflow/exceptions.py new file mode 100644 index 00000000..30dc9a76 --- /dev/null +++ b/pynumaflow/exceptions.py @@ -0,0 +1,6 @@ +class NoPublicConstructorError(TypeError): + """Raise when using ClassName() to create objects while public constructor is not supported""" + + +class SocketError(Exception): + """To raise an error while creating socket or setting its property""" diff --git a/pynumaflow/mapper/__init__.py b/pynumaflow/mapper/__init__.py index 5df50133..3109f812 100644 --- a/pynumaflow/mapper/__init__.py +++ b/pynumaflow/mapper/__init__.py @@ -1,11 +1,4 @@ -from pynumaflow.mapper._dtypes import ( - Message, - Messages, - Datum, - DROP, - MapperClass - -) +from pynumaflow.mapper._dtypes import Message, Messages, Datum, DROP, MapperClass from pynumaflow.mapper.async_server import AsyncMapper # from pynumaflow.mapper.multiproc_server import MultiProcMapper diff --git a/pynumaflow/mapper/map.py b/pynumaflow/mapper/map.py index 360857af..5cd74542 100644 --- a/pynumaflow/mapper/map.py +++ b/pynumaflow/mapper/map.py @@ -1,17 +1,19 @@ -import asyncio -from asyncio import events +import os import aiorun import grpc -from pynumaflow.info.types import Protocol from pynumaflow._constants import MAX_THREADS, MAX_MESSAGE_SIZE, _LOGGER, MAP_SOCK_PATH, ServerType from pynumaflow.mapper import Mapper, AsyncMapper from pynumaflow.mapper._dtypes import MapCallable from pynumaflow.mapper.proto import map_pb2_grpc -from pynumaflow.shared.server import prepare_server, write_info_file, NumaflowServer - -_loop = None +from pynumaflow.shared.server import ( + prepare_server, + NumaflowServer, + start_async_server, + start_sync_server, + start_multiproc_server, +) class MapServer(NumaflowServer): @@ -20,12 +22,12 @@ class MapServer(NumaflowServer): """ def __init__( - self, - mapper_instance: MapCallable, - sock_path=MAP_SOCK_PATH, - max_message_size=MAX_MESSAGE_SIZE, - max_threads=MAX_THREADS, - server_type=ServerType.Sync, + self, + mapper_instance: MapCallable, + sock_path=MAP_SOCK_PATH, + 
max_message_size=MAX_MESSAGE_SIZE, + max_threads=MAX_THREADS, + server_type=ServerType.Sync, ): """ Create a new grpc Server instance. @@ -37,17 +39,30 @@ def __init__( defaults to number of processors x4 """ self.sock_path = f"unix://{sock_path}" - self.max_threads = max_threads + self.max_threads = min(max_threads, int(os.getenv("MAX_THREADS", "4"))) self.max_message_size = max_message_size - self._server_options = [ - ("grpc.max_send_message_length", max_message_size), - ("grpc.max_receive_message_length", max_message_size), - ] + self.mapper_instance = mapper_instance self.server_type = server_type self.background_tasks = set() self.cleanup_coroutines = [] - self.server = self.get_server(server_type=server_type, mapper_instance=mapper_instance) + + self._server_options = [ + ("grpc.max_send_message_length", self.max_message_size), + ("grpc.max_receive_message_length", self.max_message_size), + ("grpc.so_reuseport", 1), + ("grpc.so_reuseaddr", 1), + ] + # Set the number of processes to be spawned to the number of CPUs or + # the value of the env var NUM_CPU_MULTIPROC defined by the user + # Setting the max value to 2 * CPU count + # Used for multiproc server + self._process_count = min( + int(os.getenv("NUM_CPU_MULTIPROC", str(os.cpu_count()))), 2 * os.cpu_count() + ) + + # Get the server instance based on the server type and assign it to self.server + # self.server = self.get_server(server_type=server_type, mapper_instance=mapper_instance) def start(self) -> None: """ @@ -56,72 +71,112 @@ def start(self) -> None: if self.server_type == ServerType.Sync: self.exec() elif self.server_type == ServerType.Async: - try: - loop = events.get_running_loop() - except RuntimeError: - loop = asyncio.new_event_loop() _LOGGER.info("Starting Async Map Server with aiorun...") aiorun.run(self.aexec()) + elif self.server_type == ServerType.Multiproc: + self.exec_multiproc() + raise NotImplementedError + else: raise NotImplementedError def exec(self): """ - Starts the gRPC server on the given UNIX socket with given max threads.s + Starts the Synchronous gRPC server on the given UNIX socket with given max threads. """ - self.server.start() - write_info_file(Protocol.UDS) + server = prepare_server(self.sock_path, self.max_threads, self._server_options) + map_servicer = self.get_servicer( + mapper_instance=self.mapper_instance, server_type=self.server_type + ) + map_pb2_grpc.add_MapServicer_to_server(map_servicer, server) + # server.start() + # write_info_file(Protocol.UDS) + # _LOGGER.info( + # "Sync GRPC Server listening on: %s with max threads: %s", + # self.sock_path, + # self.max_threads, + # ) + # server.wait_for_termination() + # Log the server start _LOGGER.info( "Sync GRPC Server listening on: %s with max threads: %s", self.sock_path, self.max_threads, ) - self.server.wait_for_termination() + start_sync_server(server=server) - async def aexec(self) -> None: + def exec_multiproc(self): """ - Starts the gRPC server on the given UNIX socket with given max threads.s + Starts the gRPC server on the given UNIX socket with given max threads. 
""" - server_new = grpc.aio.server() - server_new.add_insecure_port(self.sock_path) - map_servicer = AsyncMapper(handler=self.mapper_instance) - map_pb2_grpc.add_MapServicer_to_server(map_servicer, server_new) - - await server_new.start() + servers, server_ports = prepare_server( + server_type=self.server_type, + max_threads=self.max_threads, + server_options=self._server_options, + process_count=self._process_count, + sock_path=self.sock_path, + ) - write_info_file(Protocol.UDS) - _LOGGER.info( - "Async Map New GRPC Server listening on: %s with max threads: %s", - self.sock_path, - self.max_threads, + map_servicer = self.get_servicer( + mapper_instance=self.mapper_instance, server_type=self.server_type ) + for server in servers: + map_pb2_grpc.add_MapServicer_to_server(map_servicer, server) - async def server_graceful_shutdown(): - """ - Shuts down the server with 5 seconds of grace period. During the - grace period, the server won't accept new connections and allow - existing RPCs to continue within the grace period. - """ - _LOGGER.info("Starting graceful shutdown...") - await server_new.stop(5) + start_multiproc_server( + servers=servers, server_ports=server_ports, max_threads=self.max_threads + ) + # server.start() + # write_info_file(Protocol.UDS) + # _LOGGER.info( + # "Sync GRPC Server listening on: %s with max threads: %s", + # self.sock_path, + # self.max_threads, + # ) + # server.wait_for_termination() - self.cleanup_coroutines.append(server_graceful_shutdown()) - # asyncio.run_coroutine_threadsafe(self.server.wait_for_termination(), _loop) - await server_new.wait_for_termination() + async def aexec(self) -> None: + """ + Starts the Async gRPC server on the given UNIX socket with given max threads.s + """ + server_new = grpc.aio.server() + server_new.add_insecure_port(self.sock_path) + map_servicer = self.get_servicer( + mapper_instance=self.mapper_instance, server_type=self.server_type + ) + map_pb2_grpc.add_MapServicer_to_server(map_servicer, server_new) - def get_server(self, server_type, mapper_instance: MapCallable): + await start_async_server(server_new, self.sock_path, self.max_threads, self._server_options) + + # await server_new.start() + # + # write_info_file(Protocol.UDS) + # _LOGGER.info( + # "Async Map New GRPC Server listening on: %s with max threads: %s", + # self.sock_path, + # self.max_threads, + # ) + # + # async def server_graceful_shutdown(): + # """ + # Shuts down the server with 5 seconds of grace period. During the + # grace period, the server won't accept new connections and allow + # existing RPCs to continue within the grace period. 
+ # """ + # _LOGGER.info("Starting graceful shutdown...") + # await server_new.stop(5) + # + # self.cleanup_coroutines.append(server_graceful_shutdown()) + # # asyncio.run_coroutine_threadsafe(self.server.wait_for_termination(), _loop) + # await server_new.wait_for_termination() + + def get_servicer(self, mapper_instance: MapCallable, server_type: ServerType): if server_type == ServerType.Sync: map_servicer = Mapper(handler=mapper_instance) elif server_type == ServerType.Async: map_servicer = AsyncMapper(handler=mapper_instance) + elif server_type == ServerType.Multiproc: + map_servicer = Mapper(handler=mapper_instance) else: raise NotImplementedError - - server = prepare_server( - sock_path=self.sock_path, - max_threads=self.max_threads, - max_message_size=self.max_message_size, - server_type=server_type, - ) - map_pb2_grpc.add_MapServicer_to_server(map_servicer, server) - return server + return map_servicer diff --git a/pynumaflow/mapper/multiproc_server.py b/pynumaflow/mapper/multiproc_server.py new file mode 100644 index 00000000..72f69891 --- /dev/null +++ b/pynumaflow/mapper/multiproc_server.py @@ -0,0 +1,98 @@ +import logging +import os + +from google.protobuf import empty_pb2 as _empty_pb2 + +from pynumaflow import setup_logging +from pynumaflow._constants import ( + MAX_MESSAGE_SIZE, + MAP_SOCK_PATH, + MAX_THREADS, +) +from pynumaflow.mapper._dtypes import MapCallable +from pynumaflow.mapper.proto import map_pb2 +from pynumaflow.mapper.proto import map_pb2_grpc +from pynumaflow.mapper.utils import _map_fn_util +from pynumaflow.types import NumaflowServicerContext + +_LOGGER = setup_logging(__name__) +if os.getenv("PYTHONDEBUG"): + _LOGGER.setLevel(logging.DEBUG) + + +class MultiProcMapper(map_pb2_grpc.MapServicer): + """ + Provides an interface to write a Multi Proc Mapper + which will be exposed over gRPC. + + Args: + handler: Function callable following the type signature of MapCallable + max_message_size: The max message size in bytes the server can receive and send + + Example invocation: + >>> from typing import Iterator + >>> from pynumaflow.mapper import Messages, Message \ + ... Datum, MultiProcMapper + ... + >>> def map_handler(keys: list[str], datum: Datum) -> Messages: + ... val = datum.value + ... _ = datum.event_time + ... _ = datum.watermark + ... messages = Messages(Message(val, keys=keys)) + ... return messages + ... 
+ >>> grpc_server = MultiProcMapper(handler=map_handler) + >>> grpc_server.start() + """ + + __slots__ = ( + "__map_handler", + "_max_message_size", + "_server_options", + "_process_count", + "_threads_per_proc", + ) + + def __init__( + self, + handler: MapCallable, + sock_path=MAP_SOCK_PATH, + max_message_size=MAX_MESSAGE_SIZE, + max_threads=MAX_THREADS, + ): + self.__map_handler: MapCallable = handler + self.sock_path = f"unix://{sock_path}" + self._max_message_size = max_message_size + self._max_threads = max_threads + + self._server_options = [ + ("grpc.max_send_message_length", self._max_message_size), + ("grpc.max_receive_message_length", self._max_message_size), + ("grpc.so_reuseport", 1), + ("grpc.so_reuseaddr", 1), + ] + # Set the number of processes to be spawned to the number of CPUs or + # the value of the env var NUM_CPU_MULTIPROC defined by the user + # Setting the max value to 2 * CPU count + self._process_count = min( + int(os.getenv("NUM_CPU_MULTIPROC", str(os.cpu_count()))), 2 * os.cpu_count() + ) + self._threads_per_proc = int(os.getenv("MAX_THREADS", "4")) + + def MapFn( + self, request: map_pb2.MapRequest, context: NumaflowServicerContext + ) -> map_pb2.MapResponse: + """ + Applies a function to each datum element. + The pascal case function name comes from the proto map_pb2_grpc.py file. + """ + return _map_fn_util(self.__map_handler, request, context) + + def IsReady( + self, request: _empty_pb2.Empty, context: NumaflowServicerContext + ) -> map_pb2.ReadyResponse: + """ + IsReady is the heartbeat endpoint for gRPC. + The pascal case function name comes from the proto map_pb2_grpc.py file. + """ + return map_pb2.ReadyResponse(ready=True) diff --git a/pynumaflow/mapper/server.py b/pynumaflow/mapper/server.py index 4b3cd522..5e2b7d45 100644 --- a/pynumaflow/mapper/server.py +++ b/pynumaflow/mapper/server.py @@ -1,7 +1,6 @@ import logging import os -import grpc from google.protobuf import empty_pb2 as _empty_pb2 from pynumaflow import setup_logging @@ -9,10 +8,10 @@ MAX_MESSAGE_SIZE, MAP_SOCK_PATH, ) -from pynumaflow.mapper import Datum from pynumaflow.mapper._dtypes import MapCallable from pynumaflow.mapper.proto import map_pb2 from pynumaflow.mapper.proto import map_pb2_grpc +from pynumaflow.mapper.utils import _map_fn_util from pynumaflow.types import NumaflowServicerContext from pynumaflow._constants import MAX_THREADS @@ -52,19 +51,9 @@ def __init__( self, handler: MapCallable, sock_path=MAP_SOCK_PATH, - max_message_size=MAX_MESSAGE_SIZE, - max_threads=MAX_THREADS, ): self.__map_handler: MapCallable = handler self.sock_path = f"unix://{sock_path}" - self._max_message_size = max_message_size - self._max_threads = max_threads - self.cleanup_coroutines = [] - - self._server_options = [ - ("grpc.max_send_message_length", self._max_message_size), - ("grpc.max_receive_message_length", self._max_message_size), - ] def MapFn( self, request: map_pb2.MapRequest, context: NumaflowServicerContext @@ -73,30 +62,7 @@ def MapFn( Applies a function to each datum element. The pascal case function name comes from the proto map_pb2_grpc.py file. 
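# The process-count rule MultiProcMapper applies, shown standalone: the
# NUM_CPU_MULTIPROC env var overrides os.cpu_count(), and the result is capped
# at twice the CPU count (multiproc_process_count is an illustrative name).
import os

def multiproc_process_count() -> int:
    requested = int(os.getenv("NUM_CPU_MULTIPROC", str(os.cpu_count())))
    return min(requested, 2 * os.cpu_count())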
""" - # proto repeated field(keys) is of type google._upb._message.RepeatedScalarContainer - # we need to explicitly convert it to list - try: - msgs = self.__map_handler( - list(request.keys), - Datum( - keys=list(request.keys), - value=request.value, - event_time=request.event_time.ToDatetime(), - watermark=request.watermark.ToDatetime(), - ), - ) - except Exception as err: - _LOGGER.critical("UDFError, re-raising the error", exc_info=True) - context.set_code(grpc.StatusCode.UNKNOWN) - context.set_details(str(err)) - return map_pb2.MapResponse(results=[]) - - datums = [] - - for msg in msgs: - datums.append(map_pb2.MapResponse.Result(keys=msg.keys, value=msg.value, tags=msg.tags)) - - return map_pb2.MapResponse(results=datums) + return _map_fn_util(self.__map_handler, request, context) def IsReady( self, request: _empty_pb2.Empty, context: NumaflowServicerContext diff --git a/pynumaflow/mapper/utils.py b/pynumaflow/mapper/utils.py new file mode 100644 index 00000000..42e3c9a4 --- /dev/null +++ b/pynumaflow/mapper/utils.py @@ -0,0 +1,36 @@ +import grpc +from pynumaflow.mapper._dtypes import MapCallable + +from pynumaflow.mapper import Datum +from pynumaflow.mapper.proto import map_pb2 +from pynumaflow.types import NumaflowServicerContext +from pynumaflow._constants import _LOGGER + + +def _map_fn_util( + __map_handler: MapCallable, request: map_pb2.MapRequest, context: NumaflowServicerContext +) -> map_pb2.MapResponse: + # proto repeated field(keys) is of type google._upb._message.RepeatedScalarContainer + # we need to explicitly convert it to list + try: + msgs = __map_handler( + list(request.keys), + Datum( + keys=list(request.keys), + value=request.value, + event_time=request.event_time.ToDatetime(), + watermark=request.watermark.ToDatetime(), + ), + ) + except Exception as err: + _LOGGER.critical("UDFError, re-raising the error", exc_info=True) + context.set_code(grpc.StatusCode.UNKNOWN) + context.set_details(str(err)) + return map_pb2.MapResponse(results=[]) + + datums = [] + + for msg in msgs: + datums.append(map_pb2.MapResponse.Result(keys=msg.keys, value=msg.value, tags=msg.tags)) + + return map_pb2.MapResponse(results=datums) diff --git a/pynumaflow/shared/server.py b/pynumaflow/shared/server.py index 66013def..82b9b30e 100644 --- a/pynumaflow/shared/server.py +++ b/pynumaflow/shared/server.py @@ -1,9 +1,26 @@ +import contextlib +import multiprocessing +import socket from abc import abstractmethod -from concurrent.futures import ThreadPoolExecutor +from concurrent import futures +from collections.abc import Iterator + import grpc -from pynumaflow._constants import MAX_MESSAGE_SIZE, MAX_THREADS, ServerType, _LOGGER -from pynumaflow.info.server import get_sdk_version, write as info_server_write -from pynumaflow.info.types import ServerInfo, Protocol, Language, SERVER_INFO_FILE_PATH +from pynumaflow._constants import ( + MAX_THREADS, + ServerType, + _LOGGER, + MULTIPROC_MAP_SOCK_ADDR, +) +from pynumaflow.exceptions import SocketError +from pynumaflow.info.server import get_sdk_version, write as info_server_write, get_metadata_env +from pynumaflow.info.types import ( + ServerInfo, + Protocol, + Language, + SERVER_INFO_FILE_PATH, + METADATA_ENVS, +) class NumaflowServer: @@ -29,8 +46,9 @@ def start(self): def prepare_server( sock_path: str, server_type: ServerType, - max_message_size=MAX_MESSAGE_SIZE, max_threads=MAX_THREADS, + server_options=None, + process_count=1, ): """ Create a new grpc Server instance. @@ -38,15 +56,18 @@ def prepare_server( The server instance is returned. 
""" - _server_options = [ - ("grpc.max_send_message_length", max_message_size), - ("grpc.max_receive_message_length", max_message_size), - ] - server = grpc.server(ThreadPoolExecutor(max_workers=max_threads), options=_server_options) - if server_type == ServerType.Async: - server = grpc.aio.server(options=_server_options) - server.add_insecure_port(sock_path) - return server + if server_type == ServerType.Sync: + server = _get_sync_server( + bind_address=sock_path, threads_per_proc=max_threads, server_options=server_options + ) + return server + elif server_type == ServerType.Multiproc: + servers, server_ports = get_multiproc_servers( + max_threads=max_threads, + server_options=server_options, + process_count=process_count, + ) + return servers, server_ports def write_info_file(protocol: Protocol) -> None: @@ -61,7 +82,84 @@ def write_info_file(protocol: Protocol) -> None: info_server_write(server_info=serv_info, info_file=SERVER_INFO_FILE_PATH) -async def __serve_async(self, server) -> None: +def start_sync_server(server: grpc.Server): + """ + Starts the Synchronous server instance on the given UNIX socket with given max threads. + Write information about the server to the server info file. + Wait for the server to terminate. + """ + # Start the server + server.start() + # Add the server information to the server info file, + # here we just write the protocol and language information + write_info_file(Protocol.UDS) + # Wait for the server to terminate + server.wait_for_termination() + + +def start_multiproc_server(servers: list, server_ports: list, max_threads: int): + """ + Start N grpc servers in different processes where N = The number of CPUs or the + value of the env var NUM_CPU_MULTIPROC defined by the user. The max value + is set to 2 * CPU count. + Each server will be bound to a different port, and we will create equal number of + workers to handle each server. + On the client side there will be same number of connections as the number of servers. + """ + workers = [] + _LOGGER.info( + "Starting new Multiproc server with num_procs: %s, num_threads/proc: %s", + len(servers), + max_threads, + ) + for server in servers: + # NOTE: It is imperative that the worker subprocesses be forked before + # any gRPC servers start up. See + # https://github.com/grpc/grpc/issues/16001 for more details. + worker = multiprocessing.Process(target=start_sync_server, args=(server,)) + worker.start() + workers.append(worker) + + for port in server_ports: + _LOGGER.info("Starting server on port: %s", port) + + # Convert the available ports to a comma separated string + ports = ",".join(map(str, server_ports)) + + serv_info = ServerInfo( + protocol=Protocol.TCP, + language=Language.PYTHON, + version=get_sdk_version(), + metadata=get_metadata_env(envs=METADATA_ENVS), + ) + # Add the PORTS metadata using the available ports + serv_info.metadata["SERV_PORTS"] = ports + info_server_write(server_info=serv_info, info_file=SERVER_INFO_FILE_PATH) + for worker in workers: + worker.join() + + +async def start_async_server( + server_async: grpc.aio.Server, sock_path: str, max_threads: int, cleanup_coroutines: list +): + """ + Starts the Async server instance on the given UNIX socket with given max threads. + Add the server graceful shutdown coroutine to the cleanup_coroutines list. + Wait for the server to terminate. 
+ """ + await server_async.start() + + # Add the server information to the server info file + # Here we just write the protocol and language information + write_info_file(Protocol.UDS) + + # Log the server start + _LOGGER.info( + "New Async GRPC Server listening on: %s with max threads: %s", + sock_path, + max_threads, + ) + async def server_graceful_shutdown(): """ Shuts down the server with 5 seconds of grace period. During the @@ -69,13 +167,78 @@ async def server_graceful_shutdown(): existing RPCs to continue within the grace period. """ _LOGGER.info("Starting graceful shutdown...") - await server.stop(5) + await server_async.stop(5) + + cleanup_coroutines.append(server_graceful_shutdown()) + await server_async.wait_for_termination() + + +# async def __serve_async(self, server) -> None: +# async def server_graceful_shutdown(): +# """ +# Shuts down the server with 5 seconds of grace period. During the +# grace period, the server won't accept new connections and allow +# existing RPCs to continue within the grace period. +# """ +# _LOGGER.info("Starting graceful shutdown...") +# await server.stop(5) +# +# self.cleanup_coroutines.append(server_graceful_shutdown()) +# await server.wait_for_termination() +# +# +# async def start(self) -> None: +# """Starts the Async gRPC mapper on the given UNIX socket.""" +# server = grpc.aio.server(options=self._server_options) +# await self.__serve_async(server) + + +def _get_sync_server(bind_address: str, threads_per_proc: int, server_options: list) -> grpc.Server: + """Get a new sync grpc server instance.""" + + server = grpc.server( + futures.ThreadPoolExecutor( + max_workers=threads_per_proc, + ), + options=server_options, + ) + server.add_insecure_port(bind_address) + return server + # server.start() + # _LOGGER.info("GRPC Multi-Processor Server listening on: %s %d", bind_address, os.getpid()) + # server.wait_for_termination() + + +@contextlib.contextmanager +def _reserve_port(port_num: int) -> Iterator[int]: + """Find and reserve a port for all subprocesses to use.""" + sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) + sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1) + if sock.getsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR) == 0: + raise SocketError("Failed to set SO_REUSEADDR.") + try: + sock.bind(("", port_num)) + yield sock.getsockname()[1] + finally: + sock.close() + - self.cleanup_coroutines.append(server_graceful_shutdown()) - await server.wait_for_termination() +def get_multiproc_servers(process_count: int, max_threads=MAX_THREADS, server_options=None): + # workers = [] + server_ports = [] + servers = [] + for _ in range(process_count): + # Find a port to bind to for each server, thus sending the port number = 0 + # to the _reserve_port function so that kernel can find and return a free port + with _reserve_port(port_num=0) as port: + bind_address = f"{MULTIPROC_MAP_SOCK_ADDR}:{port}" + server = _get_sync_server( + bind_address=bind_address, + threads_per_proc=max_threads, + server_options=server_options, + ) + servers.append(server) + server_ports.append(port) -async def start(self) -> None: - """Starts the Async gRPC mapper on the given UNIX socket.""" - server = grpc.aio.server(options=self._server_options) - await self.__serve_async(server) + return servers, server_ports From a0ac382c783beeeec0d7ec7d5f2d6cc1905bb077 Mon Sep 17 00:00:00 2001 From: Sidhant Kohli Date: Fri, 5 Jan 2024 16:23:03 -0800 Subject: [PATCH 18/78] multiproc info fix Signed-off-by: Sidhant Kohli --- pynumaflow/mapper/map.py | 13 +++++++----- 
pynumaflow/mapper/server.py | 2 -- pynumaflow/shared/server.py | 40 +++++++++++++++++++++++-------------- 3 files changed, 33 insertions(+), 22 deletions(-) diff --git a/pynumaflow/mapper/map.py b/pynumaflow/mapper/map.py index 5cd74542..5ef109d9 100644 --- a/pynumaflow/mapper/map.py +++ b/pynumaflow/mapper/map.py @@ -11,8 +11,8 @@ prepare_server, NumaflowServer, start_async_server, - start_sync_server, start_multiproc_server, + sync_server_start, ) @@ -50,9 +50,11 @@ def __init__( self._server_options = [ ("grpc.max_send_message_length", self.max_message_size), ("grpc.max_receive_message_length", self.max_message_size), - ("grpc.so_reuseport", 1), - ("grpc.so_reuseaddr", 1), ] + if server_type == ServerType.Multiproc: + self._server_options.append(("grpc.so_reuseport", 1)) + self._server_options.append(("grpc.so_reuseaddr", 1)) + # Set the number of processes to be spawned to the number of CPUs or # the value of the env var NUM_CPU_MULTIPROC defined by the user # Setting the max value to 2 * CPU count @@ -88,7 +90,8 @@ def exec(self): map_servicer = self.get_servicer( mapper_instance=self.mapper_instance, server_type=self.server_type ) - map_pb2_grpc.add_MapServicer_to_server(map_servicer, server) + _LOGGER.info("Starting Sync Map Server...") + map_pb2_grpc.add_MapServicer_to_server(servicer=map_servicer, server=server) # server.start() # write_info_file(Protocol.UDS) # _LOGGER.info( @@ -103,7 +106,7 @@ def exec(self): self.sock_path, self.max_threads, ) - start_sync_server(server=server) + sync_server_start(server=server) def exec_multiproc(self): """ diff --git a/pynumaflow/mapper/server.py b/pynumaflow/mapper/server.py index 5e2b7d45..391f7bb8 100644 --- a/pynumaflow/mapper/server.py +++ b/pynumaflow/mapper/server.py @@ -5,7 +5,6 @@ from pynumaflow import setup_logging from pynumaflow._constants import ( - MAX_MESSAGE_SIZE, MAP_SOCK_PATH, ) from pynumaflow.mapper._dtypes import MapCallable @@ -13,7 +12,6 @@ from pynumaflow.mapper.proto import map_pb2_grpc from pynumaflow.mapper.utils import _map_fn_util from pynumaflow.types import NumaflowServicerContext -from pynumaflow._constants import MAX_THREADS _LOGGER = setup_logging(__name__) if os.getenv("PYTHONDEBUG"): diff --git a/pynumaflow/shared/server.py b/pynumaflow/shared/server.py index 82b9b30e..7373bbad 100644 --- a/pynumaflow/shared/server.py +++ b/pynumaflow/shared/server.py @@ -2,8 +2,8 @@ import multiprocessing import socket from abc import abstractmethod -from concurrent import futures from collections.abc import Iterator +from concurrent.futures import ThreadPoolExecutor import grpc from pynumaflow._constants import ( @@ -82,17 +82,24 @@ def write_info_file(protocol: Protocol) -> None: info_server_write(server_info=serv_info, info_file=SERVER_INFO_FILE_PATH) -def start_sync_server(server: grpc.Server): +def sync_server_start(server: grpc.Server): """ Starts the Synchronous server instance on the given UNIX socket with given max threads. - Write information about the server to the server info file. Wait for the server to terminate. """ - # Start the server - server.start() + start_sync_server_util(server=server) # Add the server information to the server info file, # here we just write the protocol and language information write_info_file(Protocol.UDS) + + +def start_sync_server_util(server: grpc.Server): + """ + Starts the Synchronous server instance on the given UNIX socket with given max threads. + Wait for the server to terminate. 
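# The option split this patch introduces, factored into a helper for
# illustration (build_options is a hypothetical name): the SO_REUSE* flags are
# only needed by the multiproc server, which must rebind ports that were just
# reserved on its behalf.
MAX_MESSAGE_SIZE = 1024 * 1024 * 64  # as in pynumaflow._constants

def build_options(multiproc: bool) -> list:
    opts = [
        ("grpc.max_send_message_length", MAX_MESSAGE_SIZE),
        ("grpc.max_receive_message_length", MAX_MESSAGE_SIZE),
    ]
    if multiproc:
        opts += [("grpc.so_reuseport", 1), ("grpc.so_reuseaddr", 1)]
    return opts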
+ """ + # Start the server + server.start() # Wait for the server to terminate server.wait_for_termination() @@ -116,7 +123,7 @@ def start_multiproc_server(servers: list, server_ports: list, max_threads: int): # NOTE: It is imperative that the worker subprocesses be forked before # any gRPC servers start up. See # https://github.com/grpc/grpc/issues/16001 for more details. - worker = multiprocessing.Process(target=start_sync_server, args=(server,)) + worker = multiprocessing.Process(target=start_sync_server_util, args=(server,)) worker.start() workers.append(worker) @@ -193,16 +200,19 @@ async def server_graceful_shutdown(): # await self.__serve_async(server) -def _get_sync_server(bind_address: str, threads_per_proc: int, server_options: list) -> grpc.Server: +def _get_sync_server(bind_address: str, threads_per_proc: int, server_options: list): """Get a new sync grpc server instance.""" - - server = grpc.server( - futures.ThreadPoolExecutor( - max_workers=threads_per_proc, - ), - options=server_options, - ) - server.add_insecure_port(bind_address) + try: + server = grpc.server( + ThreadPoolExecutor( + max_workers=threads_per_proc, + ), + options=server_options, + ) + server.add_insecure_port(bind_address) + except Exception as err: + _LOGGER.critical("Failed to start server: %s", err, exc_info=True) + raise err return server # server.start() # _LOGGER.info("GRPC Multi-Processor Server listening on: %s %d", bind_address, os.getpid()) From 05a2cfe931cba2bb7059a1c36f4fc335d68d9c2b Mon Sep 17 00:00:00 2001 From: Sidhant Kohli Date: Sun, 7 Jan 2024 23:22:18 -0800 Subject: [PATCH 19/78] modular Signed-off-by: Sidhant Kohli --- pynumaflow/mapper/map.py | 15 ++++++++++++--- pynumaflow/shared/server.py | 15 +++++++++------ 2 files changed, 21 insertions(+), 9 deletions(-) diff --git a/pynumaflow/mapper/map.py b/pynumaflow/mapper/map.py index 5ef109d9..eb063981 100644 --- a/pynumaflow/mapper/map.py +++ b/pynumaflow/mapper/map.py @@ -1,4 +1,5 @@ import os +from concurrent.futures import ThreadPoolExecutor import aiorun import grpc @@ -65,6 +66,7 @@ def __init__( # Get the server instance based on the server type and assign it to self.server # self.server = self.get_server(server_type=server_type, mapper_instance=mapper_instance) + # self.server = prepare_server(self.sock_path, self.max_threads, self._server_options) def start(self) -> None: """ @@ -77,8 +79,6 @@ def start(self) -> None: aiorun.run(self.aexec()) elif self.server_type == ServerType.Multiproc: self.exec_multiproc() - raise NotImplementedError - else: raise NotImplementedError @@ -86,12 +86,21 @@ def exec(self): """ Starts the Synchronous gRPC server on the given UNIX socket with given max threads. 
""" - server = prepare_server(self.sock_path, self.max_threads, self._server_options) + server = prepare_server(sock_path=self.sock_path, server_type=self.server_type, + max_threads=self.max_threads, server_options=self._server_options) map_servicer = self.get_servicer( mapper_instance=self.mapper_instance, server_type=self.server_type ) + # server = grpc.server( + # ThreadPoolExecutor( + # max_workers=self.max_threads, + # ), + # options=self._server_options, + # ) + # self.server.add_insecure_port(self.sock_path) _LOGGER.info("Starting Sync Map Server...") map_pb2_grpc.add_MapServicer_to_server(servicer=map_servicer, server=server) + # server.start() # write_info_file(Protocol.UDS) # _LOGGER.info( diff --git a/pynumaflow/shared/server.py b/pynumaflow/shared/server.py index 7373bbad..ea24cdd3 100644 --- a/pynumaflow/shared/server.py +++ b/pynumaflow/shared/server.py @@ -44,11 +44,11 @@ def start(self): def prepare_server( - sock_path: str, - server_type: ServerType, - max_threads=MAX_THREADS, - server_options=None, - process_count=1, + sock_path: str, + server_type: ServerType, + max_threads=MAX_THREADS, + server_options=None, + process_count=1, ): """ Create a new grpc Server instance. @@ -142,12 +142,13 @@ def start_multiproc_server(servers: list, server_ports: list, max_threads: int): # Add the PORTS metadata using the available ports serv_info.metadata["SERV_PORTS"] = ports info_server_write(server_info=serv_info, info_file=SERVER_INFO_FILE_PATH) + for worker in workers: worker.join() async def start_async_server( - server_async: grpc.aio.Server, sock_path: str, max_threads: int, cleanup_coroutines: list + server_async: grpc.aio.Server, sock_path: str, max_threads: int, cleanup_coroutines: list ): """ Starts the Async server instance on the given UNIX socket with given max threads. @@ -210,6 +211,8 @@ def _get_sync_server(bind_address: str, threads_per_proc: int, server_options: l options=server_options, ) server.add_insecure_port(bind_address) + print("bind_address", bind_address) + _LOGGER.info("Starting new server with bind_address: %s", bind_address) except Exception as err: _LOGGER.critical("Failed to start server: %s", err, exc_info=True) raise err From 3f6049f408c89f55ddf7dd6cae6e7640b2057507 Mon Sep 17 00:00:00 2001 From: Sidhant Kohli Date: Sun, 7 Jan 2024 23:57:01 -0800 Subject: [PATCH 20/78] modular Signed-off-by: Sidhant Kohli --- pynumaflow/shared/server.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/pynumaflow/shared/server.py b/pynumaflow/shared/server.py index ea24cdd3..ab98c41c 100644 --- a/pynumaflow/shared/server.py +++ b/pynumaflow/shared/server.py @@ -87,11 +87,14 @@ def sync_server_start(server: grpc.Server): Starts the Synchronous server instance on the given UNIX socket with given max threads. Wait for the server to terminate. 
""" - start_sync_server_util(server=server) + # start_sync_server_util(server=server) + # Start the server + server.start() # Add the server information to the server info file, # here we just write the protocol and language information write_info_file(Protocol.UDS) - + # Wait for the server to terminate + server.wait_for_termination() def start_sync_server_util(server: grpc.Server): """ @@ -250,7 +253,6 @@ def get_multiproc_servers(process_count: int, max_threads=MAX_THREADS, server_op threads_per_proc=max_threads, server_options=server_options, ) - servers.append(server) server_ports.append(port) From b642b1757bd0b225f3c88efbab57ac1213a02f14 Mon Sep 17 00:00:00 2001 From: Sidhant Kohli Date: Mon, 8 Jan 2024 00:15:25 -0800 Subject: [PATCH 21/78] modular Signed-off-by: Sidhant Kohli --- pynumaflow/mapper/map.py | 24 ++++----- pynumaflow/shared/server.py | 103 ++++++++++++++++++++++-------------- 2 files changed, 76 insertions(+), 51 deletions(-) diff --git a/pynumaflow/mapper/map.py b/pynumaflow/mapper/map.py index eb063981..6f8041a5 100644 --- a/pynumaflow/mapper/map.py +++ b/pynumaflow/mapper/map.py @@ -121,23 +121,23 @@ def exec_multiproc(self): """ Starts the gRPC server on the given UNIX socket with given max threads. """ - servers, server_ports = prepare_server( - server_type=self.server_type, - max_threads=self.max_threads, - server_options=self._server_options, - process_count=self._process_count, - sock_path=self.sock_path, - ) + # servers, server_ports = prepare_server( + # server_type=self.server_type, + # max_threads=self.max_threads, + # server_options=self._server_options, + # process_count=self._process_count, + # sock_path=self.sock_path, + # ) map_servicer = self.get_servicer( mapper_instance=self.mapper_instance, server_type=self.server_type ) - for server in servers: - map_pb2_grpc.add_MapServicer_to_server(map_servicer, server) + # for server in servers: + # map_pb2_grpc.add_MapServicer_to_server(map_servicer, server) - start_multiproc_server( - servers=servers, server_ports=server_ports, max_threads=self.max_threads - ) + start_multiproc_server(max_threads=self.max_threads, servicer=map_servicer, + process_count=self._process_count, + server_options=self._server_options) # server.start() # write_info_file(Protocol.UDS) # _LOGGER.info( diff --git a/pynumaflow/shared/server.py b/pynumaflow/shared/server.py index ab98c41c..1cae1873 100644 --- a/pynumaflow/shared/server.py +++ b/pynumaflow/shared/server.py @@ -1,5 +1,6 @@ import contextlib import multiprocessing +import os import socket from abc import abstractmethod from collections.abc import Iterator @@ -21,6 +22,7 @@ SERVER_INFO_FILE_PATH, METADATA_ENVS, ) +from pynumaflow.mapper.proto import map_pb2_grpc class NumaflowServer: @@ -61,13 +63,13 @@ def prepare_server( bind_address=sock_path, threads_per_proc=max_threads, server_options=server_options ) return server - elif server_type == ServerType.Multiproc: - servers, server_ports = get_multiproc_servers( - max_threads=max_threads, - server_options=server_options, - process_count=process_count, - ) - return servers, server_ports + # elif server_type == ServerType.Multiproc: + # servers, server_ports = get_multiproc_servers( + # max_threads=max_threads, + # server_options=server_options, + # process_count=process_count, + # ) + # return servers, server_ports def write_info_file(protocol: Protocol) -> None: @@ -96,6 +98,7 @@ def sync_server_start(server: grpc.Server): # Wait for the server to terminate server.wait_for_termination() + def start_sync_server_util(server: 
grpc.Server): """ Starts the Synchronous server instance on the given UNIX socket with given max threads. @@ -107,7 +110,23 @@ def start_sync_server_util(server: grpc.Server): server.wait_for_termination() -def start_multiproc_server(servers: list, server_ports: list, max_threads: int): +def _run_server(servicer, bind_address: str, threads_per_proc, server_options) -> None: + """Start a server in a subprocess.""" + server = grpc.server( + ThreadPoolExecutor( + max_workers=threads_per_proc, + ), + options=server_options, + ) + map_pb2_grpc.add_MapServicer_to_server(servicer, server) + server.add_insecure_port(bind_address) + server.start() + _LOGGER.info("GRPC Server listening on: %s %d", bind_address, os.getpid()) + server.wait_for_termination() + + +def start_multiproc_server(max_threads: int, servicer, + process_count: int, server_options=None): """ Start N grpc servers in different processes where N = The number of CPUs or the value of the env var NUM_CPU_MULTIPROC defined by the user. The max value @@ -116,22 +135,28 @@ def start_multiproc_server(servers: list, server_ports: list, max_threads: int): workers to handle each server. On the client side there will be same number of connections as the number of servers. """ - workers = [] + _LOGGER.info( - "Starting new Multiproc server with num_procs: %s, num_threads/proc: %s", - len(servers), + "Starting new Multiproc server with num_procs: %s, num_threads per proc: %s", + process_count, max_threads, ) - for server in servers: - # NOTE: It is imperative that the worker subprocesses be forked before - # any gRPC servers start up. See - # https://github.com/grpc/grpc/issues/16001 for more details. - worker = multiprocessing.Process(target=start_sync_server_util, args=(server,)) - worker.start() - workers.append(worker) - - for port in server_ports: - _LOGGER.info("Starting server on port: %s", port) + workers = [] + server_ports = [] + for _ in range(process_count): + # Find a port to bind to for each server, thus sending the port number = 0 + # to the _reserve_port function so that kernel can find and return a free port + with _reserve_port(port_num=0) as port: + bind_address = f"{MULTIPROC_MAP_SOCK_ADDR}:{port}" + _LOGGER.info("Starting server on port: %s", port) + # NOTE: It is imperative that the worker subprocesses be forked before + # any gRPC servers start up. See + # https://github.com/grpc/grpc/issues/16001 for more details. 
+ worker = multiprocessing.Process(target=_run_server, args=(servicer, bind_address, + max_threads, server_options)) + worker.start() + workers.append(worker) + server_ports.append(port) # Convert the available ports to a comma separated string ports = ",".join(map(str, server_ports)) @@ -239,21 +264,21 @@ def _reserve_port(port_num: int) -> Iterator[int]: sock.close() -def get_multiproc_servers(process_count: int, max_threads=MAX_THREADS, server_options=None): - # workers = [] - server_ports = [] - servers = [] - for _ in range(process_count): - # Find a port to bind to for each server, thus sending the port number = 0 - # to the _reserve_port function so that kernel can find and return a free port - with _reserve_port(port_num=0) as port: - bind_address = f"{MULTIPROC_MAP_SOCK_ADDR}:{port}" - server = _get_sync_server( - bind_address=bind_address, - threads_per_proc=max_threads, - server_options=server_options, - ) - servers.append(server) - server_ports.append(port) - - return servers, server_ports +# def get_multiproc_servers(process_count: int, max_threads=MAX_THREADS, server_options=None): +# # workers = [] +# server_ports = [] +# servers = [] +# for _ in range(process_count): +# # Find a port to bind to for each server, thus sending the port number = 0 +# # to the _reserve_port function so that kernel can find and return a free port +# with _reserve_port(port_num=0) as port: +# bind_address = f"{MULTIPROC_MAP_SOCK_ADDR}:{port}" +# server = _get_sync_server( +# bind_address=bind_address, +# threads_per_proc=max_threads, +# server_options=server_options, +# ) +# servers.append(server) +# server_ports.append(port) +# +# return servers, server_ports From 40bb29590c57ec8c94432dd6603280c1219ea496 Mon Sep 17 00:00:00 2001 From: Sidhant Kohli Date: Mon, 8 Jan 2024 00:37:54 -0800 Subject: [PATCH 22/78] modular Signed-off-by: Sidhant Kohli --- pynumaflow/mapper/map.py | 43 +++++++++++++++++++++---------------- pynumaflow/shared/server.py | 42 ++++++++++++++++++++++++------------ 2 files changed, 52 insertions(+), 33 deletions(-) diff --git a/pynumaflow/mapper/map.py b/pynumaflow/mapper/map.py index 6f8041a5..0b9630b4 100644 --- a/pynumaflow/mapper/map.py +++ b/pynumaflow/mapper/map.py @@ -23,12 +23,12 @@ class MapServer(NumaflowServer): """ def __init__( - self, - mapper_instance: MapCallable, - sock_path=MAP_SOCK_PATH, - max_message_size=MAX_MESSAGE_SIZE, - max_threads=MAX_THREADS, - server_type=ServerType.Sync, + self, + mapper_instance: MapCallable, + sock_path=MAP_SOCK_PATH, + max_message_size=MAX_MESSAGE_SIZE, + max_threads=MAX_THREADS, + server_type=ServerType.Sync, ): """ Create a new grpc Server instance. @@ -86,20 +86,20 @@ def exec(self): """ Starts the Synchronous gRPC server on the given UNIX socket with given max threads. 
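# With the servicer built once and handed to start_multiproc_server, running
# the multiproc flavor becomes symmetric with the sync one. A sketch, with an
# illustrative handler:
from pynumaflow.mapper import Datum, Message, Messages, ServerType
from pynumaflow.mapper.map import MapServer

def echo(keys: list[str], datum: Datum) -> Messages:
    return Messages(Message(datum.value, keys=keys))

if __name__ == "__main__":
    # start() routes Multiproc to exec_multiproc(), which forks the workers.
    MapServer(mapper_instance=echo, server_type=ServerType.Multiproc).start()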
""" - server = prepare_server(sock_path=self.sock_path, server_type=self.server_type, - max_threads=self.max_threads, server_options=self._server_options) + # server = prepare_server(sock_path=self.sock_path, server_type=self.server_type, + # max_threads=self.max_threads, server_options=self._server_options) map_servicer = self.get_servicer( mapper_instance=self.mapper_instance, server_type=self.server_type ) - # server = grpc.server( - # ThreadPoolExecutor( - # max_workers=self.max_threads, - # ), - # options=self._server_options, - # ) - # self.server.add_insecure_port(self.sock_path) - _LOGGER.info("Starting Sync Map Server...") - map_pb2_grpc.add_MapServicer_to_server(servicer=map_servicer, server=server) + # # server = grpc.server( + # # ThreadPoolExecutor( + # # max_workers=self.max_threads, + # # ), + # # options=self._server_options, + # # ) + # # self.server.add_insecure_port(self.sock_path) + # _LOGGER.info("Starting Sync Map Server...") + # map_pb2_grpc.add_MapServicer_to_server(servicer=map_servicer, server=server) # server.start() # write_info_file(Protocol.UDS) @@ -115,7 +115,12 @@ def exec(self): self.sock_path, self.max_threads, ) - sync_server_start(server=server) + # sync_server_start(server=server) + sync_server_start(servicer=map_servicer, + bind_address=self.sock_path, + max_threads=self.max_threads, + server_options=self._server_options, + udf_type="Map") def exec_multiproc(self): """ @@ -137,7 +142,7 @@ def exec_multiproc(self): start_multiproc_server(max_threads=self.max_threads, servicer=map_servicer, process_count=self._process_count, - server_options=self._server_options) + server_options=self._server_options, udf_type="Map") # server.start() # write_info_file(Protocol.UDS) # _LOGGER.info( diff --git a/pynumaflow/shared/server.py b/pynumaflow/shared/server.py index 1cae1873..bb6e356e 100644 --- a/pynumaflow/shared/server.py +++ b/pynumaflow/shared/server.py @@ -84,19 +84,27 @@ def write_info_file(protocol: Protocol) -> None: info_server_write(server_info=serv_info, info_file=SERVER_INFO_FILE_PATH) -def sync_server_start(server: grpc.Server): +def sync_server_start(servicer, bind_address: str, max_threads: int, + server_options=None, udf_type: str = "Map"): """ Starts the Synchronous server instance on the given UNIX socket with given max threads. Wait for the server to terminate. 
""" - # start_sync_server_util(server=server) - # Start the server - server.start() - # Add the server information to the server info file, - # here we just write the protocol and language information - write_info_file(Protocol.UDS) - # Wait for the server to terminate - server.wait_for_termination() + # # start_sync_server_util(server=server) + # # Start the server + # server.start() + # # Add the server information to the server info file, + # # here we just write the protocol and language information + # write_info_file(Protocol.UDS) + # # Wait for the server to terminate + # server.wait_for_termination() + server_info = ServerInfo( + protocol=Protocol.UDS, + language=Language.PYTHON, + version=get_sdk_version(), + ) + _run_server(servicer=servicer, bind_address=bind_address, threads_per_proc=max_threads, + server_options=server_options, udf_type=udf_type, server_info=server_info) def start_sync_server_util(server: grpc.Server): @@ -110,7 +118,8 @@ def start_sync_server_util(server: grpc.Server): server.wait_for_termination() -def _run_server(servicer, bind_address: str, threads_per_proc, server_options) -> None: +def _run_server(servicer, bind_address: str, threads_per_proc, server_options, + udf_type: str, server_info=None) -> None: """Start a server in a subprocess.""" server = grpc.server( ThreadPoolExecutor( @@ -118,15 +127,21 @@ def _run_server(servicer, bind_address: str, threads_per_proc, server_options) - ), options=server_options, ) - map_pb2_grpc.add_MapServicer_to_server(servicer, server) + if udf_type == "Map": + map_pb2_grpc.add_MapServicer_to_server(servicer, server) server.add_insecure_port(bind_address) server.start() + + if server_info: + info_server_write(server_info=server_info, info_file=SERVER_INFO_FILE_PATH) + _LOGGER.info("GRPC Server listening on: %s %d", bind_address, os.getpid()) server.wait_for_termination() def start_multiproc_server(max_threads: int, servicer, - process_count: int, server_options=None): + process_count: int, server_options=None, + udf_type: str = "Map"): """ Start N grpc servers in different processes where N = The number of CPUs or the value of the env var NUM_CPU_MULTIPROC defined by the user. The max value @@ -153,7 +168,7 @@ def start_multiproc_server(max_threads: int, servicer, # any gRPC servers start up. See # https://github.com/grpc/grpc/issues/16001 for more details. 
worker = multiprocessing.Process(target=_run_server, args=(servicer, bind_address, - max_threads, server_options)) + max_threads, server_options, udf_type)) worker.start() workers.append(worker) server_ports.append(port) @@ -263,7 +278,6 @@ def _reserve_port(port_num: int) -> Iterator[int]: finally: sock.close() - # def get_multiproc_servers(process_count: int, max_threads=MAX_THREADS, server_options=None): # # workers = [] # server_ports = [] From e3b7090410cf765ca6270c2717e75560aea48038 Mon Sep 17 00:00:00 2001 From: Sidhant Kohli Date: Mon, 8 Jan 2024 10:42:53 -0800 Subject: [PATCH 23/78] mapstream Signed-off-by: Sidhant Kohli --- pynumaflow/mapper/async_server.py | 20 -- pynumaflow/mapper/example.py | 41 ---- pynumaflow/mapper/map.py | 104 ++-------- pynumaflow/mapper/multiproc_server.py | 98 --------- pynumaflow/mapper/server.py | 5 - pynumaflow/mapper/test.py | 53 +++++ pynumaflow/mapstreamer/__init__.py | 13 ++ pynumaflow/mapstreamer/_dtypes.py | 191 ++++++++++++++++++ pynumaflow/mapstreamer/async_server.py | 100 +++++++++ pynumaflow/mapstreamer/mapstream.py | 72 +++++++ pynumaflow/mapstreamer/proto/__init__.py | 0 pynumaflow/mapstreamer/proto/mapstream.proto | 44 ++++ pynumaflow/mapstreamer/proto/mapstream_pb2.py | 38 ++++ .../mapstreamer/proto/mapstream_pb2_grpc.py | 125 ++++++++++++ pynumaflow/shared/server.py | 131 +++++------- 15 files changed, 708 insertions(+), 327 deletions(-) delete mode 100644 pynumaflow/mapper/example.py delete mode 100644 pynumaflow/mapper/multiproc_server.py create mode 100644 pynumaflow/mapper/test.py create mode 100644 pynumaflow/mapstreamer/__init__.py create mode 100644 pynumaflow/mapstreamer/_dtypes.py create mode 100644 pynumaflow/mapstreamer/async_server.py create mode 100644 pynumaflow/mapstreamer/mapstream.py create mode 100644 pynumaflow/mapstreamer/proto/__init__.py create mode 100644 pynumaflow/mapstreamer/proto/mapstream.proto create mode 100644 pynumaflow/mapstreamer/proto/mapstream_pb2.py create mode 100644 pynumaflow/mapstreamer/proto/mapstream_pb2_grpc.py diff --git a/pynumaflow/mapper/async_server.py b/pynumaflow/mapper/async_server.py index 0ab1abed..d6c86f23 100644 --- a/pynumaflow/mapper/async_server.py +++ b/pynumaflow/mapper/async_server.py @@ -7,10 +7,6 @@ from google.protobuf import empty_pb2 as _empty_pb2 from pynumaflow import setup_logging -from pynumaflow._constants import ( - MAX_MESSAGE_SIZE, - MAP_SOCK_PATH, -) from pynumaflow.mapper import Datum from pynumaflow.mapper._dtypes import MapAsyncCallable, MapCallable from pynumaflow.mapper.proto import map_pb2 @@ -57,24 +53,8 @@ class AsyncMapper(map_pb2_grpc.MapServicer): def __init__( self, handler: MapAsyncCallable, - sock_path=MAP_SOCK_PATH, - max_message_size=MAX_MESSAGE_SIZE, - max_threads=MAX_THREADS, ): self.__map_handler: MapCallable = handler - self.sock_path = f"unix://{sock_path}" - self._max_message_size = max_message_size - self._max_threads = max_threads - self.cleanup_coroutines = [] - # Collection for storing strong references to all running tasks. - # Event loop only keeps a weak reference, which can cause it to - # get lost during execution. 
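# The comment being deleted above records a real asyncio pitfall: the event
# loop holds only weak references to tasks, so a fire-and-forget task can be
# garbage-collected mid-flight. The standard remedy, standalone:
import asyncio

background_tasks: set = set()

def spawn(coro) -> asyncio.Task:
    task = asyncio.create_task(coro)
    background_tasks.add(task)  # strong reference keeps the task alive
    task.add_done_callback(background_tasks.discard)
    return task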
- self.background_tasks = set() - - self._server_options = [ - ("grpc.max_send_message_length", self._max_message_size), - ("grpc.max_receive_message_length", self._max_message_size), - ] async def MapFn( self, request: map_pb2.MapRequest, context: NumaflowServicerContext diff --git a/pynumaflow/mapper/example.py b/pynumaflow/mapper/example.py deleted file mode 100644 index c32d4ff7..00000000 --- a/pynumaflow/mapper/example.py +++ /dev/null @@ -1,41 +0,0 @@ -""" -Write a class which implements the MapperClass interface. -""" -from pynumaflow.mapper import ServerType -from pynumaflow.mapper import Datum, Messages, Message -from pynumaflow.mapper.map import MapServer -from pynumaflow.mapper._dtypes import MapperClass - - -class ExampleMapperClass(MapperClass): - """ - Provides an interface to write a Mapper - """ - - def handler(self, keys: [str], datum: Datum) -> Messages: - """ - Write a handler function which implements the MapCallable interface. - """ - val = datum.value - _ = datum.event_time - _ = datum.watermark - messages = Messages(Message(val, keys=keys)) - return messages - - -def handler_new(keys: [str], datum: Datum) -> Messages: - """ - Write a handler function which implements the MapCallable interface. - """ - val = datum.value - _ = datum.event_time - _ = datum.watermark - messages = Messages(Message(val, keys=keys)) - return messages - - -# Write a main function to create a new MapServer instance. -if __name__ == "__main__": - map_instance = ExampleMapperClass() - grpc_server = MapServer(mapper_instance=map_instance, server_type=ServerType.Async) - grpc_server.start() diff --git a/pynumaflow/mapper/map.py b/pynumaflow/mapper/map.py index 0b9630b4..323a92d1 100644 --- a/pynumaflow/mapper/map.py +++ b/pynumaflow/mapper/map.py @@ -1,5 +1,4 @@ import os -from concurrent.futures import ThreadPoolExecutor import aiorun import grpc @@ -9,7 +8,6 @@ from pynumaflow.mapper._dtypes import MapCallable from pynumaflow.mapper.proto import map_pb2_grpc from pynumaflow.shared.server import ( - prepare_server, NumaflowServer, start_async_server, start_multiproc_server, @@ -23,12 +21,12 @@ class MapServer(NumaflowServer): """ def __init__( - self, - mapper_instance: MapCallable, - sock_path=MAP_SOCK_PATH, - max_message_size=MAX_MESSAGE_SIZE, - max_threads=MAX_THREADS, - server_type=ServerType.Sync, + self, + mapper_instance: MapCallable, + sock_path=MAP_SOCK_PATH, + max_message_size=MAX_MESSAGE_SIZE, + max_threads=MAX_THREADS, + server_type=ServerType.Sync, ): """ Create a new grpc Server instance. @@ -45,8 +43,6 @@ def __init__( self.mapper_instance = mapper_instance self.server_type = server_type - self.background_tasks = set() - self.cleanup_coroutines = [] self._server_options = [ ("grpc.max_send_message_length", self.max_message_size), @@ -64,10 +60,6 @@ def __init__( int(os.getenv("NUM_CPU_MULTIPROC", str(os.cpu_count()))), 2 * os.cpu_count() ) - # Get the server instance based on the server type and assign it to self.server - # self.server = self.get_server(server_type=server_type, mapper_instance=mapper_instance) - # self.server = prepare_server(self.sock_path, self.max_threads, self._server_options) - def start(self) -> None: """ Starts the gRPC server on the given UNIX socket with given max threads. @@ -86,71 +78,37 @@ def exec(self): """ Starts the Synchronous gRPC server on the given UNIX socket with given max threads. 
""" - # server = prepare_server(sock_path=self.sock_path, server_type=self.server_type, - # max_threads=self.max_threads, server_options=self._server_options) map_servicer = self.get_servicer( mapper_instance=self.mapper_instance, server_type=self.server_type ) - # # server = grpc.server( - # # ThreadPoolExecutor( - # # max_workers=self.max_threads, - # # ), - # # options=self._server_options, - # # ) - # # self.server.add_insecure_port(self.sock_path) - # _LOGGER.info("Starting Sync Map Server...") - # map_pb2_grpc.add_MapServicer_to_server(servicer=map_servicer, server=server) - - # server.start() - # write_info_file(Protocol.UDS) - # _LOGGER.info( - # "Sync GRPC Server listening on: %s with max threads: %s", - # self.sock_path, - # self.max_threads, - # ) - # server.wait_for_termination() - # Log the server start _LOGGER.info( "Sync GRPC Server listening on: %s with max threads: %s", self.sock_path, self.max_threads, ) # sync_server_start(server=server) - sync_server_start(servicer=map_servicer, - bind_address=self.sock_path, - max_threads=self.max_threads, - server_options=self._server_options, - udf_type="Map") + sync_server_start( + servicer=map_servicer, + bind_address=self.sock_path, + max_threads=self.max_threads, + server_options=self._server_options, + udf_type="Map", + ) def exec_multiproc(self): """ Starts the gRPC server on the given UNIX socket with given max threads. """ - # servers, server_ports = prepare_server( - # server_type=self.server_type, - # max_threads=self.max_threads, - # server_options=self._server_options, - # process_count=self._process_count, - # sock_path=self.sock_path, - # ) - map_servicer = self.get_servicer( mapper_instance=self.mapper_instance, server_type=self.server_type ) - # for server in servers: - # map_pb2_grpc.add_MapServicer_to_server(map_servicer, server) - - start_multiproc_server(max_threads=self.max_threads, servicer=map_servicer, - process_count=self._process_count, - server_options=self._server_options, udf_type="Map") - # server.start() - # write_info_file(Protocol.UDS) - # _LOGGER.info( - # "Sync GRPC Server listening on: %s with max threads: %s", - # self.sock_path, - # self.max_threads, - # ) - # server.wait_for_termination() + start_multiproc_server( + max_threads=self.max_threads, + servicer=map_servicer, + process_count=self._process_count, + server_options=self._server_options, + udf_type="Map", + ) async def aexec(self) -> None: """ @@ -165,28 +123,6 @@ async def aexec(self) -> None: await start_async_server(server_new, self.sock_path, self.max_threads, self._server_options) - # await server_new.start() - # - # write_info_file(Protocol.UDS) - # _LOGGER.info( - # "Async Map New GRPC Server listening on: %s with max threads: %s", - # self.sock_path, - # self.max_threads, - # ) - # - # async def server_graceful_shutdown(): - # """ - # Shuts down the server with 5 seconds of grace period. During the - # grace period, the server won't accept new connections and allow - # existing RPCs to continue within the grace period. 
- # """ - # _LOGGER.info("Starting graceful shutdown...") - # await server_new.stop(5) - # - # self.cleanup_coroutines.append(server_graceful_shutdown()) - # # asyncio.run_coroutine_threadsafe(self.server.wait_for_termination(), _loop) - # await server_new.wait_for_termination() - def get_servicer(self, mapper_instance: MapCallable, server_type: ServerType): if server_type == ServerType.Sync: map_servicer = Mapper(handler=mapper_instance) diff --git a/pynumaflow/mapper/multiproc_server.py b/pynumaflow/mapper/multiproc_server.py deleted file mode 100644 index 72f69891..00000000 --- a/pynumaflow/mapper/multiproc_server.py +++ /dev/null @@ -1,98 +0,0 @@ -import logging -import os - -from google.protobuf import empty_pb2 as _empty_pb2 - -from pynumaflow import setup_logging -from pynumaflow._constants import ( - MAX_MESSAGE_SIZE, - MAP_SOCK_PATH, - MAX_THREADS, -) -from pynumaflow.mapper._dtypes import MapCallable -from pynumaflow.mapper.proto import map_pb2 -from pynumaflow.mapper.proto import map_pb2_grpc -from pynumaflow.mapper.utils import _map_fn_util -from pynumaflow.types import NumaflowServicerContext - -_LOGGER = setup_logging(__name__) -if os.getenv("PYTHONDEBUG"): - _LOGGER.setLevel(logging.DEBUG) - - -class MultiProcMapper(map_pb2_grpc.MapServicer): - """ - Provides an interface to write a Multi Proc Mapper - which will be exposed over gRPC. - - Args: - handler: Function callable following the type signature of MapCallable - max_message_size: The max message size in bytes the server can receive and send - - Example invocation: - >>> from typing import Iterator - >>> from pynumaflow.mapper import Messages, Message \ - ... Datum, MultiProcMapper - ... - >>> def map_handler(keys: list[str], datum: Datum) -> Messages: - ... val = datum.value - ... _ = datum.event_time - ... _ = datum.watermark - ... messages = Messages(Message(val, keys=keys)) - ... return messages - ... - >>> grpc_server = MultiProcMapper(handler=map_handler) - >>> grpc_server.start() - """ - - __slots__ = ( - "__map_handler", - "_max_message_size", - "_server_options", - "_process_count", - "_threads_per_proc", - ) - - def __init__( - self, - handler: MapCallable, - sock_path=MAP_SOCK_PATH, - max_message_size=MAX_MESSAGE_SIZE, - max_threads=MAX_THREADS, - ): - self.__map_handler: MapCallable = handler - self.sock_path = f"unix://{sock_path}" - self._max_message_size = max_message_size - self._max_threads = max_threads - - self._server_options = [ - ("grpc.max_send_message_length", self._max_message_size), - ("grpc.max_receive_message_length", self._max_message_size), - ("grpc.so_reuseport", 1), - ("grpc.so_reuseaddr", 1), - ] - # Set the number of processes to be spawned to the number of CPUs or - # the value of the env var NUM_CPU_MULTIPROC defined by the user - # Setting the max value to 2 * CPU count - self._process_count = min( - int(os.getenv("NUM_CPU_MULTIPROC", str(os.cpu_count()))), 2 * os.cpu_count() - ) - self._threads_per_proc = int(os.getenv("MAX_THREADS", "4")) - - def MapFn( - self, request: map_pb2.MapRequest, context: NumaflowServicerContext - ) -> map_pb2.MapResponse: - """ - Applies a function to each datum element. - The pascal case function name comes from the proto map_pb2_grpc.py file. - """ - return _map_fn_util(self.__map_handler, request, context) - - def IsReady( - self, request: _empty_pb2.Empty, context: NumaflowServicerContext - ) -> map_pb2.ReadyResponse: - """ - IsReady is the heartbeat endpoint for gRPC. - The pascal case function name comes from the proto map_pb2_grpc.py file. 
- """ - return map_pb2.ReadyResponse(ready=True) diff --git a/pynumaflow/mapper/server.py b/pynumaflow/mapper/server.py index 391f7bb8..69b78d8b 100644 --- a/pynumaflow/mapper/server.py +++ b/pynumaflow/mapper/server.py @@ -4,9 +4,6 @@ from google.protobuf import empty_pb2 as _empty_pb2 from pynumaflow import setup_logging -from pynumaflow._constants import ( - MAP_SOCK_PATH, -) from pynumaflow.mapper._dtypes import MapCallable from pynumaflow.mapper.proto import map_pb2 from pynumaflow.mapper.proto import map_pb2_grpc @@ -48,10 +45,8 @@ class Mapper(map_pb2_grpc.MapServicer): def __init__( self, handler: MapCallable, - sock_path=MAP_SOCK_PATH, ): self.__map_handler: MapCallable = handler - self.sock_path = f"unix://{sock_path}" def MapFn( self, request: map_pb2.MapRequest, context: NumaflowServicerContext diff --git a/pynumaflow/mapper/test.py b/pynumaflow/mapper/test.py new file mode 100644 index 00000000..cd5f0899 --- /dev/null +++ b/pynumaflow/mapper/test.py @@ -0,0 +1,53 @@ +import asyncio +from abc import ABCMeta, abstractmethod + +import aiorun + + +class MapperClass(metaclass=ABCMeta): + """ + Provides an interface to write a Mapper + which will be exposed over a Synchronous gRPC server. + + Args: + + """ + + def __call__(self, *args, **kwargs): + """ + Allow to call handler function directly if class instance is sent + """ + return self.handler(*args, **kwargs) + + @abstractmethod + def handler(self, key: str) -> int: + """ + Write a handler function which implements the MapCallable interface. + """ + pass + + +class Sync(MapperClass): + def handler(self, key: str) -> int: + print("Sync") + return 1 + + +class Async(MapperClass): + async def handler(self, key: str) -> int: + await self.hey(key) + + async def hey(self, key): + print(key) + await asyncio.sleep(10) + return 1 + # return await self.handler( + + +if __name__ == "__main__": + hello = Sync() + print(hello.handler("hello")) + hello = Async() + aiorun.run(hello.handler("hello")) + hello2 = Async() + aiorun.run(hello2.handler("hello2")) diff --git a/pynumaflow/mapstreamer/__init__.py b/pynumaflow/mapstreamer/__init__.py new file mode 100644 index 00000000..be47226b --- /dev/null +++ b/pynumaflow/mapstreamer/__init__.py @@ -0,0 +1,13 @@ +from pynumaflow._constants import DROP + +from pynumaflow.mapstreamer._dtypes import Message, Messages, Datum, MapStreamerClass +from pynumaflow.mapstreamer.mapstream import MapStreamServer + +__all__ = [ + "Message", + "Messages", + "Datum", + "DROP", + "MapStreamServer", + "MapStreamerClass", +] diff --git a/pynumaflow/mapstreamer/_dtypes.py b/pynumaflow/mapstreamer/_dtypes.py new file mode 100644 index 00000000..25b7e444 --- /dev/null +++ b/pynumaflow/mapstreamer/_dtypes.py @@ -0,0 +1,191 @@ +from abc import ABCMeta, abstractmethod +from collections.abc import Iterator, Sequence +from dataclasses import dataclass +from datetime import datetime +from typing import TypeVar, Callable, Union +from collections.abc import AsyncIterable +from warnings import warn + +from pynumaflow._constants import DROP + +M = TypeVar("M", bound="Message") +Ms = TypeVar("Ms", bound="Messages") + + +@dataclass(init=False) +class Message: + """ + Basic datatype for data passing to the next vertex/vertices. 
+ + Args: + value: data in bytes + keys: []string keys for vertex (optional) + tags: []string tags for conditional forwarding (optional) + """ + + __slots__ = ("_value", "_keys", "_tags") + + _value: bytes + _keys: list[str] + _tags: list[str] + + def __init__(self, value: bytes, keys: list[str] = None, tags: list[str] = None): + """ + Creates a Message object to send value to a vertex. + """ + self._keys = keys or [] + self._tags = tags or [] + self._value = value or b"" + + # returns the Message Object which will be dropped + @classmethod + def to_drop(cls: type[M]) -> M: + return cls(b"", None, [DROP]) + + @property + def value(self) -> bytes: + return self._value + + @property + def keys(self) -> list[str]: + return self._keys + + @property + def tags(self) -> list[str]: + return self._tags + + +class Messages(Sequence[M]): + """ + Class to define a list of Message objects. + + Args: + messages: list of Message objects. + """ + + __slots__ = ("_messages",) + + def __init__(self, *messages: M): + self._messages = list(messages) or [] + + def __str__(self) -> str: + return str(self._messages) + + def __repr__(self) -> str: + return str(self) + + def __len__(self) -> int: + return len(self._messages) + + def __iter__(self) -> Iterator[M]: + return iter(self._messages) + + def __getitem__(self, index: int) -> M: + if isinstance(index, slice): + raise TypeError("Slicing is not supported for Messages") + return self._messages[index] + + def append(self, message: Message) -> None: + self._messages.append(message) + + def items(self) -> list[Message]: + warn( + "Using items is deprecated and will be removed in v0.5. " + "Iterate or index the Messages object instead.", + DeprecationWarning, + stacklevel=2, + ) + return self._messages + + +@dataclass(init=False) +class Datum: + """ + Class to define the important information for the event. + Args: + keys: the keys of the event. + value: the payload of the event. + event_time: the event time of the event. + watermark: the watermark of the event. + >>> # Example usage + >>> from pynumaflow.mapstreamer import Datum + >>> from datetime import datetime, timezone + >>> payload = bytes("test_mock_message", encoding="utf-8") + >>> t1 = datetime.fromtimestamp(1662998400, timezone.utc) + >>> t2 = datetime.fromtimestamp(1662998460, timezone.utc) + >>> d = Datum( + ... keys=["test_key"], + ... value=payload, + ... event_time=t1, + ... watermark=t2, + ... 
) + """ + + __slots__ = ("_keys", "_value", "_event_time", "_watermark") + + _keys: list[str] + _value: bytes + _event_time: datetime + _watermark: datetime + + def __init__( + self, + keys: list[str], + value: bytes, + event_time: datetime, + watermark: datetime, + ): + self._keys = keys or list() + self._value = value or b"" + if not isinstance(event_time, datetime): + raise TypeError(f"Wrong data type: {type(event_time)} for Datum.event_time") + self._event_time = event_time + if not isinstance(watermark, datetime): + raise TypeError(f"Wrong data type: {type(watermark)} for Datum.watermark") + self._watermark = watermark + + def keys(self) -> list[str]: + """Returns the keys of the event""" + return self._keys + + @property + def value(self) -> bytes: + """Returns the value of the event.""" + return self._value + + @property + def event_time(self) -> datetime: + """Returns the event time of the event.""" + return self._event_time + + @property + def watermark(self) -> datetime: + """Returns the watermark of the event.""" + return self._watermark + + +class MapStreamerClass(metaclass=ABCMeta): + """ + Provides an interface to write a Map Streamer + which will be exposed over a gRPC server. + + Args: + + """ + + def __call__(self, *args, **kwargs): + """ + Allow to call handler function directly if class instance is sent + """ + return self.handler(*args, **kwargs) + + @abstractmethod + async def handler(self, keys: list[str], datum: Datum) -> AsyncIterable[Message]: + """ + Write a handler function which implements the MapCallable interface. + """ + pass + + +MapStreamAsyncCallable = Callable[[list[str], Datum], AsyncIterable[Message]] +MapStreamCallable = Union[MapStreamerClass, MapStreamAsyncCallable] diff --git a/pynumaflow/mapstreamer/async_server.py b/pynumaflow/mapstreamer/async_server.py new file mode 100644 index 00000000..7556c19e --- /dev/null +++ b/pynumaflow/mapstreamer/async_server.py @@ -0,0 +1,100 @@ +import logging +import multiprocessing +import os + +from collections.abc import AsyncIterable + +from google.protobuf import empty_pb2 as _empty_pb2 + +from pynumaflow import setup_logging +from pynumaflow._constants import ( + MAX_MESSAGE_SIZE, + MAP_STREAM_SOCK_PATH, +) +from pynumaflow.mapstreamer import Datum +from pynumaflow.mapstreamer._dtypes import MapStreamCallable +from pynumaflow.mapstreamer.proto import mapstream_pb2 +from pynumaflow.mapstreamer.proto import mapstream_pb2_grpc +from pynumaflow.types import NumaflowServicerContext + +_LOGGER = setup_logging(__name__) +if os.getenv("PYTHONDEBUG"): + _LOGGER.setLevel(logging.DEBUG) + +_PROCESS_COUNT = multiprocessing.cpu_count() +MAX_THREADS = int(os.getenv("MAX_THREADS", 0)) or (_PROCESS_COUNT * 4) + + +class AsyncMapStreamer(mapstream_pb2_grpc.MapStreamServicer): + """ + Provides an interface to write a Map Streamer + which will be exposed over gRPC. + + Args: + handler: Function callable following the type signature of MapStreamCallable + sock_path: Path to the UNIX Domain Socket + max_message_size: The max message size in bytes the server can receive and send + max_threads: The max number of threads to be spawned; + defaults to number of processors x4 + + Example invocation: + >>> from typing import Iterator + >>> from pynumaflow.mapstreamer import Messages, Message \ + ... Datum, AsyncMapStreamer + ... import aiorun + >>> async def map_stream_handler(key: [str], datums: Datum) -> AsyncIterable[Message]: + ... val = datum.value + ... _ = datum.event_time + ... _ = datum.watermark + ... for i in range(10): + ... 
yield Message(val, keys=keys) + ... + >>> grpc_server = AsyncMapStreamer(handler=map_stream_handler) + >>> aiorun.run(grpc_server.start()) + """ + + def __init__( + self, + handler: MapStreamCallable, + ): + self.__map_stream_handler: MapStreamCallable = handler + + async def MapStreamFn( + self, + request: mapstream_pb2.MapStreamRequest, + context: NumaflowServicerContext, + ) -> AsyncIterable[mapstream_pb2.MapStreamResponse]: + """ + Applies a map function to a datum stream in streaming mode. + The pascal case function name comes from the proto mapstream_pb2_grpc.py file. + """ + + async for res in self.__invoke_map_stream( + list(request.keys), + Datum( + keys=list(request.keys), + value=request.value, + event_time=request.event_time.ToDatetime(), + watermark=request.watermark.ToDatetime(), + ), + ): + yield mapstream_pb2.MapStreamResponse(result=res) + + async def __invoke_map_stream(self, keys: list[str], req: Datum): + try: + async for msg in self.__map_stream_handler(keys, req): + yield mapstream_pb2.MapStreamResponse.Result( + keys=msg.keys, value=msg.value, tags=msg.tags + ) + except Exception as err: + _LOGGER.critical("UDFError, re-raising the error", exc_info=True) + raise err + + async def IsReady( + self, request: _empty_pb2.Empty, context: NumaflowServicerContext + ) -> mapstream_pb2.ReadyResponse: + """ + IsReady is the heartbeat endpoint for gRPC. + The pascal case function name comes from the proto mapstream_pb2_grpc.py file. + """ + return mapstream_pb2.ReadyResponse(ready=True) diff --git a/pynumaflow/mapstreamer/mapstream.py b/pynumaflow/mapstreamer/mapstream.py new file mode 100644 index 00000000..81e24870 --- /dev/null +++ b/pynumaflow/mapstreamer/mapstream.py @@ -0,0 +1,72 @@ +import os + +import aiorun +import grpc + +from pynumaflow.mapstreamer import AsyncMapStreamer +from pynumaflow.mapstreamer.proto import mapstream_pb2_grpc + +from pynumaflow._constants import ( + MAP_STREAM_SOCK_PATH, + MAX_MESSAGE_SIZE, + MAX_THREADS, + ServerType, + _LOGGER, +) + +from pynumaflow.mapstreamer._dtypes import MapStreamCallable + +from pynumaflow.shared.server import NumaflowServer, start_async_server + + +class MapStreamServer(NumaflowServer): + """ + Class for a new Map Stream Server instance. 
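+
+    Args:
+        map_stream_instance: handler instance following the MapStreamCallable type
+        sock_path: path to the UNIX domain socket (defaults to MAP_STREAM_SOCK_PATH)
+        max_message_size: max gRPC message size in bytes (defaults to MAX_MESSAGE_SIZE)
+        max_threads: max number of threads for the server (defaults to MAX_THREADS)
+        server_type: only ServerType.Async is currently supported
+
+    A minimal usage sketch, assuming an async generator handler my_handler
+    that follows MapStreamCallable (the handler name is illustrative):
+    >>> grpc_server = MapStreamServer(map_stream_instance=my_handler)
+    >>> grpc_server.start()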
+ """ + + def __init__( + self, + map_stream_instance: MapStreamCallable, + sock_path=MAP_STREAM_SOCK_PATH, + max_message_size=MAX_MESSAGE_SIZE, + max_threads=MAX_THREADS, + server_type=ServerType.Async, + ): + """ """ + self.map_stream_instance: MapStreamCallable = map_stream_instance + self.sock_path = f"unix://{sock_path}" + self.max_message_size = max_message_size + self.max_threads = min(max_threads, int(os.getenv("MAX_THREADS", "4"))) + self.server_type = server_type + + self._server_options = [ + ("grpc.max_send_message_length", self.max_message_size), + ("grpc.max_receive_message_length", self.max_message_size), + ] + + def start(self): + if self.server_type == ServerType.Async: + aiorun.run(self.aexec()) + else: + _LOGGER.error("Server type not supported", self.server_type) + raise NotImplementedError + + async def aexec(self): + server = grpc.aio.server() + server.add_insecure_port(self.sock_path) + map_servicer = self.get_servicer( + map_stream_instance=self.map_stream_instance, server_type=self.server_type + ) + mapstream_pb2_grpc.add_MapStreamServicer_to_server( + map_servicer, + server, + ) + _LOGGER.info("Starting Map Stream Server") + await start_async_server(server, self.sock_path, self.max_threads, self._server_options) + + def get_servicer(self, map_stream_instance: MapStreamCallable, server_type: ServerType): + if server_type == ServerType.Async: + map_servicer = AsyncMapStreamer(handler=map_stream_instance) + else: + raise NotImplementedError + return map_servicer diff --git a/pynumaflow/mapstreamer/proto/__init__.py b/pynumaflow/mapstreamer/proto/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/pynumaflow/mapstreamer/proto/mapstream.proto b/pynumaflow/mapstreamer/proto/mapstream.proto new file mode 100644 index 00000000..45605169 --- /dev/null +++ b/pynumaflow/mapstreamer/proto/mapstream.proto @@ -0,0 +1,44 @@ +syntax = "proto3"; + +import "google/protobuf/empty.proto"; +import "google/protobuf/timestamp.proto"; + + +package mapstream.v1; + +service MapStream { + // MapStreamFn applies a function to each request element and returns a stream. + rpc MapStreamFn(MapStreamRequest) returns (stream MapStreamResponse); + + // IsReady is the heartbeat endpoint for gRPC. + rpc IsReady(google.protobuf.Empty) returns (ReadyResponse); +} + +/** + * MapStreamRequest represents a request element. + */ +message MapStreamRequest { + repeated string keys = 1; + bytes value = 2; + google.protobuf.Timestamp event_time = 3; + google.protobuf.Timestamp watermark = 4; +} + +/** + * MapStreamResponse represents a response element. + */ +message MapStreamResponse { + message Result { + repeated string keys = 1; + bytes value = 2; + repeated string tags = 3; + } + Result result = 1; +} + +/** + * ReadyResponse is the health check result. + */ +message ReadyResponse { + bool ready = 1; +} \ No newline at end of file diff --git a/pynumaflow/mapstreamer/proto/mapstream_pb2.py b/pynumaflow/mapstreamer/proto/mapstream_pb2.py new file mode 100644 index 00000000..f1c2c169 --- /dev/null +++ b/pynumaflow/mapstreamer/proto/mapstream_pb2.py @@ -0,0 +1,38 @@ +# -*- coding: utf-8 -*- +# Generated by the protocol buffer compiler. DO NOT EDIT! 
+# source: mapstream.proto +"""Generated protocol buffer code.""" +from google.protobuf import descriptor as _descriptor +from google.protobuf import descriptor_pool as _descriptor_pool +from google.protobuf import symbol_database as _symbol_database +from google.protobuf.internal import builder as _builder + +# @@protoc_insertion_point(imports) + +_sym_db = _symbol_database.Default() + + +from google.protobuf import empty_pb2 as google_dot_protobuf_dot_empty__pb2 +from google.protobuf import timestamp_pb2 as google_dot_protobuf_dot_timestamp__pb2 + + +DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile( + b'\n\x0fmapstream.proto\x12\x0cmapstream.v1\x1a\x1bgoogle/protobuf/empty.proto\x1a\x1fgoogle/protobuf/timestamp.proto"\x8e\x01\n\x10MapStreamRequest\x12\x0c\n\x04keys\x18\x01 \x03(\t\x12\r\n\x05value\x18\x02 \x01(\x0c\x12.\n\nevent_time\x18\x03 \x01(\x0b\x32\x1a.google.protobuf.Timestamp\x12-\n\twatermark\x18\x04 \x01(\x0b\x32\x1a.google.protobuf.Timestamp"\x80\x01\n\x11MapStreamResponse\x12\x36\n\x06result\x18\x01 \x01(\x0b\x32&.mapstream.v1.MapStreamResponse.Result\x1a\x33\n\x06Result\x12\x0c\n\x04keys\x18\x01 \x03(\t\x12\r\n\x05value\x18\x02 \x01(\x0c\x12\x0c\n\x04tags\x18\x03 \x03(\t"\x1e\n\rReadyResponse\x12\r\n\x05ready\x18\x01 \x01(\x08\x32\x9d\x01\n\tMapStream\x12P\n\x0bMapStreamFn\x12\x1e.mapstream.v1.MapStreamRequest\x1a\x1f.mapstream.v1.MapStreamResponse0\x01\x12>\n\x07IsReady\x12\x16.google.protobuf.Empty\x1a\x1b.mapstream.v1.ReadyResponseb\x06proto3' +) + +_globals = globals() +_builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, _globals) +_builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, "mapstream_pb2", _globals) +if _descriptor._USE_C_DESCRIPTORS == False: + DESCRIPTOR._options = None + _globals["_MAPSTREAMREQUEST"]._serialized_start = 96 + _globals["_MAPSTREAMREQUEST"]._serialized_end = 238 + _globals["_MAPSTREAMRESPONSE"]._serialized_start = 241 + _globals["_MAPSTREAMRESPONSE"]._serialized_end = 369 + _globals["_MAPSTREAMRESPONSE_RESULT"]._serialized_start = 318 + _globals["_MAPSTREAMRESPONSE_RESULT"]._serialized_end = 369 + _globals["_READYRESPONSE"]._serialized_start = 371 + _globals["_READYRESPONSE"]._serialized_end = 401 + _globals["_MAPSTREAM"]._serialized_start = 404 + _globals["_MAPSTREAM"]._serialized_end = 561 +# @@protoc_insertion_point(module_scope) diff --git a/pynumaflow/mapstreamer/proto/mapstream_pb2_grpc.py b/pynumaflow/mapstreamer/proto/mapstream_pb2_grpc.py new file mode 100644 index 00000000..305c8e05 --- /dev/null +++ b/pynumaflow/mapstreamer/proto/mapstream_pb2_grpc.py @@ -0,0 +1,125 @@ +# Generated by the gRPC Python protocol compiler plugin. DO NOT EDIT! +"""Client and server classes corresponding to protobuf-defined services.""" +import grpc + +from google.protobuf import empty_pb2 as google_dot_protobuf_dot_empty__pb2 +from . import mapstream_pb2 as mapstream__pb2 + + +class MapStreamStub(object): + """Missing associated documentation comment in .proto file.""" + + def __init__(self, channel): + """Constructor. + + Args: + channel: A grpc.Channel. 
+ """ + self.MapStreamFn = channel.unary_stream( + "/mapstream.v1.MapStream/MapStreamFn", + request_serializer=mapstream__pb2.MapStreamRequest.SerializeToString, + response_deserializer=mapstream__pb2.MapStreamResponse.FromString, + ) + self.IsReady = channel.unary_unary( + "/mapstream.v1.MapStream/IsReady", + request_serializer=google_dot_protobuf_dot_empty__pb2.Empty.SerializeToString, + response_deserializer=mapstream__pb2.ReadyResponse.FromString, + ) + + +class MapStreamServicer(object): + """Missing associated documentation comment in .proto file.""" + + def MapStreamFn(self, request, context): + """MapStreamFn applies a function to each request element and returns a stream.""" + context.set_code(grpc.StatusCode.UNIMPLEMENTED) + context.set_details("Method not implemented!") + raise NotImplementedError("Method not implemented!") + + def IsReady(self, request, context): + """IsReady is the heartbeat endpoint for gRPC.""" + context.set_code(grpc.StatusCode.UNIMPLEMENTED) + context.set_details("Method not implemented!") + raise NotImplementedError("Method not implemented!") + + +def add_MapStreamServicer_to_server(servicer, server): + rpc_method_handlers = { + "MapStreamFn": grpc.unary_stream_rpc_method_handler( + servicer.MapStreamFn, + request_deserializer=mapstream__pb2.MapStreamRequest.FromString, + response_serializer=mapstream__pb2.MapStreamResponse.SerializeToString, + ), + "IsReady": grpc.unary_unary_rpc_method_handler( + servicer.IsReady, + request_deserializer=google_dot_protobuf_dot_empty__pb2.Empty.FromString, + response_serializer=mapstream__pb2.ReadyResponse.SerializeToString, + ), + } + generic_handler = grpc.method_handlers_generic_handler( + "mapstream.v1.MapStream", rpc_method_handlers + ) + server.add_generic_rpc_handlers((generic_handler,)) + + +# This class is part of an EXPERIMENTAL API. 
+class MapStream(object): + """Missing associated documentation comment in .proto file.""" + + @staticmethod + def MapStreamFn( + request, + target, + options=(), + channel_credentials=None, + call_credentials=None, + insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None, + ): + return grpc.experimental.unary_stream( + request, + target, + "/mapstream.v1.MapStream/MapStreamFn", + mapstream__pb2.MapStreamRequest.SerializeToString, + mapstream__pb2.MapStreamResponse.FromString, + options, + channel_credentials, + insecure, + call_credentials, + compression, + wait_for_ready, + timeout, + metadata, + ) + + @staticmethod + def IsReady( + request, + target, + options=(), + channel_credentials=None, + call_credentials=None, + insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None, + ): + return grpc.experimental.unary_unary( + request, + target, + "/mapstream.v1.MapStream/IsReady", + google_dot_protobuf_dot_empty__pb2.Empty.SerializeToString, + mapstream__pb2.ReadyResponse.FromString, + options, + channel_credentials, + insecure, + call_credentials, + compression, + wait_for_ready, + timeout, + metadata, + ) diff --git a/pynumaflow/shared/server.py b/pynumaflow/shared/server.py index bb6e356e..03074d94 100644 --- a/pynumaflow/shared/server.py +++ b/pynumaflow/shared/server.py @@ -8,8 +8,6 @@ import grpc from pynumaflow._constants import ( - MAX_THREADS, - ServerType, _LOGGER, MULTIPROC_MAP_SOCK_ADDR, ) @@ -45,31 +43,31 @@ def start(self): raise NotImplementedError -def prepare_server( - sock_path: str, - server_type: ServerType, - max_threads=MAX_THREADS, - server_options=None, - process_count=1, -): - """ - Create a new grpc Server instance. - A new servicer instance is created and attached to the server. - The server instance is returned. - - """ - if server_type == ServerType.Sync: - server = _get_sync_server( - bind_address=sock_path, threads_per_proc=max_threads, server_options=server_options - ) - return server - # elif server_type == ServerType.Multiproc: - # servers, server_ports = get_multiproc_servers( - # max_threads=max_threads, - # server_options=server_options, - # process_count=process_count, - # ) - # return servers, server_ports +# def prepare_server( +# sock_path: str, +# server_type: ServerType, +# max_threads=MAX_THREADS, +# server_options=None, +# process_count=1, +# ): +# """ +# Create a new grpc Server instance. +# A new servicer instance is created and attached to the server. +# The server instance is returned. +# +# """ +# if server_type == ServerType.Sync: +# server = _get_sync_server( +# bind_address=sock_path, threads_per_proc=max_threads, server_options=server_options +# ) +# return server +# # elif server_type == ServerType.Multiproc: +# # servers, server_ports = get_multiproc_servers( +# # max_threads=max_threads, +# # server_options=server_options, +# # process_count=process_count, +# # ) +# # return servers, server_ports def write_info_file(protocol: Protocol) -> None: @@ -84,43 +82,38 @@ def write_info_file(protocol: Protocol) -> None: info_server_write(server_info=serv_info, info_file=SERVER_INFO_FILE_PATH) -def sync_server_start(servicer, bind_address: str, max_threads: int, - server_options=None, udf_type: str = "Map"): +def sync_server_start( + servicer, bind_address: str, max_threads: int, server_options=None, udf_type: str = "Map" +): """ Starts the Synchronous server instance on the given UNIX socket with given max threads. Wait for the server to terminate. 
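+    Also writes the server info file (UDS protocol, Python language and SDK
+    version) as part of startup.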
""" - # # start_sync_server_util(server=server) - # # Start the server - # server.start() - # # Add the server information to the server info file, - # # here we just write the protocol and language information - # write_info_file(Protocol.UDS) - # # Wait for the server to terminate - # server.wait_for_termination() + # Add the server information to the server info file, + # here we just write the protocol and language information server_info = ServerInfo( protocol=Protocol.UDS, language=Language.PYTHON, version=get_sdk_version(), ) - _run_server(servicer=servicer, bind_address=bind_address, threads_per_proc=max_threads, - server_options=server_options, udf_type=udf_type, server_info=server_info) + # Run a sync server instances + _run_server( + servicer=servicer, + bind_address=bind_address, + threads_per_proc=max_threads, + server_options=server_options, + udf_type=udf_type, + server_info=server_info, + ) -def start_sync_server_util(server: grpc.Server): +def _run_server( + servicer, bind_address: str, threads_per_proc, server_options, udf_type: str, server_info=None +) -> None: """ - Starts the Synchronous server instance on the given UNIX socket with given max threads. - Wait for the server to terminate. + Starts the Synchronous server instance on the given UNIX socket + with given max threads. Wait for the server to terminate. """ - # Start the server - server.start() - # Wait for the server to terminate - server.wait_for_termination() - - -def _run_server(servicer, bind_address: str, threads_per_proc, server_options, - udf_type: str, server_info=None) -> None: - """Start a server in a subprocess.""" server = grpc.server( ThreadPoolExecutor( max_workers=threads_per_proc, @@ -139,9 +132,9 @@ def _run_server(servicer, bind_address: str, threads_per_proc, server_options, server.wait_for_termination() -def start_multiproc_server(max_threads: int, servicer, - process_count: int, server_options=None, - udf_type: str = "Map"): +def start_multiproc_server( + max_threads: int, servicer, process_count: int, server_options=None, udf_type: str = "Map" +): """ Start N grpc servers in different processes where N = The number of CPUs or the value of the env var NUM_CPU_MULTIPROC defined by the user. The max value @@ -167,8 +160,10 @@ def start_multiproc_server(max_threads: int, servicer, # NOTE: It is imperative that the worker subprocesses be forked before # any gRPC servers start up. See # https://github.com/grpc/grpc/issues/16001 for more details. - worker = multiprocessing.Process(target=_run_server, args=(servicer, bind_address, - max_threads, server_options, udf_type)) + worker = multiprocessing.Process( + target=_run_server, + args=(servicer, bind_address, max_threads, server_options, udf_type), + ) worker.start() workers.append(worker) server_ports.append(port) @@ -191,7 +186,7 @@ def start_multiproc_server(max_threads: int, servicer, async def start_async_server( - server_async: grpc.aio.Server, sock_path: str, max_threads: int, cleanup_coroutines: list + server_async: grpc.aio.Server, sock_path: str, max_threads: int, cleanup_coroutines: list ): """ Starts the Async server instance on the given UNIX socket with given max threads. 
@@ -260,9 +255,6 @@ def _get_sync_server(bind_address: str, threads_per_proc: int, server_options: l _LOGGER.critical("Failed to start server: %s", err, exc_info=True) raise err return server - # server.start() - # _LOGGER.info("GRPC Multi-Processor Server listening on: %s %d", bind_address, os.getpid()) - # server.wait_for_termination() @contextlib.contextmanager @@ -277,22 +269,3 @@ def _reserve_port(port_num: int) -> Iterator[int]: yield sock.getsockname()[1] finally: sock.close() - -# def get_multiproc_servers(process_count: int, max_threads=MAX_THREADS, server_options=None): -# # workers = [] -# server_ports = [] -# servers = [] -# for _ in range(process_count): -# # Find a port to bind to for each server, thus sending the port number = 0 -# # to the _reserve_port function so that kernel can find and return a free port -# with _reserve_port(port_num=0) as port: -# bind_address = f"{MULTIPROC_MAP_SOCK_ADDR}:{port}" -# server = _get_sync_server( -# bind_address=bind_address, -# threads_per_proc=max_threads, -# server_options=server_options, -# ) -# servers.append(server) -# server_ports.append(port) -# -# return servers, server_ports From 88bb474d48a2a69e6e434ebf7618b1f92e5d5c55 Mon Sep 17 00:00:00 2001 From: Sidhant Kohli Date: Mon, 8 Jan 2024 10:43:20 -0800 Subject: [PATCH 24/78] mapstream Signed-off-by: Sidhant Kohli --- pynumaflow/mapper/test.py | 53 --------------------------------------- 1 file changed, 53 deletions(-) delete mode 100644 pynumaflow/mapper/test.py diff --git a/pynumaflow/mapper/test.py b/pynumaflow/mapper/test.py deleted file mode 100644 index cd5f0899..00000000 --- a/pynumaflow/mapper/test.py +++ /dev/null @@ -1,53 +0,0 @@ -import asyncio -from abc import ABCMeta, abstractmethod - -import aiorun - - -class MapperClass(metaclass=ABCMeta): - """ - Provides an interface to write a Mapper - which will be exposed over a Synchronous gRPC server. - - Args: - - """ - - def __call__(self, *args, **kwargs): - """ - Allow to call handler function directly if class instance is sent - """ - return self.handler(*args, **kwargs) - - @abstractmethod - def handler(self, key: str) -> int: - """ - Write a handler function which implements the MapCallable interface. 
- """ - pass - - -class Sync(MapperClass): - def handler(self, key: str) -> int: - print("Sync") - return 1 - - -class Async(MapperClass): - async def handler(self, key: str) -> int: - await self.hey(key) - - async def hey(self, key): - print(key) - await asyncio.sleep(10) - return 1 - # return await self.handler( - - -if __name__ == "__main__": - hello = Sync() - print(hello.handler("hello")) - hello = Async() - aiorun.run(hello.handler("hello")) - hello2 = Async() - aiorun.run(hello2.handler("hello2")) From e187f90d4d27265426232e743ee70676851c2b61 Mon Sep 17 00:00:00 2001 From: Sidhant Kohli Date: Mon, 8 Jan 2024 10:52:58 -0800 Subject: [PATCH 25/78] mapstream Signed-off-by: Sidhant Kohli --- pynumaflow/mapstreamer/async_server.py | 4 ---- pynumaflow/mapstreamer/mapstream.py | 2 +- 2 files changed, 1 insertion(+), 5 deletions(-) diff --git a/pynumaflow/mapstreamer/async_server.py b/pynumaflow/mapstreamer/async_server.py index 7556c19e..0ecb4de7 100644 --- a/pynumaflow/mapstreamer/async_server.py +++ b/pynumaflow/mapstreamer/async_server.py @@ -7,10 +7,6 @@ from google.protobuf import empty_pb2 as _empty_pb2 from pynumaflow import setup_logging -from pynumaflow._constants import ( - MAX_MESSAGE_SIZE, - MAP_STREAM_SOCK_PATH, -) from pynumaflow.mapstreamer import Datum from pynumaflow.mapstreamer._dtypes import MapStreamCallable from pynumaflow.mapstreamer.proto import mapstream_pb2 diff --git a/pynumaflow/mapstreamer/mapstream.py b/pynumaflow/mapstreamer/mapstream.py index 81e24870..9fd318c3 100644 --- a/pynumaflow/mapstreamer/mapstream.py +++ b/pynumaflow/mapstreamer/mapstream.py @@ -3,7 +3,7 @@ import aiorun import grpc -from pynumaflow.mapstreamer import AsyncMapStreamer +from pynumaflow.mapstreamer.async_server import AsyncMapStreamer from pynumaflow.mapstreamer.proto import mapstream_pb2_grpc from pynumaflow._constants import ( From d7ff4a15a501499f4061700f877e0d20a697cd30 Mon Sep 17 00:00:00 2001 From: Sidhant Kohli Date: Mon, 8 Jan 2024 11:44:49 -0800 Subject: [PATCH 26/78] reduce Signed-off-by: Sidhant Kohli --- pynumaflow/mapper/map.py | 1 - pynumaflow/reducer/__init__.py | 22 ++ pynumaflow/reducer/_dtypes.py | 262 ++++++++++++++++++++ pynumaflow/reducer/async_server.py | 202 +++++++++++++++ pynumaflow/reducer/asynciter.py | 23 ++ pynumaflow/reducer/proto/__init__.py | 0 pynumaflow/reducer/proto/reduce.proto | 44 ++++ pynumaflow/reducer/proto/reduce_pb2.py | 38 +++ pynumaflow/reducer/proto/reduce_pb2_grpc.py | 123 +++++++++ pynumaflow/reducer/reduce.py | 61 +++++ 10 files changed, 775 insertions(+), 1 deletion(-) create mode 100644 pynumaflow/reducer/__init__.py create mode 100644 pynumaflow/reducer/_dtypes.py create mode 100644 pynumaflow/reducer/async_server.py create mode 100644 pynumaflow/reducer/asynciter.py create mode 100644 pynumaflow/reducer/proto/__init__.py create mode 100644 pynumaflow/reducer/proto/reduce.proto create mode 100644 pynumaflow/reducer/proto/reduce_pb2.py create mode 100644 pynumaflow/reducer/proto/reduce_pb2_grpc.py create mode 100644 pynumaflow/reducer/reduce.py diff --git a/pynumaflow/mapper/map.py b/pynumaflow/mapper/map.py index 323a92d1..6ecae55d 100644 --- a/pynumaflow/mapper/map.py +++ b/pynumaflow/mapper/map.py @@ -67,7 +67,6 @@ def start(self) -> None: if self.server_type == ServerType.Sync: self.exec() elif self.server_type == ServerType.Async: - _LOGGER.info("Starting Async Map Server with aiorun...") aiorun.run(self.aexec()) elif self.server_type == ServerType.Multiproc: self.exec_multiproc() diff --git a/pynumaflow/reducer/__init__.py 
b/pynumaflow/reducer/__init__.py new file mode 100644 index 00000000..2c7df2fd --- /dev/null +++ b/pynumaflow/reducer/__init__.py @@ -0,0 +1,22 @@ +from pynumaflow.reducer._dtypes import ( + Message, + Messages, + Datum, + IntervalWindow, + Metadata, + DROP, + ReducerClass, +) + +__all__ = [ + "Message", + "Messages", + "Datum", + "IntervalWindow", + "Metadata", + "DROP", + "ReduceServer", + "ReducerClass", +] + +from pynumaflow.reducer.reduce import ReduceServer diff --git a/pynumaflow/reducer/_dtypes.py b/pynumaflow/reducer/_dtypes.py new file mode 100644 index 00000000..4f96d03f --- /dev/null +++ b/pynumaflow/reducer/_dtypes.py @@ -0,0 +1,262 @@ +from abc import ABCMeta, abstractmethod +from asyncio import Task +from collections.abc import Iterator, Sequence, Awaitable +from dataclasses import dataclass +from datetime import datetime +from typing import TypeVar, Callable, Union +from collections.abc import AsyncIterable +from warnings import warn + +from pynumaflow.reducer.asynciter import NonBlockingIterator +from pynumaflow._constants import DROP + +M = TypeVar("M", bound="Message") +Ms = TypeVar("Ms", bound="Messages") + + +@dataclass(init=False) +class Message: + """ + Basic datatype for data passing to the next vertex/vertices. + + Args: + value: data in bytes + keys: []string keys for vertex (optional) + tags: []string tags for conditional forwarding (optional) + """ + + __slots__ = ("_value", "_keys", "_tags") + + _value: bytes + _keys: list[str] + _tags: list[str] + + def __init__(self, value: bytes, keys: list[str] = None, tags: list[str] = None): + """ + Creates a Message object to send value to a vertex. + """ + self._keys = keys or [] + self._tags = tags or [] + self._value = value or b"" + + # returns the Message Object which will be dropped + @classmethod + def to_drop(cls: type[M]) -> M: + return cls(b"", None, [DROP]) + + @property + def value(self) -> bytes: + return self._value + + @property + def keys(self) -> list[str]: + return self._keys + + @property + def tags(self) -> list[str]: + return self._tags + + +class Messages(Sequence[M]): + """ + Class to define a list of Message objects. + + Args: + messages: list of Message objects. + """ + + __slots__ = ("_messages",) + + def __init__(self, *messages: M): + self._messages = list(messages) or [] + + def __str__(self) -> str: + return str(self._messages) + + def __repr__(self) -> str: + return str(self) + + def __len__(self) -> int: + return len(self._messages) + + def __iter__(self) -> Iterator[M]: + return iter(self._messages) + + def __getitem__(self, index: int) -> M: + if isinstance(index, slice): + raise TypeError("Slicing is not supported for Messages") + return self._messages[index] + + def append(self, message: Message) -> None: + self._messages.append(message) + + def items(self) -> list[Message]: + warn( + "Using items is deprecated and will be removed in v0.5. " + "Iterate or index the Messages object instead.", + DeprecationWarning, + stacklevel=2, + ) + return self._messages + + +@dataclass(init=False) +class Datum: + """ + Class to define the important information for the event. + Args: + keys: the keys of the event. + value: the payload of the event. + event_time: the event time of the event. + watermark: the watermark of the event. 
+ >>> # Example usage + >>> from pynumaflow.reducer import Datum + >>> from datetime import datetime, timezone + >>> payload = bytes("test_mock_message", encoding="utf-8") + >>> t1 = datetime.fromtimestamp(1662998400, timezone.utc) + >>> t2 = datetime.fromtimestamp(1662998460, timezone.utc) + >>> d = Datum( + ... keys=["test_key"], + ... value=payload, + ... event_time=t1, + ... watermark=t2, + ... ) + """ + + __slots__ = ("_keys", "_value", "_event_time", "_watermark") + + _keys: list[str] + _value: bytes + _event_time: datetime + _watermark: datetime + + def __init__( + self, + keys: list[str], + value: bytes, + event_time: datetime, + watermark: datetime, + ): + self._keys = keys or list() + self._value = value or b"" + if not isinstance(event_time, datetime): + raise TypeError(f"Wrong data type: {type(event_time)} for Datum.event_time") + self._event_time = event_time + if not isinstance(watermark, datetime): + raise TypeError(f"Wrong data type: {type(watermark)} for Datum.watermark") + self._watermark = watermark + + def keys(self) -> list[str]: + """Returns the keys of the event""" + return self._keys + + @property + def value(self) -> bytes: + """Returns the value of the event.""" + return self._value + + @property + def event_time(self) -> datetime: + """Returns the event time of the event.""" + return self._event_time + + @property + def watermark(self) -> datetime: + """Returns the watermark of the event.""" + return self._watermark + + +@dataclass(init=False) +class IntervalWindow: + """Defines the start and end of the interval window for the event.""" + + __slots__ = ("_start", "_end") + + _start: datetime + _end: datetime + + def __init__(self, start: datetime, end: datetime): + self._start = start + self._end = end + + @property + def start(self): + """Returns the start point of the interval window.""" + return self._start + + @property + def end(self): + """Returns the end point of the interval window.""" + return self._end + + +@dataclass(init=False) +class Metadata: + """Defines the metadata for the event.""" + + __slots__ = ("_interval_window",) + + _interval_window: IntervalWindow + + def __init__(self, interval_window: IntervalWindow): + self._interval_window = interval_window + + @property + def interval_window(self): + """Returns the interval window for the event.""" + return self._interval_window + + +@dataclass +class ReduceResult: + """Defines the object to hold the result of reduce computation.""" + + __slots__ = ("_future", "_iterator", "_key") + + _future: Task + _iterator: NonBlockingIterator + _key: list[str] + + @property + def future(self): + """Returns the future result of computation.""" + return self._future + + @property + def iterator(self): + """Returns the handle to the producer queue.""" + return self._iterator + + @property + def keys(self) -> list[str]: + """Returns the keys of the partition.""" + return self._key + + +class ReducerClass(metaclass=ABCMeta): + """ + Provides an interface to write a Reducer + which will be exposed over a gRPC server. + + Args: + + """ + + def __call__(self, *args, **kwargs): + """ + Allow to call handler function directly if class instance is sent + """ + return self.handler(*args, **kwargs) + + @abstractmethod + async def handler( + self, keys: list[str], datums: AsyncIterable[Datum], md: Metadata + ) -> Messages: + """ + Write a handler function which implements the ReduceCallable interface. 
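+
+        Args:
+            keys: the keys of the partition being reduced.
+            datums: an async iterable over the Datum objects in the window.
+            md: Metadata carrying the interval window for the invocation.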
+ """ + pass + + +ReduceAsyncCallable = Callable[[list[str], AsyncIterable[Datum], Metadata], Awaitable[Messages]] +ReduceCallable = Union[ReduceAsyncCallable, ReducerClass] diff --git a/pynumaflow/reducer/async_server.py b/pynumaflow/reducer/async_server.py new file mode 100644 index 00000000..dc355ec6 --- /dev/null +++ b/pynumaflow/reducer/async_server.py @@ -0,0 +1,202 @@ +import asyncio +import logging +import multiprocessing +import os + +from datetime import datetime, timezone +from collections.abc import AsyncIterable + +import grpc +from google.protobuf import empty_pb2 as _empty_pb2 + +from pynumaflow import setup_logging +from pynumaflow._constants import ( + WIN_START_TIME, + WIN_END_TIME, + STREAM_EOF, + DELIMITER, +) +from pynumaflow.reducer import Datum, IntervalWindow, Metadata +from pynumaflow.reducer._dtypes import ReduceResult, ReduceCallable +from pynumaflow.reducer.asynciter import NonBlockingIterator +from pynumaflow.reducer.proto import reduce_pb2 +from pynumaflow.reducer.proto import reduce_pb2_grpc +from pynumaflow.types import NumaflowServicerContext + +_LOGGER = setup_logging(__name__) +if os.getenv("PYTHONDEBUG"): + _LOGGER.setLevel(logging.DEBUG) + +_PROCESS_COUNT = multiprocessing.cpu_count() +MAX_THREADS = int(os.getenv("MAX_THREADS", 0)) or (_PROCESS_COUNT * 4) + + +async def datum_generator( + request_iterator: AsyncIterable[reduce_pb2.ReduceRequest], +) -> AsyncIterable[Datum]: + async for d in request_iterator: + datum = Datum( + keys=list(d.keys), + value=d.value, + event_time=d.event_time.ToDatetime(), + watermark=d.watermark.ToDatetime(), + ) + yield datum + + +class AsyncReducer(reduce_pb2_grpc.ReduceServicer): + """ + Provides an interface to write a Reduce Function + which will be exposed over gRPC. + + Args: + handler: Function callable following the type signature of ReduceCallable + sock_path: Path to the UNIX Domain Socket + max_message_size: The max message size in bytes the server can receive and send + max_threads: The max number of threads to be spawned; + defaults to number of processors x4 + + Example invocation: + >>> from typing import Iterator + >>> from pynumaflow.reducer import Messages, Message\ + ... Datum, Metadata, AsyncReducer + ... import aiorun + ... + >>> async def reduce_handler(key: list[str], datums: AsyncIterable[Datum], + >>> md: Metadata) -> Messages: + ... interval_window = md.interval_window + ... counter = 0 + ... async for _ in datums: + ... counter += 1 + ... msg = ( + ... f"counter:{counter} interval_window_start:{interval_window.start} " + ... f"interval_window_end:{interval_window.end}" + ... ) + ... return Messages(Message(value=str.encode(msg), keys=keys)) + ... + >>> grpc_server = AsyncReducer(handler=reduce_handler) + >>> aiorun.run(grpc_server.start()) + """ + + def __init__( + self, + handler: ReduceCallable, + ): + # Collection for storing strong references to all running tasks. + # Event loop only keeps a weak reference, which can cause it to + # get lost during execution. + self.background_tasks = set() + + async def ReduceFn( + self, + request_iterator: AsyncIterable[reduce_pb2.ReduceRequest], + context: NumaflowServicerContext, + ) -> reduce_pb2.ReduceResponse: + """ + Applies a reduce function to a datum stream. + The pascal case function name comes from the proto reduce_pb2_grpc.py file. 
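+
+        Window start and end times are read from the invocation metadata keys
+        WIN_START_TIME and WIN_END_TIME, incoming datums are grouped by their
+        unified key, and one handler task is scheduled per key group; results
+        are streamed back once the handler tasks complete.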
+ """ + + start, end = None, None + for metadata_key, metadata_value in context.invocation_metadata(): + if metadata_key == WIN_START_TIME: + start = metadata_value + elif metadata_key == WIN_END_TIME: + end = metadata_value + if not (start or end): + context.set_code(grpc.StatusCode.INVALID_ARGUMENT) + context.set_details( + f"Expected to have all key/window_start_time/window_end_time; " + f"got start: {start}, end: {end}." + ) + yield reduce_pb2.ReduceResponse(results=[]) + return + + start_dt = datetime.fromtimestamp(int(start) / 1e3, timezone.utc) + end_dt = datetime.fromtimestamp(int(end) / 1e3, timezone.utc) + interval_window = IntervalWindow(start=start_dt, end=end_dt) + + datum_iterator = datum_generator(request_iterator=request_iterator) + + response_task = asyncio.create_task( + self.__async_reduce_handler(interval_window, datum_iterator) + ) + + # Save a reference to the result of this function, to avoid a + # task disappearing mid-execution. + self.background_tasks.add(response_task) + response_task.add_done_callback(lambda t: self.background_tasks.remove(t)) + + await response_task + results_futures = response_task.result() + + try: + for fut in results_futures: + await fut + yield reduce_pb2.ReduceResponse(results=fut.result()) + except Exception as e: + context.set_code(grpc.StatusCode.UNKNOWN) + context.set_details(e.__str__()) + yield reduce_pb2.ReduceResponse(results=[]) + + async def __async_reduce_handler(self, interval_window, datum_iterator: AsyncIterable[Datum]): + callable_dict = {} + # iterate through all the values + async for d in datum_iterator: + keys = d.keys() + unified_key = DELIMITER.join(keys) + result = callable_dict.get(unified_key, None) + + if not result: + niter = NonBlockingIterator() + riter = niter.read_iterator() + # schedule an async task for consumer + # returns a future that will give the results later. + task = asyncio.create_task( + self.__invoke_reduce(keys, riter, Metadata(interval_window=interval_window)) + ) + # Save a reference to the result of this function, to avoid a + # task disappearing mid-execution. + self.background_tasks.add(task) + task.add_done_callback(lambda t: self.background_tasks.remove(t)) + result = ReduceResult(task, niter, keys) + + callable_dict[unified_key] = result + + await result.iterator.put(d) + + for unified_key in callable_dict: + await callable_dict[unified_key].iterator.put(STREAM_EOF) + + tasks = [] + for unified_key in callable_dict: + fut = callable_dict[unified_key].future + tasks.append(fut) + + return tasks + + async def __invoke_reduce( + self, keys: list[str], request_iterator: AsyncIterable[Datum], md: Metadata + ): + try: + msgs = await self.__reduce_handler(keys, request_iterator, md) + except Exception as err: + _LOGGER.critical("UDFError, re-raising the error", exc_info=True) + raise err + + datum_responses = [] + for msg in msgs: + datum_responses.append( + reduce_pb2.ReduceResponse.Result(keys=msg.keys, value=msg.value, tags=msg.tags) + ) + + return datum_responses + + async def IsReady( + self, request: _empty_pb2.Empty, context: NumaflowServicerContext + ) -> reduce_pb2.ReadyResponse: + """ + IsReady is the heartbeat endpoint for gRPC. + The pascal case function name comes from the proto reduce_pb2_grpc.py file. 
+ """ + return reduce_pb2.ReadyResponse(ready=True) diff --git a/pynumaflow/reducer/asynciter.py b/pynumaflow/reducer/asynciter.py new file mode 100644 index 00000000..3ab6135b --- /dev/null +++ b/pynumaflow/reducer/asynciter.py @@ -0,0 +1,23 @@ +import asyncio + +from pynumaflow._constants import STREAM_EOF + + +class NonBlockingIterator: + """An Async Interator backed by a queue""" + + __slots__ = "_queue" + + def __init__(self): + self._queue = asyncio.Queue() + + async def read_iterator(self): + item = await self._queue.get() + while True: + if item == STREAM_EOF: + break + yield item + item = await self._queue.get() + + async def put(self, item): + await self._queue.put(item) diff --git a/pynumaflow/reducer/proto/__init__.py b/pynumaflow/reducer/proto/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/pynumaflow/reducer/proto/reduce.proto b/pynumaflow/reducer/proto/reduce.proto new file mode 100644 index 00000000..81571e14 --- /dev/null +++ b/pynumaflow/reducer/proto/reduce.proto @@ -0,0 +1,44 @@ +syntax = "proto3"; + +import "google/protobuf/empty.proto"; +import "google/protobuf/timestamp.proto"; + + +package reduce.v1; + +service Reduce { + // ReduceFn applies a reduce function to a request stream. + rpc ReduceFn(stream ReduceRequest) returns (stream ReduceResponse); + + // IsReady is the heartbeat endpoint for gRPC. + rpc IsReady(google.protobuf.Empty) returns (ReadyResponse); +} + +/** + * ReduceRequest represents a request element. + */ +message ReduceRequest { + repeated string keys = 1; + bytes value = 2; + google.protobuf.Timestamp event_time = 3; + google.protobuf.Timestamp watermark = 4; +} + +/** + * ReduceResponse represents a response element. + */ +message ReduceResponse { + message Result { + repeated string keys = 1; + bytes value = 2; + repeated string tags = 3; + } + repeated Result results = 1; +} + +/** + * ReadyResponse is the health check result. + */ +message ReadyResponse { + bool ready = 1; +} \ No newline at end of file diff --git a/pynumaflow/reducer/proto/reduce_pb2.py b/pynumaflow/reducer/proto/reduce_pb2.py new file mode 100644 index 00000000..f61b8887 --- /dev/null +++ b/pynumaflow/reducer/proto/reduce_pb2.py @@ -0,0 +1,38 @@ +# -*- coding: utf-8 -*- +# Generated by the protocol buffer compiler. DO NOT EDIT! 
+# source: reduce.proto +"""Generated protocol buffer code.""" +from google.protobuf import descriptor as _descriptor +from google.protobuf import descriptor_pool as _descriptor_pool +from google.protobuf import symbol_database as _symbol_database +from google.protobuf.internal import builder as _builder + +# @@protoc_insertion_point(imports) + +_sym_db = _symbol_database.Default() + + +from google.protobuf import empty_pb2 as google_dot_protobuf_dot_empty__pb2 +from google.protobuf import timestamp_pb2 as google_dot_protobuf_dot_timestamp__pb2 + + +DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile( + b'\n\x0creduce.proto\x12\treduce.v1\x1a\x1bgoogle/protobuf/empty.proto\x1a\x1fgoogle/protobuf/timestamp.proto"\x8b\x01\n\rReduceRequest\x12\x0c\n\x04keys\x18\x01 \x03(\t\x12\r\n\x05value\x18\x02 \x01(\x0c\x12.\n\nevent_time\x18\x03 \x01(\x0b\x32\x1a.google.protobuf.Timestamp\x12-\n\twatermark\x18\x04 \x01(\x0b\x32\x1a.google.protobuf.Timestamp"x\n\x0eReduceResponse\x12\x31\n\x07results\x18\x01 \x03(\x0b\x32 .reduce.v1.ReduceResponse.Result\x1a\x33\n\x06Result\x12\x0c\n\x04keys\x18\x01 \x03(\t\x12\r\n\x05value\x18\x02 \x01(\x0c\x12\x0c\n\x04tags\x18\x03 \x03(\t"\x1e\n\rReadyResponse\x12\r\n\x05ready\x18\x01 \x01(\x08\x32\x8a\x01\n\x06Reduce\x12\x43\n\x08ReduceFn\x12\x18.reduce.v1.ReduceRequest\x1a\x19.reduce.v1.ReduceResponse(\x01\x30\x01\x12;\n\x07IsReady\x12\x16.google.protobuf.Empty\x1a\x18.reduce.v1.ReadyResponseb\x06proto3' +) + +_globals = globals() +_builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, _globals) +_builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, "reduce_pb2", _globals) +if _descriptor._USE_C_DESCRIPTORS == False: + DESCRIPTOR._options = None + _globals["_REDUCEREQUEST"]._serialized_start = 90 + _globals["_REDUCEREQUEST"]._serialized_end = 229 + _globals["_REDUCERESPONSE"]._serialized_start = 231 + _globals["_REDUCERESPONSE"]._serialized_end = 351 + _globals["_REDUCERESPONSE_RESULT"]._serialized_start = 300 + _globals["_REDUCERESPONSE_RESULT"]._serialized_end = 351 + _globals["_READYRESPONSE"]._serialized_start = 353 + _globals["_READYRESPONSE"]._serialized_end = 383 + _globals["_REDUCE"]._serialized_start = 386 + _globals["_REDUCE"]._serialized_end = 524 +# @@protoc_insertion_point(module_scope) diff --git a/pynumaflow/reducer/proto/reduce_pb2_grpc.py b/pynumaflow/reducer/proto/reduce_pb2_grpc.py new file mode 100644 index 00000000..5a0a15f6 --- /dev/null +++ b/pynumaflow/reducer/proto/reduce_pb2_grpc.py @@ -0,0 +1,123 @@ +# Generated by the gRPC Python protocol compiler plugin. DO NOT EDIT! +"""Client and server classes corresponding to protobuf-defined services.""" +import grpc + +from google.protobuf import empty_pb2 as google_dot_protobuf_dot_empty__pb2 +from . import reduce_pb2 as reduce__pb2 + + +class ReduceStub(object): + """Missing associated documentation comment in .proto file.""" + + def __init__(self, channel): + """Constructor. + + Args: + channel: A grpc.Channel. 
+ """ + self.ReduceFn = channel.stream_stream( + "/reduce.v1.Reduce/ReduceFn", + request_serializer=reduce__pb2.ReduceRequest.SerializeToString, + response_deserializer=reduce__pb2.ReduceResponse.FromString, + ) + self.IsReady = channel.unary_unary( + "/reduce.v1.Reduce/IsReady", + request_serializer=google_dot_protobuf_dot_empty__pb2.Empty.SerializeToString, + response_deserializer=reduce__pb2.ReadyResponse.FromString, + ) + + +class ReduceServicer(object): + """Missing associated documentation comment in .proto file.""" + + def ReduceFn(self, request_iterator, context): + """ReduceFn applies a reduce function to a request stream.""" + context.set_code(grpc.StatusCode.UNIMPLEMENTED) + context.set_details("Method not implemented!") + raise NotImplementedError("Method not implemented!") + + def IsReady(self, request, context): + """IsReady is the heartbeat endpoint for gRPC.""" + context.set_code(grpc.StatusCode.UNIMPLEMENTED) + context.set_details("Method not implemented!") + raise NotImplementedError("Method not implemented!") + + +def add_ReduceServicer_to_server(servicer, server): + rpc_method_handlers = { + "ReduceFn": grpc.stream_stream_rpc_method_handler( + servicer.ReduceFn, + request_deserializer=reduce__pb2.ReduceRequest.FromString, + response_serializer=reduce__pb2.ReduceResponse.SerializeToString, + ), + "IsReady": grpc.unary_unary_rpc_method_handler( + servicer.IsReady, + request_deserializer=google_dot_protobuf_dot_empty__pb2.Empty.FromString, + response_serializer=reduce__pb2.ReadyResponse.SerializeToString, + ), + } + generic_handler = grpc.method_handlers_generic_handler("reduce.v1.Reduce", rpc_method_handlers) + server.add_generic_rpc_handlers((generic_handler,)) + + +# This class is part of an EXPERIMENTAL API. +class Reduce(object): + """Missing associated documentation comment in .proto file.""" + + @staticmethod + def ReduceFn( + request_iterator, + target, + options=(), + channel_credentials=None, + call_credentials=None, + insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None, + ): + return grpc.experimental.stream_stream( + request_iterator, + target, + "/reduce.v1.Reduce/ReduceFn", + reduce__pb2.ReduceRequest.SerializeToString, + reduce__pb2.ReduceResponse.FromString, + options, + channel_credentials, + insecure, + call_credentials, + compression, + wait_for_ready, + timeout, + metadata, + ) + + @staticmethod + def IsReady( + request, + target, + options=(), + channel_credentials=None, + call_credentials=None, + insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None, + ): + return grpc.experimental.unary_unary( + request, + target, + "/reduce.v1.Reduce/IsReady", + google_dot_protobuf_dot_empty__pb2.Empty.SerializeToString, + reduce__pb2.ReadyResponse.FromString, + options, + channel_credentials, + insecure, + call_credentials, + compression, + wait_for_ready, + timeout, + metadata, + ) diff --git a/pynumaflow/reducer/reduce.py b/pynumaflow/reducer/reduce.py new file mode 100644 index 00000000..3b2ca06b --- /dev/null +++ b/pynumaflow/reducer/reduce.py @@ -0,0 +1,61 @@ +import aiorun +import grpc + +from pynumaflow.reducer.proto import reduce_pb2_grpc + +from pynumaflow.reducer.async_server import AsyncReducer + +from pynumaflow._constants import ( + REDUCE_SOCK_PATH, + MAX_MESSAGE_SIZE, + MAX_THREADS, + ServerType, + _LOGGER, +) + +from pynumaflow.reducer._dtypes import ReduceCallable + +from pynumaflow.shared.server import NumaflowServer, start_async_server + + +class 
ReduceServer(NumaflowServer): + def __init__( + self, + reducer_instance: ReduceCallable, + sock_path=REDUCE_SOCK_PATH, + max_message_size=MAX_MESSAGE_SIZE, + max_threads=MAX_THREADS, + server_type=ServerType.Async, + ): + self.reducer_instance: ReduceCallable = reducer_instance + self.sock_path = f"unix://{sock_path}" + self.max_message_size = max_message_size + self.max_threads = max_threads + self.server_type = server_type + + self._server_options = [ + ("grpc.max_send_message_length", self.max_message_size), + ("grpc.max_receive_message_length", self.max_message_size), + ] + + def start(self): + if self.server_type == ServerType.Async: + aiorun.run(self.aexec()) + else: + _LOGGER.error("Server type not supported", self.server_type) + raise NotImplementedError + + async def aexec(self): + server = grpc.aio.server() + server.add_insecure_port(self.sock_path) + reduce_servicer = self.get_servicer( + reducer_instance=self.reducer_instance, server_type=self.server_type + ) + reduce_pb2_grpc.add_ReduceServicer_to_server(reduce_servicer, server) + await start_async_server(server, self.sock_path, self.max_threads, self._server_options) + + def get_servicer(self, reducer_instance: ReduceCallable, server_type: ServerType): + if server_type == ServerType.Async: + return AsyncReducer(reducer_instance) + else: + raise NotImplementedError From 32d29757e864747934594edd77c4639a7c4c260a Mon Sep 17 00:00:00 2001 From: Sidhant Kohli Date: Mon, 8 Jan 2024 16:33:25 -0800 Subject: [PATCH 27/78] reduce Signed-off-by: Sidhant Kohli --- pynumaflow/reducer/async_server.py | 1 + pynumaflow/sideinput/__init__.py | 4 + pynumaflow/sideinput/_dtypes.py | 38 ++++ pynumaflow/sideinput/proto/__init__.py | 0 pynumaflow/sideinput/proto/sideinput.proto | 40 +++++ pynumaflow/sideinput/proto/sideinput_pb2.py | 33 ++++ .../sideinput/proto/sideinput_pb2_grpc.py | 149 ++++++++++++++++ pynumaflow/sideinput/server.py | 115 ++++++++++++ pynumaflow/sinker/__init__.py | 5 + pynumaflow/sinker/_dtypes.py | 164 ++++++++++++++++++ pynumaflow/sinker/async_sink.py | 153 ++++++++++++++++ pynumaflow/sinker/proto/__init__.py | 0 pynumaflow/sinker/proto/sink.proto | 50 ++++++ pynumaflow/sinker/proto/sink_pb2.py | 39 +++++ pynumaflow/sinker/proto/sink_pb2_grpc.py | 123 +++++++++++++ pynumaflow/sinker/server.py | 139 +++++++++++++++ 16 files changed, 1053 insertions(+) create mode 100644 pynumaflow/sideinput/__init__.py create mode 100644 pynumaflow/sideinput/_dtypes.py create mode 100644 pynumaflow/sideinput/proto/__init__.py create mode 100644 pynumaflow/sideinput/proto/sideinput.proto create mode 100644 pynumaflow/sideinput/proto/sideinput_pb2.py create mode 100644 pynumaflow/sideinput/proto/sideinput_pb2_grpc.py create mode 100644 pynumaflow/sideinput/server.py create mode 100644 pynumaflow/sinker/__init__.py create mode 100644 pynumaflow/sinker/_dtypes.py create mode 100644 pynumaflow/sinker/async_sink.py create mode 100644 pynumaflow/sinker/proto/__init__.py create mode 100644 pynumaflow/sinker/proto/sink.proto create mode 100644 pynumaflow/sinker/proto/sink_pb2.py create mode 100644 pynumaflow/sinker/proto/sink_pb2_grpc.py create mode 100644 pynumaflow/sinker/server.py diff --git a/pynumaflow/reducer/async_server.py b/pynumaflow/reducer/async_server.py index dc355ec6..b9cb9614 100644 --- a/pynumaflow/reducer/async_server.py +++ b/pynumaflow/reducer/async_server.py @@ -86,6 +86,7 @@ def __init__( # Event loop only keeps a weak reference, which can cause it to # get lost during execution. 
self.background_tasks = set() + self.__reduce_handler: ReduceCallable = handler async def ReduceFn( self, diff --git a/pynumaflow/sideinput/__init__.py b/pynumaflow/sideinput/__init__.py new file mode 100644 index 00000000..8a3c36f3 --- /dev/null +++ b/pynumaflow/sideinput/__init__.py @@ -0,0 +1,4 @@ +from pynumaflow.sideinput._dtypes import Response +from pynumaflow.sideinput.server import SideInput + +__all__ = ["Response", "SideInput"] diff --git a/pynumaflow/sideinput/_dtypes.py b/pynumaflow/sideinput/_dtypes.py new file mode 100644 index 00000000..86826578 --- /dev/null +++ b/pynumaflow/sideinput/_dtypes.py @@ -0,0 +1,38 @@ +from dataclasses import dataclass +from typing import TypeVar + +R = TypeVar("R", bound="Response") + + +@dataclass +class Response: + """ + Class to define the important information for the event. + Args: + value: the payload of the event. + no_broadcast: the flag to indicate whether the event should be broadcasted. + >>> # Example usage + >>> Response.broadcast_message(b"hello") + >>> Response.no_broadcast_message() + """ + + __slots__ = ("value", "no_broadcast") + + value: bytes + no_broadcast: bool + + @classmethod + def broadcast_message(cls: type[R], value: bytes) -> R: + """ + Returns a SideInputResponse object with the given value, + and the No broadcast flag set to False. This event will be broadcasted. + """ + return Response(value=value, no_broadcast=False) + + @classmethod + def no_broadcast_message(cls: type[R]) -> R: + """ + Returns a SideInputResponse object with the No broadcast flag set to True. + This event will not be broadcasted. + """ + return Response(value=b"", no_broadcast=True) diff --git a/pynumaflow/sideinput/proto/__init__.py b/pynumaflow/sideinput/proto/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/pynumaflow/sideinput/proto/sideinput.proto b/pynumaflow/sideinput/proto/sideinput.proto new file mode 100644 index 00000000..c53f055e --- /dev/null +++ b/pynumaflow/sideinput/proto/sideinput.proto @@ -0,0 +1,40 @@ +syntax = "proto3"; + +import "google/protobuf/empty.proto"; + +package sideinput.v1; + +// SideInput is the gRPC service for user-defined Side Inputs. +// It is used to propagate changes in the values of the provided Side Inputs +// which allows access to slow updated data or configuration without needing to retrieve +// it during each message processing. +// Through this service we should should be able to:- +// 1) Invoke retrieval request for a single Side Input parameter, which in turn should +// check for updates and return its latest value. +// 2) Provide a health check endpoint to indicate whether the service is ready to be used. +service SideInput { + // RetrieveSideInput is the endpoint to retrieve the latest value of a given Side Input. + rpc RetrieveSideInput(google.protobuf.Empty) returns (SideInputResponse); + + // IsReady is the health check endpoint to indicate whether the service is ready to be used. + rpc IsReady(google.protobuf.Empty) returns (ReadyResponse); +} + +/** + * SideInputResponse represents a response to a given side input retrieval request. + */ +message SideInputResponse { + // value represents the latest value of the side input payload + bytes value = 1; + // noBroadcast indicates whether the side input value should be broadcasted to all + // True if value should not be broadcasted + // False if value should be broadcasted + bool no_broadcast = 2; +} + +/** + * ReadyResponse is the health check result. 
+ */ +message ReadyResponse { + bool ready = 1; +} \ No newline at end of file diff --git a/pynumaflow/sideinput/proto/sideinput_pb2.py b/pynumaflow/sideinput/proto/sideinput_pb2.py new file mode 100644 index 00000000..8278c1df --- /dev/null +++ b/pynumaflow/sideinput/proto/sideinput_pb2.py @@ -0,0 +1,33 @@ +# -*- coding: utf-8 -*- +# Generated by the protocol buffer compiler. DO NOT EDIT! +# source: sideinput.proto +"""Generated protocol buffer code.""" +from google.protobuf import descriptor as _descriptor +from google.protobuf import descriptor_pool as _descriptor_pool +from google.protobuf import symbol_database as _symbol_database +from google.protobuf.internal import builder as _builder + +# @@protoc_insertion_point(imports) + +_sym_db = _symbol_database.Default() + + +from google.protobuf import empty_pb2 as google_dot_protobuf_dot_empty__pb2 + + +DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile( + b'\n\x0fsideinput.proto\x12\x0csideinput.v1\x1a\x1bgoogle/protobuf/empty.proto"8\n\x11SideInputResponse\x12\r\n\x05value\x18\x01 \x01(\x0c\x12\x14\n\x0cno_broadcast\x18\x02 \x01(\x08"\x1e\n\rReadyResponse\x12\r\n\x05ready\x18\x01 \x01(\x08\x32\x99\x01\n\tSideInput\x12L\n\x11RetrieveSideInput\x12\x16.google.protobuf.Empty\x1a\x1f.sideinput.v1.SideInputResponse\x12>\n\x07IsReady\x12\x16.google.protobuf.Empty\x1a\x1b.sideinput.v1.ReadyResponseb\x06proto3' +) + +_globals = globals() +_builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, _globals) +_builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, "sideinput_pb2", _globals) +if _descriptor._USE_C_DESCRIPTORS == False: + DESCRIPTOR._options = None + _globals["_SIDEINPUTRESPONSE"]._serialized_start = 62 + _globals["_SIDEINPUTRESPONSE"]._serialized_end = 118 + _globals["_READYRESPONSE"]._serialized_start = 120 + _globals["_READYRESPONSE"]._serialized_end = 150 + _globals["_SIDEINPUT"]._serialized_start = 153 + _globals["_SIDEINPUT"]._serialized_end = 306 +# @@protoc_insertion_point(module_scope) diff --git a/pynumaflow/sideinput/proto/sideinput_pb2_grpc.py b/pynumaflow/sideinput/proto/sideinput_pb2_grpc.py new file mode 100644 index 00000000..72ea87ed --- /dev/null +++ b/pynumaflow/sideinput/proto/sideinput_pb2_grpc.py @@ -0,0 +1,149 @@ +# Generated by the gRPC Python protocol compiler plugin. DO NOT EDIT! +"""Client and server classes corresponding to protobuf-defined services.""" +import grpc + +from google.protobuf import empty_pb2 as google_dot_protobuf_dot_empty__pb2 +from . import sideinput_pb2 as sideinput__pb2 + + +class SideInputStub(object): + """SideInput is the gRPC service for user-defined Side Inputs. + It is used to propagate changes in the values of the provided Side Inputs + which allows access to slow updated data or configuration without needing to retrieve + it during each message processing. + Through this service we should should be able to:- + 1) Invoke retrieval request for a single Side Input parameter, which in turn should + check for updates and return its latest value. + 2) Provide a health check endpoint to indicate whether the service is ready to be used. + """ + + def __init__(self, channel): + """Constructor. + + Args: + channel: A grpc.Channel. 
+ """ + self.RetrieveSideInput = channel.unary_unary( + "/sideinput.v1.SideInput/RetrieveSideInput", + request_serializer=google_dot_protobuf_dot_empty__pb2.Empty.SerializeToString, + response_deserializer=sideinput__pb2.SideInputResponse.FromString, + ) + self.IsReady = channel.unary_unary( + "/sideinput.v1.SideInput/IsReady", + request_serializer=google_dot_protobuf_dot_empty__pb2.Empty.SerializeToString, + response_deserializer=sideinput__pb2.ReadyResponse.FromString, + ) + + +class SideInputServicer(object): + """SideInput is the gRPC service for user-defined Side Inputs. + It is used to propagate changes in the values of the provided Side Inputs + which allows access to slow updated data or configuration without needing to retrieve + it during each message processing. + Through this service we should should be able to:- + 1) Invoke retrieval request for a single Side Input parameter, which in turn should + check for updates and return its latest value. + 2) Provide a health check endpoint to indicate whether the service is ready to be used. + """ + + def RetrieveSideInput(self, request, context): + """RetrieveSideInput is the endpoint to retrieve the latest value of a given Side Input.""" + context.set_code(grpc.StatusCode.UNIMPLEMENTED) + context.set_details("Method not implemented!") + raise NotImplementedError("Method not implemented!") + + def IsReady(self, request, context): + """IsReady is the health check endpoint to indicate whether the service is ready to be used.""" + context.set_code(grpc.StatusCode.UNIMPLEMENTED) + context.set_details("Method not implemented!") + raise NotImplementedError("Method not implemented!") + + +def add_SideInputServicer_to_server(servicer, server): + rpc_method_handlers = { + "RetrieveSideInput": grpc.unary_unary_rpc_method_handler( + servicer.RetrieveSideInput, + request_deserializer=google_dot_protobuf_dot_empty__pb2.Empty.FromString, + response_serializer=sideinput__pb2.SideInputResponse.SerializeToString, + ), + "IsReady": grpc.unary_unary_rpc_method_handler( + servicer.IsReady, + request_deserializer=google_dot_protobuf_dot_empty__pb2.Empty.FromString, + response_serializer=sideinput__pb2.ReadyResponse.SerializeToString, + ), + } + generic_handler = grpc.method_handlers_generic_handler( + "sideinput.v1.SideInput", rpc_method_handlers + ) + server.add_generic_rpc_handlers((generic_handler,)) + + +# This class is part of an EXPERIMENTAL API. +class SideInput(object): + """SideInput is the gRPC service for user-defined Side Inputs. + It is used to propagate changes in the values of the provided Side Inputs + which allows access to slow updated data or configuration without needing to retrieve + it during each message processing. + Through this service we should should be able to:- + 1) Invoke retrieval request for a single Side Input parameter, which in turn should + check for updates and return its latest value. + 2) Provide a health check endpoint to indicate whether the service is ready to be used. 
+ """ + + @staticmethod + def RetrieveSideInput( + request, + target, + options=(), + channel_credentials=None, + call_credentials=None, + insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None, + ): + return grpc.experimental.unary_unary( + request, + target, + "/sideinput.v1.SideInput/RetrieveSideInput", + google_dot_protobuf_dot_empty__pb2.Empty.SerializeToString, + sideinput__pb2.SideInputResponse.FromString, + options, + channel_credentials, + insecure, + call_credentials, + compression, + wait_for_ready, + timeout, + metadata, + ) + + @staticmethod + def IsReady( + request, + target, + options=(), + channel_credentials=None, + call_credentials=None, + insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None, + ): + return grpc.experimental.unary_unary( + request, + target, + "/sideinput.v1.SideInput/IsReady", + google_dot_protobuf_dot_empty__pb2.Empty.SerializeToString, + sideinput__pb2.ReadyResponse.FromString, + options, + channel_credentials, + insecure, + call_credentials, + compression, + wait_for_ready, + timeout, + metadata, + ) diff --git a/pynumaflow/sideinput/server.py b/pynumaflow/sideinput/server.py new file mode 100644 index 00000000..d786f0d7 --- /dev/null +++ b/pynumaflow/sideinput/server.py @@ -0,0 +1,115 @@ +import logging +import multiprocessing +import os +from concurrent.futures import ThreadPoolExecutor +from typing import Callable + +import grpc +from google.protobuf import empty_pb2 as _empty_pb2 + +from pynumaflow import setup_logging +from pynumaflow._constants import ( + MAX_MESSAGE_SIZE, + SIDE_INPUT_SOCK_PATH, +) +from pynumaflow.sideinput import Response +from pynumaflow.sideinput.proto import sideinput_pb2, sideinput_pb2_grpc +from pynumaflow.types import NumaflowServicerContext + +_LOGGER = setup_logging(__name__) +if os.getenv("PYTHONDEBUG"): + _LOGGER.setLevel(logging.DEBUG) + +RetrieverCallable = Callable[[], Response] +_PROCESS_COUNT = multiprocessing.cpu_count() +MAX_THREADS = int(os.getenv("MAX_THREADS", 0)) or (_PROCESS_COUNT * 4) + + +class SideInput(sideinput_pb2_grpc.SideInputServicer): + """ + Provides an interface to write a User Defined Side Input (UDSideInput) + which will be exposed over gRPC. + + Args: + handler: Function callable following the type signature of RetrieverCallable + sock_path: Path to the UNIX Domain Socket + max_message_size: The max message size in bytes the server can receive and send + max_threads: The max number of threads to be spawned; + defaults to number of processors x 4 + + Example invocation: + >>> from typing import List + >>> from pynumaflow.sideinput import Response, SideInput + >>> def my_handler() -> Response: + ... response = Response.broadcast_message(b"hello") + ... 
return response + >>> grpc_server = SideInput(my_handler) + >>> grpc_server.start() + """ + + SIDE_INPUT_DIR_PATH = "/var/numaflow/side-inputs" + + def __init__( + self, + handler: RetrieverCallable, + sock_path=SIDE_INPUT_SOCK_PATH, + max_message_size=MAX_MESSAGE_SIZE, + max_threads=MAX_THREADS, + ): + self.__retrieve_handler: RetrieverCallable = handler + self.sock_path = f"unix://{sock_path}" + self._max_message_size = max_message_size + self._max_threads = max_threads + self.cleanup_coroutines = [] + + self._server_options = [ + ("grpc.max_send_message_length", self._max_message_size), + ("grpc.max_receive_message_length", self._max_message_size), + ] + + def RetrieveSideInput( + self, request: _empty_pb2.Empty, context: NumaflowServicerContext + ) -> sideinput_pb2.SideInputResponse: + """ + Applies a sideinput function for a retrieval request. + The pascal case function name comes from the proto sideinput_pb2_grpc.py file. + """ + # if there is an exception, we will mark all the responses as a failure + try: + rspn = self.__retrieve_handler() + except Exception as err: + err_msg = "RetrieveSideInputErr: %r" % err + _LOGGER.critical(err_msg, exc_info=True) + context.set_code(grpc.StatusCode.UNKNOWN) + context.set_details(str(err)) + return sideinput_pb2.SideInputResponse(value=None, no_broadcast=True) + + return sideinput_pb2.SideInputResponse(value=rspn.value, no_broadcast=rspn.no_broadcast) + + def IsReady( + self, request: _empty_pb2.Empty, context: NumaflowServicerContext + ) -> sideinput_pb2.ReadyResponse: + """ + IsReady is the heartbeat endpoint for gRPC. + The pascal case function name comes from the proto sideinput_pb2_grpc.py file. + """ + return sideinput_pb2.ReadyResponse(ready=True) + + def start(self) -> None: + """ + Starts the gRPC server on the given UNIX socket with given max threads. + """ + server = grpc.server( + ThreadPoolExecutor(max_workers=self._max_threads), options=self._server_options + ) + sideinput_pb2_grpc.add_SideInputServicer_to_server( + SideInput(self.__retrieve_handler), server + ) + server.add_insecure_port(self.sock_path) + server.start() + _LOGGER.info( + "Side Input gRPC Server listening on: %s with max threads: %s", + self.sock_path, + self._max_threads, + ) + server.wait_for_termination() diff --git a/pynumaflow/sinker/__init__.py b/pynumaflow/sinker/__init__.py new file mode 100644 index 00000000..c6b5e679 --- /dev/null +++ b/pynumaflow/sinker/__init__.py @@ -0,0 +1,5 @@ +from pynumaflow.sinker._dtypes import Response, Responses, Datum +from pynumaflow.sinker.async_sink import AsyncSinker +from pynumaflow.sinker.server import Sinker + +__all__ = ["Response", "Responses", "Datum", "Sinker", "AsyncSinker"] diff --git a/pynumaflow/sinker/_dtypes.py b/pynumaflow/sinker/_dtypes.py new file mode 100644 index 00000000..1a020ac7 --- /dev/null +++ b/pynumaflow/sinker/_dtypes.py @@ -0,0 +1,164 @@ +from dataclasses import dataclass +from datetime import datetime +from typing import TypeVar, Optional, Callable +from collections.abc import Sequence, Iterator +from warnings import warn + +R = TypeVar("R", bound="Response") +Rs = TypeVar("Rs", bound="Responses") + + +@dataclass +class Response: + """ + Basic datatype for UDSink response. + + Args: + id: the id of the event. + success: boolean indicating whether the event was successfully processed. + err: error message if the event was not successfully processed. 
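+
+    A usage sketch (the ids and error text are illustrative):
+    >>> Response.as_success("msg-1")
+    >>> Response.as_failure("msg-2", "sink write failed")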
+ """ + + id: str + success: bool + err: Optional[str] + + __slots__ = ("id", "success", "err") + + @classmethod + def as_success(cls: type[R], id_: str) -> R: + return Response(id=id_, success=True, err=None) + + @classmethod + def as_failure(cls: type[R], id_: str, err_msg: str) -> R: + return Response(id=id_, success=False, err=err_msg) + + +class Responses(Sequence[R]): + """ + Container to hold a list of Response instances. + + Args: + responses: list of Response instances. + """ + + __slots__ = ("_responses",) + + def __init__(self, *responses: R): + self._responses = list(responses) or [] + + def __str__(self) -> str: + return str(self._responses) + + def __repr__(self) -> str: + return str(self) + + def __len__(self) -> int: + return len(self._responses) + + def __iter__(self) -> Iterator[R]: + return iter(self._responses) + + def __getitem__(self, index: int) -> R: + if isinstance(index, slice): + raise TypeError("Slicing is not supported for Responses") + return self._responses[index] + + def append(self, response: R) -> None: + self._responses.append(response) + + def items(self) -> list[R]: + warn( + "Using items is deprecated and will be removed in v0.5. " + "Iterate or index the Responses object instead.", + DeprecationWarning, + stacklevel=2, + ) + return self._responses + + +@dataclass(init=False, repr=False) +class Datum: + """ + Class to define the important information for the event. + Args: + keys: the keys of the event. + value: the payload of the event. + event_time: the event time of the event. + watermark: the watermark of the event. + >>> # Example usage + >>> from pynumaflow.sinker import Datum + >>> from datetime import datetime, timezone + >>> payload = bytes("test_mock_message", encoding="utf-8") + >>> t1 = datetime.fromtimestamp(1662998400, timezone.utc) + >>> t2 = datetime.fromtimestamp(1662998460, timezone.utc) + >>> msg_id = "test_id" + >>> output_keys = ["test_key"] + >>> d = Datum(keys=output_keys, sink_msg_id=msg_id, value=payload, event_time=t1, watermark=t2) + """ + + __slots__ = ("_keys", "_id", "_value", "_event_time", "_watermark") + + _keys: list[str] + _id: str + _value: bytes + _event_time: datetime + _watermark: datetime + + def __init__( + self, + keys: list[str], + sink_msg_id: str, + value: bytes, + event_time: datetime, + watermark: datetime, + ): + self._keys = keys + self._id = sink_msg_id or "" + self._value = value or b"" + if not isinstance(event_time, datetime): + raise TypeError(f"Wrong data type: {type(event_time)} for Datum.event_time") + self._event_time = event_time + if not isinstance(watermark, datetime): + raise TypeError(f"Wrong data type: {type(watermark)} for Datum.watermark") + self._watermark = watermark + + def __str__(self): + value_string = self._value.decode("utf-8") + return ( + f"keys: {self._keys}, " + f"id: {self._id}, value: {value_string}, " + f"event_time: {str(self._event_time)}, " + f"watermark: {str(self._watermark)}" + ) + + def __repr__(self): + return str(self) + + @property + def id(self) -> str: + """Returns the id of the event.""" + return self._id + + @property + def keys(self) -> list[str]: + """Returns the keys of the event.""" + return self._keys + + @property + def value(self) -> bytes: + """Returns the value of the event.""" + return self._value + + @property + def event_time(self) -> datetime: + """Returns the event time of the event.""" + return self._event_time + + @property + def watermark(self) -> datetime: + """Returns the watermark of the event.""" + return self._watermark + + +SinkCallable 
= Callable[[Iterator[Datum]], Responses]
diff --git a/pynumaflow/sinker/async_sink.py b/pynumaflow/sinker/async_sink.py
new file mode 100644
index 00000000..8333710c
--- /dev/null
+++ b/pynumaflow/sinker/async_sink.py
@@ -0,0 +1,153 @@
+import logging
+import multiprocessing
+import os
+from collections.abc import AsyncIterable
+
+import grpc
+from google.protobuf import empty_pb2 as _empty_pb2
+
+from pynumaflow import setup_logging
+from pynumaflow._constants import (
+    SINK_SOCK_PATH,
+    MAX_MESSAGE_SIZE,
+)
+from pynumaflow.info.server import get_sdk_version, write as info_server_write
+from pynumaflow.info.types import ServerInfo, Protocol, Language, SERVER_INFO_FILE_PATH
+from pynumaflow.sinker import Responses, Datum, Response
+from pynumaflow.sinker._dtypes import SinkCallable
+from pynumaflow.sinker.proto import sink_pb2_grpc, sink_pb2
+from pynumaflow.types import NumaflowServicerContext
+
+_LOGGER = setup_logging(__name__)
+if os.getenv("PYTHONDEBUG"):
+    _LOGGER.setLevel(logging.DEBUG)
+
+_PROCESS_COUNT = multiprocessing.cpu_count()
+MAX_THREADS = int(os.getenv("MAX_THREADS", 0)) or (_PROCESS_COUNT * 4)
+
+
+async def datum_generator(
+    request_iterator: AsyncIterable[sink_pb2.SinkRequest],
+) -> AsyncIterable[Datum]:
+    async for d in request_iterator:
+        datum = Datum(
+            keys=list(d.keys),
+            sink_msg_id=d.id,
+            value=d.value,
+            event_time=d.event_time.ToDatetime(),
+            watermark=d.watermark.ToDatetime(),
+        )
+        yield datum
+
+
+class AsyncSinker(sink_pb2_grpc.SinkServicer):
+    """
+    Provides an interface to write an Async Sinker
+    which will be exposed over an Asynchronous gRPC server.
+
+    Args:
+        handler: Function callable following the type signature of SinkCallable
+        sock_path: Path to the UNIX Domain Socket
+        max_message_size: The max message size in bytes the server can receive and send
+        max_threads: The max number of threads to be spawned;
+                     defaults to number of processors x 4
+
+    Example invocation:
+    >>> import aiorun
+    >>> from collections.abc import AsyncIterable
+    >>> from pynumaflow.sinker import Datum, Responses, Response, AsyncSinker
+    >>> async def my_handler(datums: AsyncIterable[Datum]) -> Responses:
+    ...     responses = Responses()
+    ...     async for msg in datums:
+    ...         responses.append(Response.as_success(msg.id))
+    ...     return responses
+    >>> grpc_server = AsyncSinker(handler=my_handler)
+    >>> aiorun.run(grpc_server.start())
+    """
+
+    def __init__(
+        self,
+        handler: SinkCallable,
+        sock_path=SINK_SOCK_PATH,
+        max_message_size=MAX_MESSAGE_SIZE,
+        max_threads=MAX_THREADS,
+    ):
+        self.background_tasks = set()
+        self.__sink_handler: SinkCallable = handler
+        self.sock_path = f"unix://{sock_path}"
+        self._max_message_size = max_message_size
+        self._max_threads = max_threads
+        self.cleanup_coroutines = []
+
+        self._server_options = [
+            ("grpc.max_send_message_length", self._max_message_size),
+            ("grpc.max_receive_message_length", self._max_message_size),
+        ]
+
+    async def SinkFn(
+        self,
+        request_iterator: AsyncIterable[sink_pb2.SinkRequest],
+        context: NumaflowServicerContext,
+    ) -> sink_pb2.SinkResponse:
+        """
+        Applies a sink function to a list of datum elements.
+        The pascal case function name comes from the proto sink_pb2_grpc.py file.
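+
+        Note: the handler consumes the entire request stream before a single
+        SinkResponse (one Result per datum) is returned.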
+        """
+        # if there is an exception, we will mark all the responses as a failure
+        datum_iterator = datum_generator(request_iterator=request_iterator)
+        results = await self.__invoke_sink(datum_iterator)
+
+        return sink_pb2.SinkResponse(results=results)
+
+    async def __invoke_sink(self, datum_iterator: AsyncIterable[Datum]):
+        try:
+            rspns = await self.__sink_handler(datum_iterator)
+        except Exception as err:
+            err_msg = "UDSinkError: %r" % err
+            _LOGGER.critical(err_msg, exc_info=True)
+            rspns = Responses()
+            async for _datum in datum_iterator:
+                rspns.append(Response.as_failure(_datum.id, err_msg))
+        responses = []
+        for rspn in rspns:
+            responses.append(
+                sink_pb2.SinkResponse.Result(id=rspn.id, success=rspn.success, err_msg=rspn.err)
+            )
+        return responses
+
+    async def IsReady(
+        self, request: _empty_pb2.Empty, context: NumaflowServicerContext
+    ) -> sink_pb2.ReadyResponse:
+        """
+        IsReady is the heartbeat endpoint for gRPC.
+        The pascal case function name comes from the proto sink_pb2_grpc.py file.
+        """
+        return sink_pb2.ReadyResponse(ready=True)
+
+    async def __serve_async(self, server) -> None:
+        sink_pb2_grpc.add_SinkServicer_to_server(AsyncSinker(self.__sink_handler), server)
+        server.add_insecure_port(self.sock_path)
+        _LOGGER.info("GRPC Async Server listening on: %s", self.sock_path)
+        await server.start()
+        serv_info = ServerInfo(
+            protocol=Protocol.UDS,
+            language=Language.PYTHON,
+            version=get_sdk_version(),
+        )
+        info_server_write(server_info=serv_info, info_file=SERVER_INFO_FILE_PATH)
+
+        async def server_graceful_shutdown():
+            """
+            Shuts down the server with a 5 second grace period. During the
+            grace period, the server won't accept new connections and allows
+            existing RPCs to continue within the grace period.
+            """
+            _LOGGER.info("Starting graceful shutdown...")
+            await server.stop(5)
+
+        self.cleanup_coroutines.append(server_graceful_shutdown())
+        await server.wait_for_termination()
+
+    async def start(self) -> None:
+        """Starts the Async gRPC server on the given UNIX socket."""
+        server = grpc.aio.server(options=self._server_options)
+        await self.__serve_async(server)
diff --git a/pynumaflow/sinker/proto/__init__.py b/pynumaflow/sinker/proto/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/pynumaflow/sinker/proto/sink.proto b/pynumaflow/sinker/proto/sink.proto
new file mode 100644
index 00000000..1c97077a
--- /dev/null
+++ b/pynumaflow/sinker/proto/sink.proto
@@ -0,0 +1,50 @@
+syntax = "proto3";
+
+option go_package = "github.com/numaproj/numaflow-go/pkg/apis/proto/sink/v1";
+
+import "google/protobuf/empty.proto";
+import "google/protobuf/timestamp.proto";
+
+
+package sink.v1;
+
+service Sink {
+  // SinkFn writes the request to a user defined sink.
+  rpc SinkFn(stream SinkRequest) returns (SinkResponse);
+
+  // IsReady is the heartbeat endpoint for gRPC.
+  rpc IsReady(google.protobuf.Empty) returns (ReadyResponse);
+}
+
+/**
+ * SinkRequest represents a request element.
+ */
+message SinkRequest {
+  repeated string keys = 1;
+  bytes value = 2;
+  google.protobuf.Timestamp event_time = 3;
+  google.protobuf.Timestamp watermark = 4;
+  string id = 5;
+}
+
+/**
+ * ReadyResponse is the health check result.
+ */
+message ReadyResponse {
+  bool ready = 1;
+}
+
+/**
+ * SinkResponse is the individual response of each message written to the sink.
+ */
+message SinkResponse {
+  message Result {
+    // id is the ID of the message, can be used to uniquely identify the message.
+    string id = 1;
+    // success denotes the status of persisting to disk.
if set to false, it means writing to sink for the message failed. + bool success = 2; + // err_msg is the error message, set it if success is set to false. + string err_msg = 3; + } + repeated Result results = 1; +} \ No newline at end of file diff --git a/pynumaflow/sinker/proto/sink_pb2.py b/pynumaflow/sinker/proto/sink_pb2.py new file mode 100644 index 00000000..b6182a45 --- /dev/null +++ b/pynumaflow/sinker/proto/sink_pb2.py @@ -0,0 +1,39 @@ +# -*- coding: utf-8 -*- +# Generated by the protocol buffer compiler. DO NOT EDIT! +# source: sink.proto +"""Generated protocol buffer code.""" +from google.protobuf import descriptor as _descriptor +from google.protobuf import descriptor_pool as _descriptor_pool +from google.protobuf import symbol_database as _symbol_database +from google.protobuf.internal import builder as _builder + +# @@protoc_insertion_point(imports) + +_sym_db = _symbol_database.Default() + + +from google.protobuf import empty_pb2 as google_dot_protobuf_dot_empty__pb2 +from google.protobuf import timestamp_pb2 as google_dot_protobuf_dot_timestamp__pb2 + + +DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile( + b'\n\nsink.proto\x12\x07sink.v1\x1a\x1bgoogle/protobuf/empty.proto\x1a\x1fgoogle/protobuf/timestamp.proto"\x95\x01\n\x0bSinkRequest\x12\x0c\n\x04keys\x18\x01 \x03(\t\x12\r\n\x05value\x18\x02 \x01(\x0c\x12.\n\nevent_time\x18\x03 \x01(\x0b\x32\x1a.google.protobuf.Timestamp\x12-\n\twatermark\x18\x04 \x01(\x0b\x32\x1a.google.protobuf.Timestamp\x12\n\n\x02id\x18\x05 \x01(\t"\x1e\n\rReadyResponse\x12\r\n\x05ready\x18\x01 \x01(\x08"u\n\x0cSinkResponse\x12-\n\x07results\x18\x01 \x03(\x0b\x32\x1c.sink.v1.SinkResponse.Result\x1a\x36\n\x06Result\x12\n\n\x02id\x18\x01 \x01(\t\x12\x0f\n\x07success\x18\x02 \x01(\x08\x12\x0f\n\x07\x65rr_msg\x18\x03 \x01(\t2z\n\x04Sink\x12\x37\n\x06SinkFn\x12\x14.sink.v1.SinkRequest\x1a\x15.sink.v1.SinkResponse(\x01\x12\x39\n\x07IsReady\x12\x16.google.protobuf.Empty\x1a\x16.sink.v1.ReadyResponseB8Z6github.com/numaproj/numaflow-go/pkg/apis/proto/sink/v1b\x06proto3' +) + +_globals = globals() +_builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, _globals) +_builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, "sink_pb2", _globals) +if _descriptor._USE_C_DESCRIPTORS == False: + DESCRIPTOR._options = None + DESCRIPTOR._serialized_options = b"Z6github.com/numaproj/numaflow-go/pkg/apis/proto/sink/v1" + _globals["_SINKREQUEST"]._serialized_start = 86 + _globals["_SINKREQUEST"]._serialized_end = 235 + _globals["_READYRESPONSE"]._serialized_start = 237 + _globals["_READYRESPONSE"]._serialized_end = 267 + _globals["_SINKRESPONSE"]._serialized_start = 269 + _globals["_SINKRESPONSE"]._serialized_end = 386 + _globals["_SINKRESPONSE_RESULT"]._serialized_start = 332 + _globals["_SINKRESPONSE_RESULT"]._serialized_end = 386 + _globals["_SINK"]._serialized_start = 388 + _globals["_SINK"]._serialized_end = 510 +# @@protoc_insertion_point(module_scope) diff --git a/pynumaflow/sinker/proto/sink_pb2_grpc.py b/pynumaflow/sinker/proto/sink_pb2_grpc.py new file mode 100644 index 00000000..ef673e9d --- /dev/null +++ b/pynumaflow/sinker/proto/sink_pb2_grpc.py @@ -0,0 +1,123 @@ +# Generated by the gRPC Python protocol compiler plugin. DO NOT EDIT! +"""Client and server classes corresponding to protobuf-defined services.""" +import grpc + +from google.protobuf import empty_pb2 as google_dot_protobuf_dot_empty__pb2 +from . 
import sink_pb2 as sink__pb2 + + +class SinkStub(object): + """Missing associated documentation comment in .proto file.""" + + def __init__(self, channel): + """Constructor. + + Args: + channel: A grpc.Channel. + """ + self.SinkFn = channel.stream_unary( + "/sink.v1.Sink/SinkFn", + request_serializer=sink__pb2.SinkRequest.SerializeToString, + response_deserializer=sink__pb2.SinkResponse.FromString, + ) + self.IsReady = channel.unary_unary( + "/sink.v1.Sink/IsReady", + request_serializer=google_dot_protobuf_dot_empty__pb2.Empty.SerializeToString, + response_deserializer=sink__pb2.ReadyResponse.FromString, + ) + + +class SinkServicer(object): + """Missing associated documentation comment in .proto file.""" + + def SinkFn(self, request_iterator, context): + """SinkFn writes the request to a user defined sink.""" + context.set_code(grpc.StatusCode.UNIMPLEMENTED) + context.set_details("Method not implemented!") + raise NotImplementedError("Method not implemented!") + + def IsReady(self, request, context): + """IsReady is the heartbeat endpoint for gRPC.""" + context.set_code(grpc.StatusCode.UNIMPLEMENTED) + context.set_details("Method not implemented!") + raise NotImplementedError("Method not implemented!") + + +def add_SinkServicer_to_server(servicer, server): + rpc_method_handlers = { + "SinkFn": grpc.stream_unary_rpc_method_handler( + servicer.SinkFn, + request_deserializer=sink__pb2.SinkRequest.FromString, + response_serializer=sink__pb2.SinkResponse.SerializeToString, + ), + "IsReady": grpc.unary_unary_rpc_method_handler( + servicer.IsReady, + request_deserializer=google_dot_protobuf_dot_empty__pb2.Empty.FromString, + response_serializer=sink__pb2.ReadyResponse.SerializeToString, + ), + } + generic_handler = grpc.method_handlers_generic_handler("sink.v1.Sink", rpc_method_handlers) + server.add_generic_rpc_handlers((generic_handler,)) + + +# This class is part of an EXPERIMENTAL API. 
+class Sink(object): + """Missing associated documentation comment in .proto file.""" + + @staticmethod + def SinkFn( + request_iterator, + target, + options=(), + channel_credentials=None, + call_credentials=None, + insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None, + ): + return grpc.experimental.stream_unary( + request_iterator, + target, + "/sink.v1.Sink/SinkFn", + sink__pb2.SinkRequest.SerializeToString, + sink__pb2.SinkResponse.FromString, + options, + channel_credentials, + insecure, + call_credentials, + compression, + wait_for_ready, + timeout, + metadata, + ) + + @staticmethod + def IsReady( + request, + target, + options=(), + channel_credentials=None, + call_credentials=None, + insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None, + ): + return grpc.experimental.unary_unary( + request, + target, + "/sink.v1.Sink/IsReady", + google_dot_protobuf_dot_empty__pb2.Empty.SerializeToString, + sink__pb2.ReadyResponse.FromString, + options, + channel_credentials, + insecure, + call_credentials, + compression, + wait_for_ready, + timeout, + metadata, + ) diff --git a/pynumaflow/sinker/server.py b/pynumaflow/sinker/server.py new file mode 100644 index 00000000..195cee10 --- /dev/null +++ b/pynumaflow/sinker/server.py @@ -0,0 +1,139 @@ +import logging +import multiprocessing +import os +from concurrent.futures import ThreadPoolExecutor +from collections.abc import Iterator, Iterable + +import grpc +from google.protobuf import empty_pb2 as _empty_pb2 + +from pynumaflow import setup_logging +from pynumaflow._constants import ( + SINK_SOCK_PATH, + MAX_MESSAGE_SIZE, +) +from pynumaflow.info.server import get_sdk_version, write as info_server_write +from pynumaflow.info.types import ServerInfo, Protocol, Language, SERVER_INFO_FILE_PATH +from pynumaflow.sinker import Responses, Datum, Response +from pynumaflow.sinker._dtypes import SinkCallable +from pynumaflow.sinker.proto import sink_pb2_grpc, sink_pb2 +from pynumaflow.types import NumaflowServicerContext + +_LOGGER = setup_logging(__name__) +if os.getenv("PYTHONDEBUG"): + _LOGGER.setLevel(logging.DEBUG) + + +_PROCESS_COUNT = multiprocessing.cpu_count() +MAX_THREADS = int(os.getenv("MAX_THREADS", 0)) or (_PROCESS_COUNT * 4) + + +def datum_generator(request_iterator: Iterable[sink_pb2.SinkRequest]) -> Iterable[Datum]: + for d in request_iterator: + datum = Datum( + keys=list(d.keys), + sink_msg_id=d.id, + value=d.value, + event_time=d.event_time.ToDatetime(), + watermark=d.watermark.ToDatetime(), + ) + yield datum + + +class Sinker(sink_pb2_grpc.SinkServicer): + """ + Provides an interface to write a Sinker + which will be exposed over gRPC. + + Args: + handler: Function callable following the type signature of SinkCallable + sock_path: Path to the UNIX Domain Socket + max_message_size: The max message size in bytes the server can receive and send + max_threads: The max number of threads to be spawned; + defaults to number of processors x 4 + + Example invocation: + >>> from typing import List + >>> from pynumaflow.sinker import Datum, Responses, Response, Sinker + >>> def my_handler(datums: Iterator[Datum]) -> Responses: + ... responses = Responses() + ... for msg in datums: + ... responses.append(Response.as_success(msg.id)) + ... 
return responses + >>> grpc_server = Sinker(handler=my_handler) + >>> grpc_server.start() + """ + + def __init__( + self, + handler: SinkCallable, + sock_path=SINK_SOCK_PATH, + max_message_size=MAX_MESSAGE_SIZE, + max_threads=MAX_THREADS, + ): + self.__sink_handler: SinkCallable = handler + self.sock_path = f"unix://{sock_path}" + self._max_message_size = max_message_size + self._max_threads = max_threads + + self._server_options = [ + ("grpc.max_send_message_length", self._max_message_size), + ("grpc.max_receive_message_length", self._max_message_size), + ] + + def SinkFn( + self, request_iterator: Iterator[sink_pb2.SinkRequest], context: NumaflowServicerContext + ) -> sink_pb2.SinkResponse: + """ + Applies a sink function to a list of datum elements. + The pascal case function name comes from the proto sink_pb2_grpc.py file. + """ + # if there is an exception, we will mark all the responses as a failure + datum_iterator = datum_generator(request_iterator) + try: + rspns = self.__sink_handler(datum_iterator) + except Exception as err: + err_msg = "UDSinkError: %r" % err + _LOGGER.critical(err_msg, exc_info=True) + rspns = Responses() + for _datum in datum_iterator: + rspns.append(Response.as_failure(_datum.id, err_msg)) + + responses = [] + for rspn in rspns: + responses.append( + sink_pb2.SinkResponse.Result(id=rspn.id, success=rspn.success, err_msg=rspn.err) + ) + + return sink_pb2.SinkResponse(results=responses) + + def IsReady( + self, request: _empty_pb2.Empty, context: NumaflowServicerContext + ) -> sink_pb2.ReadyResponse: + """ + IsReady is the heartbeat endpoint for gRPC. + The pascal case function name comes from the proto sink_pb2_grpc.py file. + """ + return sink_pb2.ReadyResponse(ready=True) + + def start(self) -> None: + """ + Starts the gRPC server on the given UNIX socket with given max threads. + """ + server = grpc.server( + ThreadPoolExecutor(max_workers=self._max_threads), options=self._server_options + ) + sink_pb2_grpc.add_SinkServicer_to_server(Sinker(self.__sink_handler), server) + server.add_insecure_port(self.sock_path) + server.start() + serv_info = ServerInfo( + protocol=Protocol.UDS, + language=Language.PYTHON, + version=get_sdk_version(), + ) + info_server_write(server_info=serv_info, info_file=SERVER_INFO_FILE_PATH) + + _LOGGER.info( + "GRPC Server listening on: %s with max threads: %s", self.sock_path, self._max_threads + ) + server.wait_for_termination() From 3e3238adbd65ddd954189d828fd2108c5cbed144 Mon Sep 17 00:00:00 2001 From: Sidhant Kohli Date: Mon, 8 Jan 2024 17:03:33 -0800 Subject: [PATCH 28/78] sink Signed-off-by: Sidhant Kohli --- pynumaflow/_constants.py | 13 +++++ pynumaflow/mapper/map.py | 13 +++-- pynumaflow/shared/server.py | 12 +++-- pynumaflow/sinker/_dtypes.py | 29 +++++++++- pynumaflow/sinker/async_sink.py | 48 ----------------- pynumaflow/sinker/server.py | 41 --------------- pynumaflow/sinker/sink.py | 93 +++++++++++++++++++++++++++++++++ 7 files changed, 152 insertions(+), 97 deletions(-) create mode 100644 pynumaflow/sinker/sink.py diff --git a/pynumaflow/_constants.py b/pynumaflow/_constants.py index e1ab5463..0330d70b 100644 --- a/pynumaflow/_constants.py +++ b/pynumaflow/_constants.py @@ -41,3 +41,16 @@ class ServerType(str, Enum): Sync = "sync" Async = "async" Multiproc = "multiproc" + + +class UDFType(str, Enum): + """ + Enumerate the type of UDF. 
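+
+    Members are plain strings; an illustrative check:
+        >>> UDFType.Sink.value
+        'sink'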
+ """ + + Map = "map" + Reduce = "reduce" + Sink = "sink" + Source = "source" + SideInput = "sideinput" + SourceTransformer = "sourcetransformer" diff --git a/pynumaflow/mapper/map.py b/pynumaflow/mapper/map.py index 6ecae55d..9777378e 100644 --- a/pynumaflow/mapper/map.py +++ b/pynumaflow/mapper/map.py @@ -3,7 +3,14 @@ import aiorun import grpc -from pynumaflow._constants import MAX_THREADS, MAX_MESSAGE_SIZE, _LOGGER, MAP_SOCK_PATH, ServerType +from pynumaflow._constants import ( + MAX_THREADS, + MAX_MESSAGE_SIZE, + _LOGGER, + MAP_SOCK_PATH, + ServerType, + UDFType, +) from pynumaflow.mapper import Mapper, AsyncMapper from pynumaflow.mapper._dtypes import MapCallable from pynumaflow.mapper.proto import map_pb2_grpc @@ -91,7 +98,7 @@ def exec(self): bind_address=self.sock_path, max_threads=self.max_threads, server_options=self._server_options, - udf_type="Map", + udf_type=UDFType.Map, ) def exec_multiproc(self): @@ -106,7 +113,7 @@ def exec_multiproc(self): servicer=map_servicer, process_count=self._process_count, server_options=self._server_options, - udf_type="Map", + udf_type=UDFType.Map, ) async def aexec(self) -> None: diff --git a/pynumaflow/shared/server.py b/pynumaflow/shared/server.py index 03074d94..c360a86d 100644 --- a/pynumaflow/shared/server.py +++ b/pynumaflow/shared/server.py @@ -10,6 +10,7 @@ from pynumaflow._constants import ( _LOGGER, MULTIPROC_MAP_SOCK_ADDR, + UDFType, ) from pynumaflow.exceptions import SocketError from pynumaflow.info.server import get_sdk_version, write as info_server_write, get_metadata_env @@ -21,6 +22,7 @@ METADATA_ENVS, ) from pynumaflow.mapper.proto import map_pb2_grpc +from pynumaflow.sinker.proto import sink_pb2_grpc class NumaflowServer: @@ -83,7 +85,7 @@ def write_info_file(protocol: Protocol) -> None: def sync_server_start( - servicer, bind_address: str, max_threads: int, server_options=None, udf_type: str = "Map" + servicer, bind_address: str, max_threads: int, server_options=None, udf_type: str = UDFType.Map ): """ Starts the Synchronous server instance on the given UNIX socket with given max threads. 
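+    A typical call (illustrative values; the servicer must match udf_type):
+        sync_server_start(servicer, "unix:///var/run/numaflow/map.sock", MAX_THREADS, udf_type=UDFType.Map)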
@@ -120,11 +122,15 @@ def _run_server(
         ),
         options=server_options,
     )
-    if udf_type == "Map":
+    if udf_type == UDFType.Map:
         map_pb2_grpc.add_MapServicer_to_server(servicer, server)
+    elif udf_type == UDFType.Sink:
+        sink_pb2_grpc.add_SinkServicer_to_server(servicer, server)
+
     server.add_insecure_port(bind_address)
     server.start()
+    # Add the server information to the server info file if provided
     if server_info:
         info_server_write(server_info=server_info, info_file=SERVER_INFO_FILE_PATH)
@@ -133,7 +139,7 @@ def _run_server(
 
 
 def start_multiproc_server(
-    max_threads: int, servicer, process_count: int, server_options=None, udf_type: str = "Map"
+    max_threads: int, servicer, process_count: int, server_options=None, udf_type: str = UDFType.Map
 ):
     """
     Start N grpc servers in different processes where N = The number of CPUs or the
diff --git a/pynumaflow/sinker/_dtypes.py b/pynumaflow/sinker/_dtypes.py
index 1a020ac7..436dcf3a 100644
--- a/pynumaflow/sinker/_dtypes.py
+++ b/pynumaflow/sinker/_dtypes.py
@@ -1,6 +1,7 @@
+from abc import abstractmethod, ABCMeta
 from dataclasses import dataclass
 from datetime import datetime
-from typing import TypeVar, Optional, Callable
+from typing import TypeVar, Optional, Callable, Union
 from collections.abc import Sequence, Iterator
 from warnings import warn
 
@@ -161,4 +162,28 @@ def watermark(self) -> datetime:
         return self._watermark
 
 
-SinkCallable = Callable[[Iterator[Datum]], Responses]
+class SinkerClass(metaclass=ABCMeta):
+    """
+    Provides an interface to write a Sinker
+    which will be exposed over a gRPC server.
+    """
+
+    def __call__(self, *args, **kwargs):
+        """
+        Allows the class instance to be called directly as the handler function.
+        """
+        return self.handler(*args, **kwargs)
+
+    @abstractmethod
+    def handler(self, datums: Iterator[Datum]) -> Responses:
+        """
+        Write a handler function which implements the SinkCallable interface.
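+
+        A minimal subclass sketch (names are illustrative):
+
+        >>> class MySinker(SinkerClass):
+        ...     def handler(self, datums: Iterator[Datum]) -> Responses:
+        ...         responses = Responses()
+        ...         for msg in datums:
+        ...             responses.append(Response.as_success(msg.id))
+        ...         return responses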
+ """ + pass + + +SinkHandlerCallable = Callable[[Iterator[Datum]], Responses] +SinkCallable = Union[SinkerClass, SinkHandlerCallable] diff --git a/pynumaflow/sinker/async_sink.py b/pynumaflow/sinker/async_sink.py index 8333710c..6075fb2b 100644 --- a/pynumaflow/sinker/async_sink.py +++ b/pynumaflow/sinker/async_sink.py @@ -3,16 +3,9 @@ import os from collections.abc import AsyncIterable -import grpc from google.protobuf import empty_pb2 as _empty_pb2 from pynumaflow import setup_logging -from pynumaflow._constants import ( - SINK_SOCK_PATH, - MAX_MESSAGE_SIZE, -) -from pynumaflow.info.server import get_sdk_version, write as info_server_write -from pynumaflow.info.types import ServerInfo, Protocol, Language, SERVER_INFO_FILE_PATH from pynumaflow.sinker import Responses, Datum, Response from pynumaflow.sinker._dtypes import SinkCallable from pynumaflow.sinker.proto import sink_pb2_grpc, sink_pb2 @@ -67,22 +60,10 @@ class AsyncSinker(sink_pb2_grpc.SinkServicer): def __init__( self, handler: SinkCallable, - sock_path=SINK_SOCK_PATH, - max_message_size=MAX_MESSAGE_SIZE, - max_threads=MAX_THREADS, ): - self.background_tasks = set() self.__sink_handler: SinkCallable = handler - self.sock_path = f"unix://{sock_path}" - self._max_message_size = max_message_size - self._max_threads = max_threads self.cleanup_coroutines = [] - self._server_options = [ - ("grpc.max_send_message_length", self._max_message_size), - ("grpc.max_receive_message_length", self._max_message_size), - ] - async def SinkFn( self, request_iterator: AsyncIterable[sink_pb2.SinkRequest], @@ -122,32 +103,3 @@ async def IsReady( The pascal case function name comes from the proto sink_pb2_grpc.py file. """ return sink_pb2.ReadyResponse(ready=True) - - async def __serve_async(self, server) -> None: - sink_pb2_grpc.add_SinkServicer_to_server(AsyncSinker(self.__sink_handler), server) - server.add_insecure_port(self.sock_path) - _LOGGER.info("GRPC Async Server listening on: %s", self.sock_path) - await server.start() - serv_info = ServerInfo( - protocol=Protocol.UDS, - language=Language.PYTHON, - version=get_sdk_version(), - ) - info_server_write(server_info=serv_info, info_file=SERVER_INFO_FILE_PATH) - - async def server_graceful_shutdown(): - _LOGGER.info("Starting graceful shutdown...") - """ - Shuts down the server with 5 seconds of grace period. During the - grace period, the server won't accept new connections and allow - existing RPCs to continue within the grace period. 
- await server.stop(5) - """ - - self.cleanup_coroutines.append(server_graceful_shutdown()) - await server.wait_for_termination() - - async def start(self) -> None: - """Starts the Async gRPC server on the given UNIX socket.""" - server = grpc.aio.server(options=self._server_options) - await self.__serve_async(server) diff --git a/pynumaflow/sinker/server.py b/pynumaflow/sinker/server.py index 195cee10..e3da5465 100644 --- a/pynumaflow/sinker/server.py +++ b/pynumaflow/sinker/server.py @@ -1,19 +1,11 @@ import logging import multiprocessing import os -from concurrent.futures import ThreadPoolExecutor from collections.abc import Iterator, Iterable -import grpc from google.protobuf import empty_pb2 as _empty_pb2 from pynumaflow import setup_logging -from pynumaflow._constants import ( - SINK_SOCK_PATH, - MAX_MESSAGE_SIZE, -) -from pynumaflow.info.server import get_sdk_version, write as info_server_write -from pynumaflow.info.types import ServerInfo, Protocol, Language, SERVER_INFO_FILE_PATH from pynumaflow.sinker import Responses, Datum, Response from pynumaflow.sinker._dtypes import SinkCallable from pynumaflow.sinker.proto import sink_pb2_grpc, sink_pb2 @@ -67,19 +59,8 @@ class Sinker(sink_pb2_grpc.SinkServicer): def __init__( self, handler: SinkCallable, - sock_path=SINK_SOCK_PATH, - max_message_size=MAX_MESSAGE_SIZE, - max_threads=MAX_THREADS, ): self.__sink_handler: SinkCallable = handler - self.sock_path = f"unix://{sock_path}" - self._max_message_size = max_message_size - self._max_threads = max_threads - - self._server_options = [ - ("grpc.max_send_message_length", self._max_message_size), - ("grpc.max_receive_message_length", self._max_message_size), - ] def SinkFn( self, request_iterator: Iterator[sink_pb2.SinkRequest], context: NumaflowServicerContext @@ -115,25 +96,3 @@ def IsReady( The pascal case function name comes from the proto sink_pb2_grpc.py file. """ return sink_pb2.ReadyResponse(ready=True) - - def start(self) -> None: - """ - Starts the gRPC server on the given UNIX socket with given max threads. 
-        """
-        server = grpc.server(
-            ThreadPoolExecutor(max_workers=self._max_threads), options=self._server_options
-        )
-        sink_pb2_grpc.add_SinkServicer_to_server(Sinker(self.__sink_handler), server)
-        server.add_insecure_port(self.sock_path)
-        server.start()
-        serv_info = ServerInfo(
-            protocol=Protocol.UDS,
-            language=Language.PYTHON,
-            version=get_sdk_version(),
-        )
-        info_server_write(server_info=serv_info, info_file=SERVER_INFO_FILE_PATH)
-
-        _LOGGER.info(
-            "GRPC Server listening on: %s with max threads: %s", self.sock_path, self._max_threads
-        )
-        server.wait_for_termination()
diff --git a/pynumaflow/sinker/sink.py b/pynumaflow/sinker/sink.py
new file mode 100644
index 00000000..f62703cf
--- /dev/null
+++ b/pynumaflow/sinker/sink.py
@@ -0,0 +1,93 @@
+import os
+
+import aiorun
+import grpc
+
+from pynumaflow.sinker.async_sink import AsyncSinker
+from pynumaflow.sinker.proto import sink_pb2_grpc
+
+from pynumaflow.sinker.server import Sinker
+
+from pynumaflow._constants import (
+    SINK_SOCK_PATH,
+    MAX_MESSAGE_SIZE,
+    MAX_THREADS,
+    ServerType,
+    _LOGGER,
+    UDFType,
+)
+
+from pynumaflow.shared.server import NumaflowServer, sync_server_start, start_async_server
+from pynumaflow.sinker._dtypes import SinkCallable
+
+
+class SinkServer(NumaflowServer):
+    def __init__(
+        self,
+        sinker_instance: SinkCallable,
+        sock_path=SINK_SOCK_PATH,
+        max_message_size=MAX_MESSAGE_SIZE,
+        max_threads=MAX_THREADS,
+        server_type=ServerType.Sync,
+    ):
+        self.sock_path = f"unix://{sock_path}"
+        self.max_threads = min(max_threads, int(os.getenv("MAX_THREADS", "4")))
+        self.max_message_size = max_message_size
+
+        self.sinker_instance = sinker_instance
+        self.server_type = server_type
+
+        self._server_options = [
+            ("grpc.max_send_message_length", self.max_message_size),
+            ("grpc.max_receive_message_length", self.max_message_size),
+        ]
+
+    def start(self):
+        if self.server_type == ServerType.Sync:
+            self.exec()
+        elif self.server_type == ServerType.Async:
+            aiorun.run(self.aexec())
+        else:
+            _LOGGER.error("Server type not supported: %s", str(self.server_type))
+            raise NotImplementedError
+
+    def exec(self):
+        """
+        Starts the Synchronous gRPC server on the given UNIX socket with given max threads.
+        """
+        sink_servicer = self.get_servicer(
+            sinker_instance=self.sinker_instance, server_type=self.server_type
+        )
+        _LOGGER.info(
+            "Sync GRPC Sink listening on: %s with max threads: %s",
+            self.sock_path,
+            self.max_threads,
+        )
+
+        sync_server_start(
+            servicer=sink_servicer,
+            bind_address=self.sock_path,
+            max_threads=self.max_threads,
+            server_options=self._server_options,
+            udf_type=UDFType.Sink,
+        )
+
+    async def aexec(self):
+        """
+        Starts the Asynchronous gRPC server on the given UNIX socket with given max threads.
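+        Invoked from start() via aiorun when server_type is ServerType.Async.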
+ """ + server = grpc.aio.server() + server.add_insecure_port(self.sock_path) + sink_servicer = self.get_servicer( + sinker_instance=self.sinker_instance, server_type=self.server_type + ) + sink_pb2_grpc.add_SinkServicer_to_server(sink_servicer, server) + await start_async_server(server, self.sock_path, self.max_threads, self._server_options) + + def get_servicer(self, sinker_instance: SinkCallable, server_type: ServerType): + if server_type == ServerType.Sync: + return Sinker(sinker_instance) + elif server_type == ServerType.Async: + return AsyncSinker(sinker_instance) + else: + raise NotImplementedError From e35b156ab4435cc390850113cc8735192757982c Mon Sep 17 00:00:00 2001 From: Sidhant Kohli Date: Mon, 8 Jan 2024 17:09:22 -0800 Subject: [PATCH 29/78] sink Signed-off-by: Sidhant Kohli --- pynumaflow/mapper/__init__.py | 10 +++------- pynumaflow/mapper/map.py | 5 ++++- 2 files changed, 7 insertions(+), 8 deletions(-) diff --git a/pynumaflow/mapper/__init__.py b/pynumaflow/mapper/__init__.py index 3109f812..286d80ba 100644 --- a/pynumaflow/mapper/__init__.py +++ b/pynumaflow/mapper/__init__.py @@ -1,8 +1,6 @@ -from pynumaflow.mapper._dtypes import Message, Messages, Datum, DROP, MapperClass -from pynumaflow.mapper.async_server import AsyncMapper +from pynumaflow.mapper.map import MapServer -# from pynumaflow.mapper.multiproc_server import MultiProcMapper -from pynumaflow.mapper.server import Mapper +from pynumaflow.mapper._dtypes import Message, Messages, Datum, DROP, MapperClass from pynumaflow._constants import ServerType __all__ = [ @@ -10,9 +8,7 @@ "Messages", "Datum", "DROP", - "Mapper", - "AsyncMapper", "ServerType", "MapperClass", - # "MultiProcMapper", + "MapServer" ] diff --git a/pynumaflow/mapper/map.py b/pynumaflow/mapper/map.py index 9777378e..264a3862 100644 --- a/pynumaflow/mapper/map.py +++ b/pynumaflow/mapper/map.py @@ -2,6 +2,9 @@ import aiorun import grpc +from pynumaflow.mapper.async_server import AsyncMapper + +from pynumaflow.mapper.server import Mapper from pynumaflow._constants import ( MAX_THREADS, @@ -11,7 +14,7 @@ ServerType, UDFType, ) -from pynumaflow.mapper import Mapper, AsyncMapper + from pynumaflow.mapper._dtypes import MapCallable from pynumaflow.mapper.proto import map_pb2_grpc from pynumaflow.shared.server import ( From 1ba673dc961f07836a166f2df886b8805db98f62 Mon Sep 17 00:00:00 2001 From: Sidhant Kohli Date: Mon, 8 Jan 2024 17:09:42 -0800 Subject: [PATCH 30/78] sink Signed-off-by: Sidhant Kohli --- pynumaflow/mapper/__init__.py | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/pynumaflow/mapper/__init__.py b/pynumaflow/mapper/__init__.py index 286d80ba..6df5e6ec 100644 --- a/pynumaflow/mapper/__init__.py +++ b/pynumaflow/mapper/__init__.py @@ -3,12 +3,4 @@ from pynumaflow.mapper._dtypes import Message, Messages, Datum, DROP, MapperClass from pynumaflow._constants import ServerType -__all__ = [ - "Message", - "Messages", - "Datum", - "DROP", - "ServerType", - "MapperClass", - "MapServer" -] +__all__ = ["Message", "Messages", "Datum", "DROP", "ServerType", "MapperClass", "MapServer"] From 069d681ead861bc64459fae921282082268ddf7c Mon Sep 17 00:00:00 2001 From: Sidhant Kohli Date: Mon, 8 Jan 2024 17:10:41 -0800 Subject: [PATCH 31/78] sink Signed-off-by: Sidhant Kohli --- pynumaflow/sinker/__init__.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/pynumaflow/sinker/__init__.py b/pynumaflow/sinker/__init__.py index c6b5e679..f69fa254 100644 --- a/pynumaflow/sinker/__init__.py +++ 
b/pynumaflow/sinker/__init__.py @@ -1,5 +1,8 @@ -from pynumaflow.sinker._dtypes import Response, Responses, Datum -from pynumaflow.sinker.async_sink import AsyncSinker -from pynumaflow.sinker.server import Sinker +from pynumaflow._constants import ServerType + +from pynumaflow.sinker.sink import SinkServer + +from pynumaflow.sinker._dtypes import Response, Responses, Datum, SinkerClass + +__all__ = ["Response", "Responses", "Datum", "SinkerClass", "SinkServer", "ServerType"] -__all__ = ["Response", "Responses", "Datum", "Sinker", "AsyncSinker"] From 285a1aea6b07246461d0f969d1022a6e96103da6 Mon Sep 17 00:00:00 2001 From: Sidhant Kohli Date: Mon, 8 Jan 2024 17:38:40 -0800 Subject: [PATCH 32/78] sink Signed-off-by: Sidhant Kohli --- pynumaflow/mapper/async_server.py | 2 +- pynumaflow/mapper/utils.py | 2 +- pynumaflow/reducer/async_server.py | 2 +- pynumaflow/sinker/__init__.py | 1 - pynumaflow/sinker/async_sink.py | 2 +- pynumaflow/sinker/server.py | 2 +- 6 files changed, 5 insertions(+), 6 deletions(-) diff --git a/pynumaflow/mapper/async_server.py b/pynumaflow/mapper/async_server.py index d6c86f23..6dff5633 100644 --- a/pynumaflow/mapper/async_server.py +++ b/pynumaflow/mapper/async_server.py @@ -7,7 +7,7 @@ from google.protobuf import empty_pb2 as _empty_pb2 from pynumaflow import setup_logging -from pynumaflow.mapper import Datum +from pynumaflow.mapper._dtypes import Datum from pynumaflow.mapper._dtypes import MapAsyncCallable, MapCallable from pynumaflow.mapper.proto import map_pb2 from pynumaflow.mapper.proto import map_pb2_grpc diff --git a/pynumaflow/mapper/utils.py b/pynumaflow/mapper/utils.py index 42e3c9a4..8d2af982 100644 --- a/pynumaflow/mapper/utils.py +++ b/pynumaflow/mapper/utils.py @@ -1,7 +1,7 @@ import grpc from pynumaflow.mapper._dtypes import MapCallable -from pynumaflow.mapper import Datum +from pynumaflow.mapper._dtypes import Datum from pynumaflow.mapper.proto import map_pb2 from pynumaflow.types import NumaflowServicerContext from pynumaflow._constants import _LOGGER diff --git a/pynumaflow/reducer/async_server.py b/pynumaflow/reducer/async_server.py index b9cb9614..7b8c6e33 100644 --- a/pynumaflow/reducer/async_server.py +++ b/pynumaflow/reducer/async_server.py @@ -16,7 +16,7 @@ STREAM_EOF, DELIMITER, ) -from pynumaflow.reducer import Datum, IntervalWindow, Metadata +from pynumaflow.reducer._dtypes import Datum, IntervalWindow, Metadata from pynumaflow.reducer._dtypes import ReduceResult, ReduceCallable from pynumaflow.reducer.asynciter import NonBlockingIterator from pynumaflow.reducer.proto import reduce_pb2 diff --git a/pynumaflow/sinker/__init__.py b/pynumaflow/sinker/__init__.py index f69fa254..9b89b5d6 100644 --- a/pynumaflow/sinker/__init__.py +++ b/pynumaflow/sinker/__init__.py @@ -5,4 +5,3 @@ from pynumaflow.sinker._dtypes import Response, Responses, Datum, SinkerClass __all__ = ["Response", "Responses", "Datum", "SinkerClass", "SinkServer", "ServerType"] - diff --git a/pynumaflow/sinker/async_sink.py b/pynumaflow/sinker/async_sink.py index 6075fb2b..4b522543 100644 --- a/pynumaflow/sinker/async_sink.py +++ b/pynumaflow/sinker/async_sink.py @@ -6,7 +6,7 @@ from google.protobuf import empty_pb2 as _empty_pb2 from pynumaflow import setup_logging -from pynumaflow.sinker import Responses, Datum, Response +from pynumaflow.sinker._dtypes import Responses, Datum, Response from pynumaflow.sinker._dtypes import SinkCallable from pynumaflow.sinker.proto import sink_pb2_grpc, sink_pb2 from pynumaflow.types import NumaflowServicerContext diff --git 
a/pynumaflow/sinker/server.py b/pynumaflow/sinker/server.py index e3da5465..c0423a13 100644 --- a/pynumaflow/sinker/server.py +++ b/pynumaflow/sinker/server.py @@ -6,7 +6,7 @@ from google.protobuf import empty_pb2 as _empty_pb2 from pynumaflow import setup_logging -from pynumaflow.sinker import Responses, Datum, Response +from pynumaflow.sinker._dtypes import Responses, Datum, Response from pynumaflow.sinker._dtypes import SinkCallable from pynumaflow.sinker.proto import sink_pb2_grpc, sink_pb2 from pynumaflow.types import NumaflowServicerContext From e86fbe180ffbe39065d8cb0709b0aaa7cf04abc5 Mon Sep 17 00:00:00 2001 From: Sidhant Kohli Date: Mon, 8 Jan 2024 22:11:02 -0800 Subject: [PATCH 33/78] proto Signed-off-by: Sidhant Kohli --- pynumaflow/mapper/async_server.py | 3 +-- pynumaflow/mapper/map.py | 2 +- pynumaflow/mapper/server.py | 3 +-- pynumaflow/mapper/utils.py | 2 +- pynumaflow/{mapper => }/proto/__init__.py | 0 pynumaflow/{sinker/proto => proto/mapper}/__init__.py | 0 pynumaflow/{mapper/proto => proto/mapper}/map.proto | 0 pynumaflow/{mapper/proto => proto/mapper}/map_pb2.py | 0 pynumaflow/{mapper/proto => proto/mapper}/map_pb2_grpc.py | 2 +- pynumaflow/proto/sinker/__init__.py | 0 pynumaflow/{sinker/proto => proto/sinker}/sink.proto | 0 pynumaflow/{sinker/proto => proto/sinker}/sink_pb2.py | 0 pynumaflow/{sinker/proto => proto/sinker}/sink_pb2_grpc.py | 2 +- pynumaflow/shared/__init__.py | 4 ++++ pynumaflow/shared/server.py | 4 ++-- pynumaflow/sinker/async_sink.py | 2 +- pynumaflow/sinker/server.py | 2 +- pynumaflow/sinker/sink.py | 2 +- 18 files changed, 15 insertions(+), 13 deletions(-) rename pynumaflow/{mapper => }/proto/__init__.py (100%) rename pynumaflow/{sinker/proto => proto/mapper}/__init__.py (100%) rename pynumaflow/{mapper/proto => proto/mapper}/map.proto (100%) rename pynumaflow/{mapper/proto => proto/mapper}/map_pb2.py (100%) rename pynumaflow/{mapper/proto => proto/mapper}/map_pb2_grpc.py (98%) create mode 100644 pynumaflow/proto/sinker/__init__.py rename pynumaflow/{sinker/proto => proto/sinker}/sink.proto (100%) rename pynumaflow/{sinker/proto => proto/sinker}/sink_pb2.py (100%) rename pynumaflow/{sinker/proto => proto/sinker}/sink_pb2_grpc.py (98%) create mode 100644 pynumaflow/shared/__init__.py diff --git a/pynumaflow/mapper/async_server.py b/pynumaflow/mapper/async_server.py index 6dff5633..d902a362 100644 --- a/pynumaflow/mapper/async_server.py +++ b/pynumaflow/mapper/async_server.py @@ -9,8 +9,7 @@ from pynumaflow import setup_logging from pynumaflow.mapper._dtypes import Datum from pynumaflow.mapper._dtypes import MapAsyncCallable, MapCallable -from pynumaflow.mapper.proto import map_pb2 -from pynumaflow.mapper.proto import map_pb2_grpc +from pynumaflow.proto.mapper import map_pb2, map_pb2_grpc from pynumaflow.types import NumaflowServicerContext _LOGGER = setup_logging(__name__) diff --git a/pynumaflow/mapper/map.py b/pynumaflow/mapper/map.py index 264a3862..b2546170 100644 --- a/pynumaflow/mapper/map.py +++ b/pynumaflow/mapper/map.py @@ -16,7 +16,7 @@ ) from pynumaflow.mapper._dtypes import MapCallable -from pynumaflow.mapper.proto import map_pb2_grpc +from pynumaflow.proto.mapper import map_pb2_grpc from pynumaflow.shared.server import ( NumaflowServer, start_async_server, diff --git a/pynumaflow/mapper/server.py b/pynumaflow/mapper/server.py index 69b78d8b..83680cf9 100644 --- a/pynumaflow/mapper/server.py +++ b/pynumaflow/mapper/server.py @@ -5,8 +5,7 @@ from pynumaflow import setup_logging from pynumaflow.mapper._dtypes import MapCallable -from 
pynumaflow.mapper.proto import map_pb2 -from pynumaflow.mapper.proto import map_pb2_grpc +from pynumaflow.proto.mapper import map_pb2, map_pb2_grpc from pynumaflow.mapper.utils import _map_fn_util from pynumaflow.types import NumaflowServicerContext diff --git a/pynumaflow/mapper/utils.py b/pynumaflow/mapper/utils.py index 8d2af982..8e764cb9 100644 --- a/pynumaflow/mapper/utils.py +++ b/pynumaflow/mapper/utils.py @@ -2,7 +2,7 @@ from pynumaflow.mapper._dtypes import MapCallable from pynumaflow.mapper._dtypes import Datum -from pynumaflow.mapper.proto import map_pb2 +from pynumaflow.proto.mapper import map_pb2 from pynumaflow.types import NumaflowServicerContext from pynumaflow._constants import _LOGGER diff --git a/pynumaflow/mapper/proto/__init__.py b/pynumaflow/proto/__init__.py similarity index 100% rename from pynumaflow/mapper/proto/__init__.py rename to pynumaflow/proto/__init__.py diff --git a/pynumaflow/sinker/proto/__init__.py b/pynumaflow/proto/mapper/__init__.py similarity index 100% rename from pynumaflow/sinker/proto/__init__.py rename to pynumaflow/proto/mapper/__init__.py diff --git a/pynumaflow/mapper/proto/map.proto b/pynumaflow/proto/mapper/map.proto similarity index 100% rename from pynumaflow/mapper/proto/map.proto rename to pynumaflow/proto/mapper/map.proto diff --git a/pynumaflow/mapper/proto/map_pb2.py b/pynumaflow/proto/mapper/map_pb2.py similarity index 100% rename from pynumaflow/mapper/proto/map_pb2.py rename to pynumaflow/proto/mapper/map_pb2.py diff --git a/pynumaflow/mapper/proto/map_pb2_grpc.py b/pynumaflow/proto/mapper/map_pb2_grpc.py similarity index 98% rename from pynumaflow/mapper/proto/map_pb2_grpc.py rename to pynumaflow/proto/mapper/map_pb2_grpc.py index da8edc68..6973f6e7 100644 --- a/pynumaflow/mapper/proto/map_pb2_grpc.py +++ b/pynumaflow/proto/mapper/map_pb2_grpc.py @@ -3,7 +3,7 @@ import grpc from google.protobuf import empty_pb2 as google_dot_protobuf_dot_empty__pb2 -from . import map_pb2 as map__pb2 +from pynumaflow.proto.mapper import map_pb2 as map__pb2 class MapStub(object): diff --git a/pynumaflow/proto/sinker/__init__.py b/pynumaflow/proto/sinker/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/pynumaflow/sinker/proto/sink.proto b/pynumaflow/proto/sinker/sink.proto similarity index 100% rename from pynumaflow/sinker/proto/sink.proto rename to pynumaflow/proto/sinker/sink.proto diff --git a/pynumaflow/sinker/proto/sink_pb2.py b/pynumaflow/proto/sinker/sink_pb2.py similarity index 100% rename from pynumaflow/sinker/proto/sink_pb2.py rename to pynumaflow/proto/sinker/sink_pb2.py diff --git a/pynumaflow/sinker/proto/sink_pb2_grpc.py b/pynumaflow/proto/sinker/sink_pb2_grpc.py similarity index 98% rename from pynumaflow/sinker/proto/sink_pb2_grpc.py rename to pynumaflow/proto/sinker/sink_pb2_grpc.py index ef673e9d..92cf473b 100644 --- a/pynumaflow/sinker/proto/sink_pb2_grpc.py +++ b/pynumaflow/proto/sinker/sink_pb2_grpc.py @@ -3,7 +3,7 @@ import grpc from google.protobuf import empty_pb2 as google_dot_protobuf_dot_empty__pb2 -from . 
import sink_pb2 as sink__pb2 +from pynumaflow.proto.sinker import sink_pb2 as sink__pb2 class SinkStub(object): diff --git a/pynumaflow/shared/__init__.py b/pynumaflow/shared/__init__.py new file mode 100644 index 00000000..857f0a9f --- /dev/null +++ b/pynumaflow/shared/__init__.py @@ -0,0 +1,4 @@ +from pynumaflow.shared.server import NumaflowServer + + +__all__ = ["NumaflowServer"] diff --git a/pynumaflow/shared/server.py b/pynumaflow/shared/server.py index c360a86d..a7832831 100644 --- a/pynumaflow/shared/server.py +++ b/pynumaflow/shared/server.py @@ -21,8 +21,8 @@ SERVER_INFO_FILE_PATH, METADATA_ENVS, ) -from pynumaflow.mapper.proto import map_pb2_grpc -from pynumaflow.sinker.proto import sink_pb2_grpc +from pynumaflow.proto.mapper import map_pb2_grpc +from pynumaflow.proto.sinker import sink_pb2_grpc class NumaflowServer: diff --git a/pynumaflow/sinker/async_sink.py b/pynumaflow/sinker/async_sink.py index 4b522543..c705b6ae 100644 --- a/pynumaflow/sinker/async_sink.py +++ b/pynumaflow/sinker/async_sink.py @@ -8,7 +8,7 @@ from pynumaflow import setup_logging from pynumaflow.sinker._dtypes import Responses, Datum, Response from pynumaflow.sinker._dtypes import SinkCallable -from pynumaflow.sinker.proto import sink_pb2_grpc, sink_pb2 +from pynumaflow.proto.sinker import sink_pb2_grpc, sink_pb2 from pynumaflow.types import NumaflowServicerContext _LOGGER = setup_logging(__name__) diff --git a/pynumaflow/sinker/server.py b/pynumaflow/sinker/server.py index c0423a13..ee928845 100644 --- a/pynumaflow/sinker/server.py +++ b/pynumaflow/sinker/server.py @@ -8,7 +8,7 @@ from pynumaflow import setup_logging from pynumaflow.sinker._dtypes import Responses, Datum, Response from pynumaflow.sinker._dtypes import SinkCallable -from pynumaflow.sinker.proto import sink_pb2_grpc, sink_pb2 +from pynumaflow.proto.sinker import sink_pb2_grpc, sink_pb2 from pynumaflow.types import NumaflowServicerContext _LOGGER = setup_logging(__name__) diff --git a/pynumaflow/sinker/sink.py b/pynumaflow/sinker/sink.py index f62703cf..b4f898a9 100644 --- a/pynumaflow/sinker/sink.py +++ b/pynumaflow/sinker/sink.py @@ -4,7 +4,7 @@ import grpc from pynumaflow.sinker.async_sink import AsyncSinker -from pynumaflow.sinker.proto import sink_pb2_grpc +from pynumaflow.proto.sinker import sink_pb2_grpc from pynumaflow.sinker.server import Sinker From 98dd01380c0d9565c93e109c22fe316a7690a25c Mon Sep 17 00:00:00 2001 From: Sidhant Kohli Date: Tue, 9 Jan 2024 11:19:54 -0800 Subject: [PATCH 34/78] transform Signed-off-by: Sidhant Kohli --- .../sourcetransformer/multiproc_server.py | 206 ------------------ pynumaflow/mapper/map.py | 1 + pynumaflow/mapstreamer/async_server.py | 3 +- pynumaflow/mapstreamer/mapstream.py | 2 +- .../proto/mapstreamer}/__init__.py | 0 .../mapstreamer}/mapstream.proto | 0 .../mapstreamer}/mapstream_pb2.py | 0 .../mapstreamer}/mapstream_pb2_grpc.py | 2 +- .../proto => proto/reducer}/__init__.py | 0 .../proto => proto/reducer}/reduce.proto | 0 .../proto => proto/reducer}/reduce_pb2.py | 0 .../reducer}/reduce_pb2_grpc.py | 2 +- .../proto => proto/sideinput}/__init__.py | 0 .../proto => proto/sideinput}/sideinput.proto | 0 .../sideinput}/sideinput_pb2.py | 0 .../sideinput}/sideinput_pb2_grpc.py | 2 +- .../sourcetransformer}/__init__.py | 0 .../proto/sourcetransformer}/transform.proto | 0 .../proto/sourcetransformer}/transform_pb2.py | 0 .../sourcetransformer}/transform_pb2_grpc.py | 0 pynumaflow/reducer/async_server.py | 3 +- pynumaflow/reducer/reduce.py | 2 +- pynumaflow/shared/server.py | 3 + 
pynumaflow/sideinput/server.py | 2 +- pynumaflow/sourcetransformer/__init__.py | 20 ++ .../sourcetransformer/_dtypes.py | 29 ++- .../sourcetransformer/server.py | 47 +--- .../sourcetransformer/sourcetransform.py | 121 ++++++++++ 28 files changed, 182 insertions(+), 263 deletions(-) delete mode 100644 pynumaflow-old/sourcetransformer/multiproc_server.py rename {pynumaflow-old/sourcetransformer/proto => pynumaflow/proto/mapstreamer}/__init__.py (100%) rename pynumaflow/{mapstreamer/proto => proto/mapstreamer}/mapstream.proto (100%) rename pynumaflow/{mapstreamer/proto => proto/mapstreamer}/mapstream_pb2.py (100%) rename pynumaflow/{mapstreamer/proto => proto/mapstreamer}/mapstream_pb2_grpc.py (98%) rename pynumaflow/{mapstreamer/proto => proto/reducer}/__init__.py (100%) rename pynumaflow/{reducer/proto => proto/reducer}/reduce.proto (100%) rename pynumaflow/{reducer/proto => proto/reducer}/reduce_pb2.py (100%) rename pynumaflow/{reducer/proto => proto/reducer}/reduce_pb2_grpc.py (98%) rename pynumaflow/{reducer/proto => proto/sideinput}/__init__.py (100%) rename pynumaflow/{sideinput/proto => proto/sideinput}/sideinput.proto (100%) rename pynumaflow/{sideinput/proto => proto/sideinput}/sideinput_pb2.py (100%) rename pynumaflow/{sideinput/proto => proto/sideinput}/sideinput_pb2_grpc.py (98%) rename pynumaflow/{sideinput/proto => proto/sourcetransformer}/__init__.py (100%) rename {pynumaflow-old/sourcetransformer/proto => pynumaflow/proto/sourcetransformer}/transform.proto (100%) rename {pynumaflow-old/sourcetransformer/proto => pynumaflow/proto/sourcetransformer}/transform_pb2.py (100%) rename {pynumaflow-old/sourcetransformer/proto => pynumaflow/proto/sourcetransformer}/transform_pb2_grpc.py (100%) create mode 100644 pynumaflow/sourcetransformer/__init__.py rename {pynumaflow-old => pynumaflow}/sourcetransformer/_dtypes.py (85%) rename {pynumaflow-old => pynumaflow}/sourcetransformer/server.py (66%) create mode 100644 pynumaflow/sourcetransformer/sourcetransform.py diff --git a/pynumaflow-old/sourcetransformer/multiproc_server.py b/pynumaflow-old/sourcetransformer/multiproc_server.py deleted file mode 100644 index 7aa58e9d..00000000 --- a/pynumaflow-old/sourcetransformer/multiproc_server.py +++ /dev/null @@ -1,206 +0,0 @@ -import contextlib -import logging -import multiprocessing -import os -import socket -from concurrent import futures -from collections.abc import Iterator - -import grpc -from google.protobuf import empty_pb2 as _empty_pb2 -from google.protobuf import timestamp_pb2 as _timestamp_pb2 - -from pynumaflow import setup_logging -from pynumaflow._constants import ( - MAX_MESSAGE_SIZE, -) -from pynumaflow._constants import MULTIPROC_MAP_SOCK_ADDR -from pynumaflow.exceptions import SocketError -from pynumaflow.info.server import ( - get_sdk_version, - write as info_server_write, - get_metadata_env, -) -from pynumaflow.info.types import ( - ServerInfo, - Protocol, - Language, - SERVER_INFO_FILE_PATH, - METADATA_ENVS, -) -from pynumaflow.sourcetransformer import Datum -from pynumaflow.sourcetransformer._dtypes import SourceTransformCallable -from pynumaflow.sourcetransformer.proto import transform_pb2 -from pynumaflow.sourcetransformer.proto import transform_pb2_grpc -from pynumaflow.types import NumaflowServicerContext - -_LOGGER = setup_logging(__name__) -if os.getenv("PYTHONDEBUG"): - _LOGGER.setLevel(logging.DEBUG) - - -class MultiProcSourceTransformer(transform_pb2_grpc.SourceTransformServicer): - """ - Provides an interface to write a Multi-Processor Source Transformer - which 
will be exposed over gRPC. - - Args: - - handler: Function callable following the type signature of SourceTransformCallable - max_message_size: The max message size in bytes the server can receive and send - - Example invocation: - >>> from typing import Iterator - >>> from pynumaflow.sourcetransformer import Messages, Message \ - ... Datum, MultiProcSourceTransformer - >>> def transform_handler(key: [str], datum: Datum) -> Messages: - ... val = datum.value - ... new_event_time = datetime.time() - ... _ = datum.watermark - ... message_t_s = Messages(Message(val, event_time=new_event_time, keys=key)) - ... return message_t_s - ... - ... - >>> grpc_server = MultiProcSourceTransformer(handler=transform_handler) - >>> grpc_server.start() - """ - - def __init__( - self, - handler: SourceTransformCallable, - max_message_size=MAX_MESSAGE_SIZE, - ): - self.__transform_handler: SourceTransformCallable = handler - self._max_message_size = max_message_size - - self._server_options = [ - ("grpc.max_send_message_length", self._max_message_size), - ("grpc.max_receive_message_length", self._max_message_size), - ("grpc.so_reuseport", 1), - ("grpc.so_reuseaddr", 1), - ] - # Set the number of processes to be spawned to the number of CPUs or the value - # of the env var NUM_CPU_MULTIPROC defined by the user - # Setting the max value to 2 * CPU count - self._process_count = min( - int(os.getenv("NUM_CPU_MULTIPROC", str(os.cpu_count()))), 2 * os.cpu_count() - ) - self._threads_per_proc = int(os.getenv("MAX_THREADS", "4")) - - def SourceTransformFn( - self, request: transform_pb2.SourceTransformRequest, context: NumaflowServicerContext - ) -> transform_pb2.SourceTransformResponse: - """ - Applies a function to each datum element. - The pascal case function name comes from the generated transform_pb2_grpc.py file. - """ - - # proto repeated field(keys) is of type google._upb._message.RepeatedScalarContainer - # we need to explicitly convert it to list - try: - msgts = self.__transform_handler( - list(request.keys), - Datum( - keys=list(request.keys), - value=request.value, - event_time=request.event_time.ToDatetime(), - watermark=request.watermark.ToDatetime(), - ), - ) - except Exception as err: - _LOGGER.critical("UDFError, re-raising the error", exc_info=True) - context.set_code(grpc.StatusCode.UNKNOWN) - context.set_details(str(err)) - return transform_pb2.SourceTransformResponse(results=[]) - - datums = [] - for msgt in msgts: - event_time_timestamp = _timestamp_pb2.Timestamp() - event_time_timestamp.FromDatetime(dt=msgt.event_time) - datums.append( - transform_pb2.SourceTransformResponse.Result( - keys=list(msgt.keys), - value=msgt.value, - tags=msgt.tags, - event_time=event_time_timestamp, - ) - ) - return transform_pb2.SourceTransformResponse(results=datums) - - def IsReady( - self, request: _empty_pb2.Empty, context: NumaflowServicerContext - ) -> transform_pb2.ReadyResponse: - """ - IsReady is the heartbeat endpoint for gRPC. - The pascal case function name comes from the proto transform_pb2_grpc.py file. 
- """ - return transform_pb2.ReadyResponse(ready=True) - - def _run_server(self, bind_address): - """Start a server in a subprocess.""" - _LOGGER.info("Starting new server.") - server = grpc.server( - futures.ThreadPoolExecutor( - max_workers=self._threads_per_proc, - ), - options=self._server_options, - ) - transform_pb2_grpc.add_SourceTransformServicer_to_server(self, server) - server.add_insecure_port(bind_address) - server.start() - _LOGGER.info("GRPC Multi-Processor Server listening on: %s %d", bind_address, os.getpid()) - server.wait_for_termination() - - @contextlib.contextmanager - def _reserve_port(self, port_num: int) -> Iterator[int]: - """Find and reserve a port for all subprocesses to use.""" - sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) - sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1) - if sock.getsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR) == 0: - raise SocketError("Failed to set SO_REUSEADDR.") - try: - sock.bind(("", port_num)) - yield sock.getsockname()[1] - finally: - sock.close() - - def start(self) -> None: - """ - Start N grpc servers in different processes where N = The number of CPUs or the - value of the env var NUM_CPU_MULTIPROC defined by the user. The max value - is set to 2 * CPU count. - Each server will be bound to a different port, and we will create equal number of - workers to handle each server. - On the client side there will be same number of connections as the number of servers. - """ - workers = [] - server_ports = [] - for _ in range(self._process_count): - # Find a port to bind to for each server, thus sending the port number = 0 - # to the _reserve_port function so that kernel can find and return a free port - with self._reserve_port(0) as port: - bind_address = f"{MULTIPROC_MAP_SOCK_ADDR}:{port}" - _LOGGER.info("Starting server on port: %s", port) - # NOTE: It is imperative that the worker subprocesses be forked before - # any gRPC servers start up. See - # https://github.com/grpc/grpc/issues/16001 for more details. 
-            worker = multiprocessing.Process(target=self._run_server, args=(bind_address,))
-            worker.start()
-            workers.append(worker)
-            server_ports.append(port)
-
-        # Convert the available ports to a comma separated string
-        ports = ",".join(map(str, server_ports))
-
-        serv_info = ServerInfo(
-            protocol=Protocol.TCP,
-            language=Language.PYTHON,
-            version=get_sdk_version(),
-            metadata=get_metadata_env(envs=METADATA_ENVS),
-        )
-        # Add the PORTS metadata using the available ports
-        serv_info.metadata["SERV_PORTS"] = ports
-        info_server_write(server_info=serv_info, info_file=SERVER_INFO_FILE_PATH)
-
-        for worker in workers:
-            worker.join()
diff --git a/pynumaflow/mapper/map.py b/pynumaflow/mapper/map.py
index b2546170..11de212a 100644
--- a/pynumaflow/mapper/map.py
+++ b/pynumaflow/mapper/map.py
@@ -81,6 +81,7 @@ def start(self) -> None:
         elif self.server_type == ServerType.Multiproc:
             self.exec_multiproc()
         else:
+            _LOGGER.error("Server type not supported - %s", str(self.server_type))
             raise NotImplementedError

     def exec(self):
diff --git a/pynumaflow/mapstreamer/async_server.py b/pynumaflow/mapstreamer/async_server.py
index 0ecb4de7..0639b4a0 100644
--- a/pynumaflow/mapstreamer/async_server.py
+++ b/pynumaflow/mapstreamer/async_server.py
@@ -9,8 +9,7 @@ from pynumaflow import setup_logging
 from pynumaflow.mapstreamer import Datum
 from pynumaflow.mapstreamer._dtypes import MapStreamCallable
-from pynumaflow.mapstreamer.proto import mapstream_pb2
-from pynumaflow.mapstreamer.proto import mapstream_pb2_grpc
+from pynumaflow.proto.mapstreamer import mapstream_pb2_grpc, mapstream_pb2
 from pynumaflow.types import NumaflowServicerContext

 _LOGGER = setup_logging(__name__)
diff --git a/pynumaflow/mapstreamer/mapstream.py b/pynumaflow/mapstreamer/mapstream.py
index 9fd318c3..71d16b92 100644
--- a/pynumaflow/mapstreamer/mapstream.py
+++ b/pynumaflow/mapstreamer/mapstream.py
@@ -4,7 +4,7 @@ import grpc
 from pynumaflow.mapstreamer.async_server import AsyncMapStreamer
-from pynumaflow.mapstreamer.proto import mapstream_pb2_grpc
+from pynumaflow.proto.mapstreamer import mapstream_pb2_grpc

 from pynumaflow._constants import (
     MAP_STREAM_SOCK_PATH,
diff --git a/pynumaflow-old/sourcetransformer/proto/__init__.py b/pynumaflow/proto/mapstreamer/__init__.py
similarity index 100%
rename from pynumaflow-old/sourcetransformer/proto/__init__.py
rename to pynumaflow/proto/mapstreamer/__init__.py
diff --git a/pynumaflow/mapstreamer/proto/mapstream.proto b/pynumaflow/proto/mapstreamer/mapstream.proto
similarity index 100%
rename from pynumaflow/mapstreamer/proto/mapstream.proto
rename to pynumaflow/proto/mapstreamer/mapstream.proto
diff --git a/pynumaflow/mapstreamer/proto/mapstream_pb2.py b/pynumaflow/proto/mapstreamer/mapstream_pb2.py
similarity index 100%
rename from pynumaflow/mapstreamer/proto/mapstream_pb2.py
rename to pynumaflow/proto/mapstreamer/mapstream_pb2.py
diff --git a/pynumaflow/mapstreamer/proto/mapstream_pb2_grpc.py b/pynumaflow/proto/mapstreamer/mapstream_pb2_grpc.py
similarity index 98%
rename from pynumaflow/mapstreamer/proto/mapstream_pb2_grpc.py
rename to pynumaflow/proto/mapstreamer/mapstream_pb2_grpc.py
index 305c8e05..3f2b7901 100644
--- a/pynumaflow/mapstreamer/proto/mapstream_pb2_grpc.py
+++ b/pynumaflow/proto/mapstreamer/mapstream_pb2_grpc.py
@@ -3,7 +3,7 @@ import grpc
 from google.protobuf import empty_pb2 as google_dot_protobuf_dot_empty__pb2
-from .
import mapstream_pb2 as mapstream__pb2 +from pynumaflow.proto.mapstreamer import mapstream_pb2 as mapstream__pb2 class MapStreamStub(object): diff --git a/pynumaflow/mapstreamer/proto/__init__.py b/pynumaflow/proto/reducer/__init__.py similarity index 100% rename from pynumaflow/mapstreamer/proto/__init__.py rename to pynumaflow/proto/reducer/__init__.py diff --git a/pynumaflow/reducer/proto/reduce.proto b/pynumaflow/proto/reducer/reduce.proto similarity index 100% rename from pynumaflow/reducer/proto/reduce.proto rename to pynumaflow/proto/reducer/reduce.proto diff --git a/pynumaflow/reducer/proto/reduce_pb2.py b/pynumaflow/proto/reducer/reduce_pb2.py similarity index 100% rename from pynumaflow/reducer/proto/reduce_pb2.py rename to pynumaflow/proto/reducer/reduce_pb2.py diff --git a/pynumaflow/reducer/proto/reduce_pb2_grpc.py b/pynumaflow/proto/reducer/reduce_pb2_grpc.py similarity index 98% rename from pynumaflow/reducer/proto/reduce_pb2_grpc.py rename to pynumaflow/proto/reducer/reduce_pb2_grpc.py index 5a0a15f6..1e36317e 100644 --- a/pynumaflow/reducer/proto/reduce_pb2_grpc.py +++ b/pynumaflow/proto/reducer/reduce_pb2_grpc.py @@ -3,7 +3,7 @@ import grpc from google.protobuf import empty_pb2 as google_dot_protobuf_dot_empty__pb2 -from . import reduce_pb2 as reduce__pb2 +from pynumaflow.proto.reducer import reduce_pb2 as reduce__pb2 class ReduceStub(object): diff --git a/pynumaflow/reducer/proto/__init__.py b/pynumaflow/proto/sideinput/__init__.py similarity index 100% rename from pynumaflow/reducer/proto/__init__.py rename to pynumaflow/proto/sideinput/__init__.py diff --git a/pynumaflow/sideinput/proto/sideinput.proto b/pynumaflow/proto/sideinput/sideinput.proto similarity index 100% rename from pynumaflow/sideinput/proto/sideinput.proto rename to pynumaflow/proto/sideinput/sideinput.proto diff --git a/pynumaflow/sideinput/proto/sideinput_pb2.py b/pynumaflow/proto/sideinput/sideinput_pb2.py similarity index 100% rename from pynumaflow/sideinput/proto/sideinput_pb2.py rename to pynumaflow/proto/sideinput/sideinput_pb2.py diff --git a/pynumaflow/sideinput/proto/sideinput_pb2_grpc.py b/pynumaflow/proto/sideinput/sideinput_pb2_grpc.py similarity index 98% rename from pynumaflow/sideinput/proto/sideinput_pb2_grpc.py rename to pynumaflow/proto/sideinput/sideinput_pb2_grpc.py index 72ea87ed..abcd0b79 100644 --- a/pynumaflow/sideinput/proto/sideinput_pb2_grpc.py +++ b/pynumaflow/proto/sideinput/sideinput_pb2_grpc.py @@ -3,7 +3,7 @@ import grpc from google.protobuf import empty_pb2 as google_dot_protobuf_dot_empty__pb2 -from . 
import sideinput_pb2 as sideinput__pb2 +from pynumaflow.proto.sideinput import sideinput_pb2 as sideinput__pb2 class SideInputStub(object): diff --git a/pynumaflow/sideinput/proto/__init__.py b/pynumaflow/proto/sourcetransformer/__init__.py similarity index 100% rename from pynumaflow/sideinput/proto/__init__.py rename to pynumaflow/proto/sourcetransformer/__init__.py diff --git a/pynumaflow-old/sourcetransformer/proto/transform.proto b/pynumaflow/proto/sourcetransformer/transform.proto similarity index 100% rename from pynumaflow-old/sourcetransformer/proto/transform.proto rename to pynumaflow/proto/sourcetransformer/transform.proto diff --git a/pynumaflow-old/sourcetransformer/proto/transform_pb2.py b/pynumaflow/proto/sourcetransformer/transform_pb2.py similarity index 100% rename from pynumaflow-old/sourcetransformer/proto/transform_pb2.py rename to pynumaflow/proto/sourcetransformer/transform_pb2.py diff --git a/pynumaflow-old/sourcetransformer/proto/transform_pb2_grpc.py b/pynumaflow/proto/sourcetransformer/transform_pb2_grpc.py similarity index 100% rename from pynumaflow-old/sourcetransformer/proto/transform_pb2_grpc.py rename to pynumaflow/proto/sourcetransformer/transform_pb2_grpc.py diff --git a/pynumaflow/reducer/async_server.py b/pynumaflow/reducer/async_server.py index 7b8c6e33..0a6d5b3b 100644 --- a/pynumaflow/reducer/async_server.py +++ b/pynumaflow/reducer/async_server.py @@ -19,8 +19,7 @@ from pynumaflow.reducer._dtypes import Datum, IntervalWindow, Metadata from pynumaflow.reducer._dtypes import ReduceResult, ReduceCallable from pynumaflow.reducer.asynciter import NonBlockingIterator -from pynumaflow.reducer.proto import reduce_pb2 -from pynumaflow.reducer.proto import reduce_pb2_grpc +from pynumaflow.proto.reducer import reduce_pb2, reduce_pb2_grpc from pynumaflow.types import NumaflowServicerContext _LOGGER = setup_logging(__name__) diff --git a/pynumaflow/reducer/reduce.py b/pynumaflow/reducer/reduce.py index 3b2ca06b..1b3e1711 100644 --- a/pynumaflow/reducer/reduce.py +++ b/pynumaflow/reducer/reduce.py @@ -1,7 +1,7 @@ import aiorun import grpc -from pynumaflow.reducer.proto import reduce_pb2_grpc +from pynumaflow.proto.reducer import reduce_pb2_grpc from pynumaflow.reducer.async_server import AsyncReducer diff --git a/pynumaflow/shared/server.py b/pynumaflow/shared/server.py index a7832831..899961b2 100644 --- a/pynumaflow/shared/server.py +++ b/pynumaflow/shared/server.py @@ -23,6 +23,7 @@ ) from pynumaflow.proto.mapper import map_pb2_grpc from pynumaflow.proto.sinker import sink_pb2_grpc +from pynumaflow.proto.sourcetransformer import transform_pb2_grpc class NumaflowServer: @@ -126,6 +127,8 @@ def _run_server( map_pb2_grpc.add_MapServicer_to_server(servicer, server) elif udf_type == UDFType.Sink: sink_pb2_grpc.add_SinkServicer_to_server(servicer, server) + elif udf_type == UDFType.SourceTransformer: + transform_pb2_grpc.add_SourceTransformServicer_to_server(servicer, server) server.add_insecure_port(bind_address) server.start() diff --git a/pynumaflow/sideinput/server.py b/pynumaflow/sideinput/server.py index d786f0d7..7d700a61 100644 --- a/pynumaflow/sideinput/server.py +++ b/pynumaflow/sideinput/server.py @@ -13,7 +13,7 @@ SIDE_INPUT_SOCK_PATH, ) from pynumaflow.sideinput import Response -from pynumaflow.sideinput.proto import sideinput_pb2, sideinput_pb2_grpc +from pynumaflow.proto.sideinput import sideinput_pb2_grpc, sideinput_pb2 from pynumaflow.types import NumaflowServicerContext _LOGGER = setup_logging(__name__) diff --git 
a/pynumaflow/sourcetransformer/__init__.py b/pynumaflow/sourcetransformer/__init__.py
new file mode 100644
index 00000000..f636e458
--- /dev/null
+++ b/pynumaflow/sourcetransformer/__init__.py
@@ -0,0 +1,20 @@
+from pynumaflow._constants import ServerType
+
+from pynumaflow.sourcetransformer._dtypes import (
+    Message,
+    Messages,
+    Datum,
+    DROP,
+    SourceTransformerClass,
+)
+from pynumaflow.sourcetransformer.sourcetransform import SourceTransformServer
+
+__all__ = [
+    "Message",
+    "Messages",
+    "Datum",
+    "DROP",
+    "SourceTransformServer",
+    "SourceTransformerClass",
+    "ServerType",
+]
diff --git a/pynumaflow-old/sourcetransformer/_dtypes.py b/pynumaflow/sourcetransformer/_dtypes.py
similarity index 85%
rename from pynumaflow-old/sourcetransformer/_dtypes.py
rename to pynumaflow/sourcetransformer/_dtypes.py
index b3242cd2..2cb2987f 100644
--- a/pynumaflow-old/sourcetransformer/_dtypes.py
+++ b/pynumaflow/sourcetransformer/_dtypes.py
@@ -1,7 +1,8 @@
+from abc import ABCMeta, abstractmethod
 from collections.abc import Iterator, Sequence
 from dataclasses import dataclass
 from datetime import datetime
-from typing import TypeVar, Callable
+from typing import TypeVar, Callable, Union
 from warnings import warn

 from pynumaflow._constants import DROP
@@ -172,4 +173,28 @@ def watermark(self) -> datetime:
         return self._watermark


-SourceTransformCallable = Callable[[list[str], Datum], Messages]
+class SourceTransformerClass(metaclass=ABCMeta):
+    """
+    Provides an interface to write a Source Transformer
+    which will be exposed over a gRPC server.
+    """
+
+    def __call__(self, *args, **kwargs):
+        """
+        Allows the handler function to be called directly when a class instance is used.
+        """
+        return self.handler(*args, **kwargs)
+
+    @abstractmethod
+    def handler(self, keys: list[str], datum: Datum) -> Messages:
+        """
+        Write a handler function which implements the SourceTransformHandler interface.
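+
+        A minimal sketch of a concrete transformer (the subclass name is
+        hypothetical; Message, Messages and Datum are the types defined in
+        this module):
+
+        >>> class MyTransformer(SourceTransformerClass):
+        ...     def handler(self, keys: list[str], datum: Datum) -> Messages:
+        ...         # forward the payload unchanged, reusing the datum's event time
+        ...         return Messages(Message(datum.value, event_time=datum.event_time, keys=keys))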
+ """ + pass + + +SourceTransformHandler = Callable[[list[str], Datum], Messages] +SourceTransformCallable = Union[SourceTransformHandler, SourceTransformerClass] diff --git a/pynumaflow-old/sourcetransformer/server.py b/pynumaflow/sourcetransformer/server.py similarity index 66% rename from pynumaflow-old/sourcetransformer/server.py rename to pynumaflow/sourcetransformer/server.py index 5320668e..cd9c39dd 100644 --- a/pynumaflow-old/sourcetransformer/server.py +++ b/pynumaflow/sourcetransformer/server.py @@ -1,32 +1,21 @@ import logging -import multiprocessing import os -from concurrent.futures import ThreadPoolExecutor import grpc from google.protobuf import empty_pb2 as _empty_pb2 from google.protobuf import timestamp_pb2 as _timestamp_pb2 from pynumaflow import setup_logging -from pynumaflow._constants import ( - SOURCE_TRANSFORMER_SOCK_PATH, - MAX_MESSAGE_SIZE, -) -from pynumaflow.info.server import get_sdk_version, write as info_server_write -from pynumaflow.info.types import ServerInfo, Protocol, Language, SERVER_INFO_FILE_PATH from pynumaflow.sourcetransformer import Datum from pynumaflow.sourcetransformer._dtypes import SourceTransformCallable -from pynumaflow.sourcetransformer.proto import transform_pb2 -from pynumaflow.sourcetransformer.proto import transform_pb2_grpc +from pynumaflow.proto.sourcetransformer import transform_pb2 +from pynumaflow.proto.sourcetransformer import transform_pb2_grpc from pynumaflow.types import NumaflowServicerContext _LOGGER = setup_logging(__name__) if os.getenv("PYTHONDEBUG"): _LOGGER.setLevel(logging.DEBUG) -_PROCESS_COUNT = multiprocessing.cpu_count() -MAX_THREADS = int(os.getenv("MAX_THREADS", 0)) or (_PROCESS_COUNT * 4) - class SourceTransformer(transform_pb2_grpc.SourceTransformServicer): """ @@ -58,19 +47,8 @@ class SourceTransformer(transform_pb2_grpc.SourceTransformServicer): def __init__( self, handler: SourceTransformCallable, - sock_path=SOURCE_TRANSFORMER_SOCK_PATH, - max_message_size=MAX_MESSAGE_SIZE, - max_threads=MAX_THREADS, ): self.__transform_handler: SourceTransformCallable = handler - self.sock_path = f"unix://{sock_path}" - self._max_message_size = max_message_size - self._max_threads = max_threads - - self._server_options = [ - ("grpc.max_send_message_length", self._max_message_size), - ("grpc.max_receive_message_length", self._max_message_size), - ] def SourceTransformFn( self, request: transform_pb2.SourceTransformRequest, context: NumaflowServicerContext @@ -120,24 +98,3 @@ def IsReady( The pascal case function name comes from the proto transform_pb2_grpc.py file. """ return transform_pb2.ReadyResponse(ready=True) - - def start(self) -> None: - """ - Starts the gRPC server on the given UNIX socket with given max threads. 
- """ - server = grpc.server( - ThreadPoolExecutor(max_workers=self._max_threads), options=self._server_options - ) - transform_pb2_grpc.add_SourceTransformServicer_to_server(self, server) - server.add_insecure_port(self.sock_path) - server.start() - serv_info = ServerInfo( - protocol=Protocol.UDS, - language=Language.PYTHON, - version=get_sdk_version(), - ) - info_server_write(server_info=serv_info, info_file=SERVER_INFO_FILE_PATH) - _LOGGER.info( - "GRPC Server listening on: %s with max threads: %s", self.sock_path, self._max_threads - ) - server.wait_for_termination() diff --git a/pynumaflow/sourcetransformer/sourcetransform.py b/pynumaflow/sourcetransformer/sourcetransform.py new file mode 100644 index 00000000..97734532 --- /dev/null +++ b/pynumaflow/sourcetransformer/sourcetransform.py @@ -0,0 +1,121 @@ +import os + +from pynumaflow.sourcetransformer.server import SourceTransformer + +from pynumaflow.shared.server import sync_server_start, start_multiproc_server + +from pynumaflow._constants import ( + MAX_MESSAGE_SIZE, + SOURCE_TRANSFORMER_SOCK_PATH, + MAX_THREADS, + ServerType, + _LOGGER, + UDFType, +) + +from pynumaflow.sourcetransformer._dtypes import SourceTransformCallable + +from pynumaflow.shared import NumaflowServer + + +class SourceTransformServer(NumaflowServer): + """ """ + + def __init__( + self, + source_transform_instance: SourceTransformCallable, + sock_path=SOURCE_TRANSFORMER_SOCK_PATH, + max_message_size=MAX_MESSAGE_SIZE, + max_threads=MAX_THREADS, + server_type=ServerType.Sync, + ): + """ + Create a new grpc Server instance. + A new servicer instance is created and attached to the server. + The server instance is returned. + + max_message_size: The max message size in bytes the server can receive and send + max_threads: The max number of threads to be spawned; + defaults to number of processors x4 + """ + self.sock_path = f"unix://{sock_path}" + self.max_threads = min(max_threads, int(os.getenv("MAX_THREADS", "4"))) + self.max_message_size = max_message_size + + self.source_transform_instance = source_transform_instance + self.server_type = server_type + + self._server_options = [ + ("grpc.max_send_message_length", self.max_message_size), + ("grpc.max_receive_message_length", self.max_message_size), + ] + if server_type == ServerType.Multiproc: + self._server_options.append(("grpc.so_reuseport", 1)) + self._server_options.append(("grpc.so_reuseaddr", 1)) + + # Set the number of processes to be spawned to the number of CPUs or + # the value of the env var NUM_CPU_MULTIPROC defined by the user + # Setting the max value to 2 * CPU count + # Used for multiproc server + self._process_count = min( + int(os.getenv("NUM_CPU_MULTIPROC", str(os.cpu_count()))), 2 * os.cpu_count() + ) + + def start(self): + """ + Starts the gRPC server on the given UNIX socket with given max threads. + """ + if self.server_type == ServerType.Sync: + self.exec() + elif self.server_type == ServerType.Multiproc: + self.exec_multiproc() + else: + _LOGGER.error("Server type not supported", self.server_type) + raise NotImplementedError + + def exec(self): + """ + Starts the Synchronous gRPC server on the given UNIX socket with given max threads. 
+ """ + transform_servicer = self.get_servicer( + source_transform_instance=self.source_transform_instance, server_type=self.server_type + ) + _LOGGER.info( + "Sync GRPC Server listening on: %s with max threads: %s", + self.sock_path, + self.max_threads, + ) + + sync_server_start( + servicer=transform_servicer, + bind_address=self.sock_path, + max_threads=self.max_threads, + server_options=self._server_options, + udf_type=UDFType.SourceTransformer, + ) + + def exec_multiproc(self): + """ + Starts the Multiproc gRPC server on the given UNIX socket with given max threads. + """ + transform_servicer = self.get_servicer( + source_transform_instance=self.source_transform_instance, server_type=self.server_type + ) + start_multiproc_server( + max_threads=self.max_threads, + servicer=transform_servicer, + process_count=self._process_count, + server_options=self._server_options, + udf_type=UDFType.Map, + ) + + def get_servicer( + self, source_transform_instance: SourceTransformCallable, server_type: ServerType + ): + if server_type == ServerType.Sync: + transform_servicer = SourceTransformer(handler=source_transform_instance) + elif server_type == ServerType.Multiproc: + transform_servicer = SourceTransformer(handler=source_transform_instance) + else: + raise NotImplementedError + return transform_servicer From 9e268f8d60ba32db98ac11920bfd0a2772b1d0a3 Mon Sep 17 00:00:00 2001 From: Sidhant Kohli Date: Tue, 9 Jan 2024 14:28:20 -0800 Subject: [PATCH 35/78] source Signed-off-by: Sidhant Kohli --- pynumaflow/proto/sourcer/__init__.py | 0 pynumaflow/proto/sourcer/source.proto | 148 +++++++++++ pynumaflow/proto/sourcer/source_pb2.py | 58 +++++ pynumaflow/proto/sourcer/source_pb2_grpc.py | 266 +++++++++++++++++++ pynumaflow/shared/server.py | 7 + pynumaflow/sourcer/__init__.py | 23 ++ pynumaflow/sourcer/_dtypes.py | 267 ++++++++++++++++++++ pynumaflow/sourcer/async_server.py | 170 +++++++++++++ pynumaflow/sourcer/server.py | 175 +++++++++++++ pynumaflow/sourcer/source.py | 105 ++++++++ 10 files changed, 1219 insertions(+) create mode 100644 pynumaflow/proto/sourcer/__init__.py create mode 100644 pynumaflow/proto/sourcer/source.proto create mode 100644 pynumaflow/proto/sourcer/source_pb2.py create mode 100644 pynumaflow/proto/sourcer/source_pb2_grpc.py create mode 100644 pynumaflow/sourcer/__init__.py create mode 100644 pynumaflow/sourcer/_dtypes.py create mode 100644 pynumaflow/sourcer/async_server.py create mode 100644 pynumaflow/sourcer/server.py create mode 100644 pynumaflow/sourcer/source.py diff --git a/pynumaflow/proto/sourcer/__init__.py b/pynumaflow/proto/sourcer/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/pynumaflow/proto/sourcer/source.proto b/pynumaflow/proto/sourcer/source.proto new file mode 100644 index 00000000..87c6ff72 --- /dev/null +++ b/pynumaflow/proto/sourcer/source.proto @@ -0,0 +1,148 @@ +syntax = "proto3"; +import "google/protobuf/timestamp.proto"; +import "google/protobuf/empty.proto"; + +package source.v1; + +service Source { + // Read returns a stream of datum responses. + // The size of the returned ReadResponse is less than or equal to the num_records specified in ReadRequest. + // If the request timeout is reached on server side, the returned ReadResponse will contain all the datum that have been read (which could be an empty list). + rpc ReadFn(ReadRequest) returns (stream ReadResponse); + + // AckFn acknowledges a list of datum offsets. 
+ // When AckFn is called, it implicitly indicates that the datum stream has been processed by the source vertex. + // The caller (numa) expects the AckFn to be successful, and it does not expect any errors. + // If there are some irrecoverable errors when the callee (UDSource) is processing the AckFn request, + // then it is best to crash because there are no other retry mechanisms possible. + rpc AckFn(AckRequest) returns (AckResponse); + + // PendingFn returns the number of pending records at the user defined source. + rpc PendingFn(google.protobuf.Empty) returns (PendingResponse); + + // PartitionsFn returns the list of partitions for the user defined source. + rpc PartitionsFn(google.protobuf.Empty) returns (PartitionsResponse); + + // IsReady is the heartbeat endpoint for user defined source gRPC. + rpc IsReady(google.protobuf.Empty) returns (ReadyResponse); +} + +/* + * ReadRequest is the request for reading datum stream from user defined source. + */ +message ReadRequest { + message Request { + // Required field indicating the number of records to read. + uint64 num_records = 1; + // Required field indicating the request timeout in milliseconds. + // uint32 can represent 2^32 milliseconds, which is about 49 days. + // We don't use uint64 because time.Duration takes int64 as nano seconds. Using uint64 for milli will cause overflow. + uint32 timeout_in_ms = 2; + } + // Required field indicating the request. + Request request = 1; +} + +/* + * ReadResponse is the response for reading datum stream from user defined source. + */ +message ReadResponse { + message Result { + // Required field holding the payload of the datum. + bytes payload = 1; + // Required field indicating the offset information of the datum. + Offset offset = 2; + // Required field representing the time associated with each datum. It is used for watermarking. + google.protobuf.Timestamp event_time = 3; + // Optional list of keys associated with the datum. + // Key is the "key" attribute in (key,value) as in the map-reduce paradigm. + // We add this optional field to support the use case where the user defined source can provide keys for the datum. + // e.g. Kafka and Redis Stream message usually include information about the keys. + repeated string keys = 4; + } + // Required field holding the result. + Result result = 1; +} + +/* + * AckRequest is the request for acknowledging datum. + * It takes a list of offsets to be acknowledged. + */ +message AckRequest { + message Request { + // Required field holding a list of offsets to be acknowledged. + // The offsets must be strictly corresponding to the previously read batch, + // meaning the offsets must be in the same order as the datum responses in the ReadResponse. + // By enforcing ordering, we can save deserialization effort on the server side, assuming the server keeps a local copy of the raw/un-serialized offsets. + repeated Offset offsets = 1; + } + // Required field holding the request. The list will be ordered and will have the same order as the original Read response. + Request request = 1; +} + +/* + * AckResponse is the response for acknowledging datum. It contains one empty field confirming + * the batch of offsets that have been successfully acknowledged. The contract between client and server + * is that the server will only return the AckResponse if the ack request is successful. + * If the server hangs during the ack request, the client can decide to timeout and error out the data forwarder. 
+ * The reason why we define such contract is that we always expect the server to be able to process the ack request. + * Client is expected to send the AckRequest to the server with offsets that are strictly + * corresponding to the previously read batch. If the client sends the AckRequest with offsets that are not, + * it is considered as a client error and the server will not return the AckResponse. + */ +message AckResponse { + message Result { + // Required field indicating the ack request is successful. + google.protobuf.Empty success = 1; + } + // Required field holding the result. + Result result = 1; +} + +/* + * ReadyResponse is the health check result for user defined source. + */ +message ReadyResponse { + // Required field holding the health check result. + bool ready = 1; +} + +/* + * PendingResponse is the response for the pending request. + */ +message PendingResponse { + message Result { + // Required field holding the number of pending records at the user defined source. + // A negative count indicates that the pending information is not available. + int64 count = 1; + } + // Required field holding the result. + Result result = 1; +} + +/* + * PartitionsResponse is the response for the partitions request. + */ +message PartitionsResponse { + message Result { + // Required field holding the list of partitions. + repeated int32 partitions = 1; + } + // Required field holding the result. + Result result = 1; +} + +/* + * Offset is the offset of the datum. + */ +message Offset { + // offset is the offset of the datum. This field is required. + // We define Offset as a byte array because different input data sources can have different representations for Offset. + // The only way to generalize it is to define it as a byte array, + // Such that we can let the UDSource to de-serialize the offset using its own interpretation logics. + bytes offset = 1; + // Optional partition_id indicates which partition of the source the datum belongs to. + // It is useful for sources that have multiple partitions. e.g. Kafka. + // If the partition_id is not specified, it is assumed that the source has a single partition. + int32 partition_id = 2; +} \ No newline at end of file diff --git a/pynumaflow/proto/sourcer/source_pb2.py b/pynumaflow/proto/sourcer/source_pb2.py new file mode 100644 index 00000000..73c282e1 --- /dev/null +++ b/pynumaflow/proto/sourcer/source_pb2.py @@ -0,0 +1,58 @@ +# -*- coding: utf-8 -*- +# Generated by the protocol buffer compiler. DO NOT EDIT! 
+# source: source.proto +"""Generated protocol buffer code.""" +from google.protobuf import descriptor as _descriptor +from google.protobuf import descriptor_pool as _descriptor_pool +from google.protobuf import symbol_database as _symbol_database +from google.protobuf.internal import builder as _builder + +# @@protoc_insertion_point(imports) + +_sym_db = _symbol_database.Default() + + +from google.protobuf import timestamp_pb2 as google_dot_protobuf_dot_timestamp__pb2 +from google.protobuf import empty_pb2 as google_dot_protobuf_dot_empty__pb2 + + +DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile( + b'\n\x0csource.proto\x12\tsource.v1\x1a\x1fgoogle/protobuf/timestamp.proto\x1a\x1bgoogle/protobuf/empty.proto"u\n\x0bReadRequest\x12/\n\x07request\x18\x01 \x01(\x0b\x32\x1e.source.v1.ReadRequest.Request\x1a\x35\n\x07Request\x12\x13\n\x0bnum_records\x18\x01 \x01(\x04\x12\x15\n\rtimeout_in_ms\x18\x02 \x01(\r"\xba\x01\n\x0cReadResponse\x12.\n\x06result\x18\x01 \x01(\x0b\x32\x1e.source.v1.ReadResponse.Result\x1az\n\x06Result\x12\x0f\n\x07payload\x18\x01 \x01(\x0c\x12!\n\x06offset\x18\x02 \x01(\x0b\x32\x11.source.v1.Offset\x12.\n\nevent_time\x18\x03 \x01(\x0b\x32\x1a.google.protobuf.Timestamp\x12\x0c\n\x04keys\x18\x04 \x03(\t"k\n\nAckRequest\x12.\n\x07request\x18\x01 \x01(\x0b\x32\x1d.source.v1.AckRequest.Request\x1a-\n\x07Request\x12"\n\x07offsets\x18\x01 \x03(\x0b\x32\x11.source.v1.Offset"o\n\x0b\x41\x63kResponse\x12-\n\x06result\x18\x01 \x01(\x0b\x32\x1d.source.v1.AckResponse.Result\x1a\x31\n\x06Result\x12\'\n\x07success\x18\x01 \x01(\x0b\x32\x16.google.protobuf.Empty"\x1e\n\rReadyResponse\x12\r\n\x05ready\x18\x01 \x01(\x08"]\n\x0fPendingResponse\x12\x31\n\x06result\x18\x01 \x01(\x0b\x32!.source.v1.PendingResponse.Result\x1a\x17\n\x06Result\x12\r\n\x05\x63ount\x18\x01 \x01(\x03"h\n\x12PartitionsResponse\x12\x34\n\x06result\x18\x01 \x01(\x0b\x32$.source.v1.PartitionsResponse.Result\x1a\x1c\n\x06Result\x12\x12\n\npartitions\x18\x01 \x03(\x05".\n\x06Offset\x12\x0e\n\x06offset\x18\x01 \x01(\x0c\x12\x14\n\x0cpartition_id\x18\x02 \x01(\x05\x32\xc2\x02\n\x06Source\x12;\n\x06ReadFn\x12\x16.source.v1.ReadRequest\x1a\x17.source.v1.ReadResponse0\x01\x12\x36\n\x05\x41\x63kFn\x12\x15.source.v1.AckRequest\x1a\x16.source.v1.AckResponse\x12?\n\tPendingFn\x12\x16.google.protobuf.Empty\x1a\x1a.source.v1.PendingResponse\x12\x45\n\x0cPartitionsFn\x12\x16.google.protobuf.Empty\x1a\x1d.source.v1.PartitionsResponse\x12;\n\x07IsReady\x12\x16.google.protobuf.Empty\x1a\x18.source.v1.ReadyResponseb\x06proto3' +) + +_globals = globals() +_builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, _globals) +_builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, "source_pb2", _globals) +if _descriptor._USE_C_DESCRIPTORS == False: + DESCRIPTOR._options = None + _globals["_READREQUEST"]._serialized_start = 89 + _globals["_READREQUEST"]._serialized_end = 206 + _globals["_READREQUEST_REQUEST"]._serialized_start = 153 + _globals["_READREQUEST_REQUEST"]._serialized_end = 206 + _globals["_READRESPONSE"]._serialized_start = 209 + _globals["_READRESPONSE"]._serialized_end = 395 + _globals["_READRESPONSE_RESULT"]._serialized_start = 273 + _globals["_READRESPONSE_RESULT"]._serialized_end = 395 + _globals["_ACKREQUEST"]._serialized_start = 397 + _globals["_ACKREQUEST"]._serialized_end = 504 + _globals["_ACKREQUEST_REQUEST"]._serialized_start = 459 + _globals["_ACKREQUEST_REQUEST"]._serialized_end = 504 + _globals["_ACKRESPONSE"]._serialized_start = 506 + _globals["_ACKRESPONSE"]._serialized_end = 617 + 
_globals["_ACKRESPONSE_RESULT"]._serialized_start = 568 + _globals["_ACKRESPONSE_RESULT"]._serialized_end = 617 + _globals["_READYRESPONSE"]._serialized_start = 619 + _globals["_READYRESPONSE"]._serialized_end = 649 + _globals["_PENDINGRESPONSE"]._serialized_start = 651 + _globals["_PENDINGRESPONSE"]._serialized_end = 744 + _globals["_PENDINGRESPONSE_RESULT"]._serialized_start = 721 + _globals["_PENDINGRESPONSE_RESULT"]._serialized_end = 744 + _globals["_PARTITIONSRESPONSE"]._serialized_start = 746 + _globals["_PARTITIONSRESPONSE"]._serialized_end = 850 + _globals["_PARTITIONSRESPONSE_RESULT"]._serialized_start = 822 + _globals["_PARTITIONSRESPONSE_RESULT"]._serialized_end = 850 + _globals["_OFFSET"]._serialized_start = 852 + _globals["_OFFSET"]._serialized_end = 898 + _globals["_SOURCE"]._serialized_start = 901 + _globals["_SOURCE"]._serialized_end = 1223 +# @@protoc_insertion_point(module_scope) diff --git a/pynumaflow/proto/sourcer/source_pb2_grpc.py b/pynumaflow/proto/sourcer/source_pb2_grpc.py new file mode 100644 index 00000000..3a132eea --- /dev/null +++ b/pynumaflow/proto/sourcer/source_pb2_grpc.py @@ -0,0 +1,266 @@ +# Generated by the gRPC Python protocol compiler plugin. DO NOT EDIT! +"""Client and server classes corresponding to protobuf-defined services.""" +import grpc + +from google.protobuf import empty_pb2 as google_dot_protobuf_dot_empty__pb2 +from . import source_pb2 as source__pb2 + + +class SourceStub(object): + """Missing associated documentation comment in .proto file.""" + + def __init__(self, channel): + """Constructor. + + Args: + channel: A grpc.Channel. + """ + self.ReadFn = channel.unary_stream( + "/source.v1.Source/ReadFn", + request_serializer=source__pb2.ReadRequest.SerializeToString, + response_deserializer=source__pb2.ReadResponse.FromString, + ) + self.AckFn = channel.unary_unary( + "/source.v1.Source/AckFn", + request_serializer=source__pb2.AckRequest.SerializeToString, + response_deserializer=source__pb2.AckResponse.FromString, + ) + self.PendingFn = channel.unary_unary( + "/source.v1.Source/PendingFn", + request_serializer=google_dot_protobuf_dot_empty__pb2.Empty.SerializeToString, + response_deserializer=source__pb2.PendingResponse.FromString, + ) + self.PartitionsFn = channel.unary_unary( + "/source.v1.Source/PartitionsFn", + request_serializer=google_dot_protobuf_dot_empty__pb2.Empty.SerializeToString, + response_deserializer=source__pb2.PartitionsResponse.FromString, + ) + self.IsReady = channel.unary_unary( + "/source.v1.Source/IsReady", + request_serializer=google_dot_protobuf_dot_empty__pb2.Empty.SerializeToString, + response_deserializer=source__pb2.ReadyResponse.FromString, + ) + + +class SourceServicer(object): + """Missing associated documentation comment in .proto file.""" + + def ReadFn(self, request, context): + """Read returns a stream of datum responses. + The size of the returned ReadResponse is less than or equal to the num_records specified in ReadRequest. + If the request timeout is reached on server side, the returned ReadResponse will contain all the datum that have been read (which could be an empty list). + """ + context.set_code(grpc.StatusCode.UNIMPLEMENTED) + context.set_details("Method not implemented!") + raise NotImplementedError("Method not implemented!") + + def AckFn(self, request, context): + """AckFn acknowledges a list of datum offsets. + When AckFn is called, it implicitly indicates that the datum stream has been processed by the source vertex. 
+ The caller (numa) expects the AckFn to be successful, and it does not expect any errors. + If there are some irrecoverable errors when the callee (UDSource) is processing the AckFn request, + then it is best to crash because there are no other retry mechanisms possible. + """ + context.set_code(grpc.StatusCode.UNIMPLEMENTED) + context.set_details("Method not implemented!") + raise NotImplementedError("Method not implemented!") + + def PendingFn(self, request, context): + """PendingFn returns the number of pending records at the user defined source.""" + context.set_code(grpc.StatusCode.UNIMPLEMENTED) + context.set_details("Method not implemented!") + raise NotImplementedError("Method not implemented!") + + def PartitionsFn(self, request, context): + """PartitionsFn returns the list of partitions for the user defined source.""" + context.set_code(grpc.StatusCode.UNIMPLEMENTED) + context.set_details("Method not implemented!") + raise NotImplementedError("Method not implemented!") + + def IsReady(self, request, context): + """IsReady is the heartbeat endpoint for user defined source gRPC.""" + context.set_code(grpc.StatusCode.UNIMPLEMENTED) + context.set_details("Method not implemented!") + raise NotImplementedError("Method not implemented!") + + +def add_SourceServicer_to_server(servicer, server): + rpc_method_handlers = { + "ReadFn": grpc.unary_stream_rpc_method_handler( + servicer.ReadFn, + request_deserializer=source__pb2.ReadRequest.FromString, + response_serializer=source__pb2.ReadResponse.SerializeToString, + ), + "AckFn": grpc.unary_unary_rpc_method_handler( + servicer.AckFn, + request_deserializer=source__pb2.AckRequest.FromString, + response_serializer=source__pb2.AckResponse.SerializeToString, + ), + "PendingFn": grpc.unary_unary_rpc_method_handler( + servicer.PendingFn, + request_deserializer=google_dot_protobuf_dot_empty__pb2.Empty.FromString, + response_serializer=source__pb2.PendingResponse.SerializeToString, + ), + "PartitionsFn": grpc.unary_unary_rpc_method_handler( + servicer.PartitionsFn, + request_deserializer=google_dot_protobuf_dot_empty__pb2.Empty.FromString, + response_serializer=source__pb2.PartitionsResponse.SerializeToString, + ), + "IsReady": grpc.unary_unary_rpc_method_handler( + servicer.IsReady, + request_deserializer=google_dot_protobuf_dot_empty__pb2.Empty.FromString, + response_serializer=source__pb2.ReadyResponse.SerializeToString, + ), + } + generic_handler = grpc.method_handlers_generic_handler("source.v1.Source", rpc_method_handlers) + server.add_generic_rpc_handlers((generic_handler,)) + + +# This class is part of an EXPERIMENTAL API. 
+class Source(object): + """Missing associated documentation comment in .proto file.""" + + @staticmethod + def ReadFn( + request, + target, + options=(), + channel_credentials=None, + call_credentials=None, + insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None, + ): + return grpc.experimental.unary_stream( + request, + target, + "/source.v1.Source/ReadFn", + source__pb2.ReadRequest.SerializeToString, + source__pb2.ReadResponse.FromString, + options, + channel_credentials, + insecure, + call_credentials, + compression, + wait_for_ready, + timeout, + metadata, + ) + + @staticmethod + def AckFn( + request, + target, + options=(), + channel_credentials=None, + call_credentials=None, + insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None, + ): + return grpc.experimental.unary_unary( + request, + target, + "/source.v1.Source/AckFn", + source__pb2.AckRequest.SerializeToString, + source__pb2.AckResponse.FromString, + options, + channel_credentials, + insecure, + call_credentials, + compression, + wait_for_ready, + timeout, + metadata, + ) + + @staticmethod + def PendingFn( + request, + target, + options=(), + channel_credentials=None, + call_credentials=None, + insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None, + ): + return grpc.experimental.unary_unary( + request, + target, + "/source.v1.Source/PendingFn", + google_dot_protobuf_dot_empty__pb2.Empty.SerializeToString, + source__pb2.PendingResponse.FromString, + options, + channel_credentials, + insecure, + call_credentials, + compression, + wait_for_ready, + timeout, + metadata, + ) + + @staticmethod + def PartitionsFn( + request, + target, + options=(), + channel_credentials=None, + call_credentials=None, + insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None, + ): + return grpc.experimental.unary_unary( + request, + target, + "/source.v1.Source/PartitionsFn", + google_dot_protobuf_dot_empty__pb2.Empty.SerializeToString, + source__pb2.PartitionsResponse.FromString, + options, + channel_credentials, + insecure, + call_credentials, + compression, + wait_for_ready, + timeout, + metadata, + ) + + @staticmethod + def IsReady( + request, + target, + options=(), + channel_credentials=None, + call_credentials=None, + insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None, + ): + return grpc.experimental.unary_unary( + request, + target, + "/source.v1.Source/IsReady", + google_dot_protobuf_dot_empty__pb2.Empty.SerializeToString, + source__pb2.ReadyResponse.FromString, + options, + channel_credentials, + insecure, + call_credentials, + compression, + wait_for_ready, + timeout, + metadata, + ) diff --git a/pynumaflow/shared/server.py b/pynumaflow/shared/server.py index 899961b2..216fdb22 100644 --- a/pynumaflow/shared/server.py +++ b/pynumaflow/shared/server.py @@ -23,6 +23,7 @@ ) from pynumaflow.proto.mapper import map_pb2_grpc from pynumaflow.proto.sinker import sink_pb2_grpc +from pynumaflow.proto.sourcer import source_pb2_grpc from pynumaflow.proto.sourcetransformer import transform_pb2_grpc @@ -123,14 +124,20 @@ def _run_server( ), options=server_options, ) + + # add the correct servicer to the server based on the UDF type if udf_type == UDFType.Map: map_pb2_grpc.add_MapServicer_to_server(servicer, server) elif udf_type == UDFType.Sink: sink_pb2_grpc.add_SinkServicer_to_server(servicer, server) elif udf_type == UDFType.SourceTransformer: 
transform_pb2_grpc.add_SourceTransformServicer_to_server(servicer, server)
+    elif udf_type == UDFType.Source:
+        source_pb2_grpc.add_SourceServicer_to_server(servicer, server)

+    # bind the server to the UDS/TCP socket
     server.add_insecure_port(bind_address)
+    # start the gRPC server
     server.start()

     # Add the server information to the server info file if provided
diff --git a/pynumaflow/sourcer/__init__.py b/pynumaflow/sourcer/__init__.py
new file mode 100644
index 00000000..f846b5f7
--- /dev/null
+++ b/pynumaflow/sourcer/__init__.py
@@ -0,0 +1,23 @@
+from pynumaflow.sourcer._dtypes import (
+    Message,
+    ReadRequest,
+    PendingResponse,
+    AckRequest,
+    Offset,
+    PartitionsResponse,
+    get_default_partitions,
+)
+from pynumaflow.sourcer.async_server import AsyncSourcer
+from pynumaflow.sourcer.server import Sourcer
+
+__all__ = [
+    "Message",
+    "ReadRequest",
+    "PendingResponse",
+    "AckRequest",
+    "Offset",
+    "AsyncSourcer",
+    "Sourcer",
+    "PartitionsResponse",
+    "get_default_partitions",
+]
diff --git a/pynumaflow/sourcer/_dtypes.py b/pynumaflow/sourcer/_dtypes.py
new file mode 100644
index 00000000..27246c43
--- /dev/null
+++ b/pynumaflow/sourcer/_dtypes.py
@@ -0,0 +1,267 @@
+import os
+from abc import ABCMeta, abstractmethod
+from collections.abc import Iterable
+from dataclasses import dataclass
+from datetime import datetime
+from typing import Callable
+from collections.abc import AsyncIterable
+
+
+@dataclass(init=False)
+class Offset:
+    """
+    Args:
+        offset: the offset of the datum.
+        partition_id: partition_id indicates which partition of the source the datum belongs to.
+    """
+
+    __slots__ = ("_offset", "_partition_id")
+
+    _offset: bytes
+    _partition_id: int
+
+    def __init__(self, offset: bytes, partition_id: int):
+        self._offset = offset
+        self._partition_id = partition_id
+
+    @classmethod
+    def offset_with_default_partition_id(cls, offset: bytes):
+        """
+        Returns an Offset object with the given offset and default partition id.
+        """
+        return Offset(offset=offset, partition_id=get_default_partitions()[0])
+
+    @property
+    def as_dict(self):
+        return {"offset": self._offset, "partition_id": self._partition_id}
+
+    @property
+    def offset(self) -> bytes:
+        return self._offset
+
+    @property
+    def partition_id(self) -> int:
+        return self._partition_id
+
+
+@dataclass(init=False)
+class Message:
+    """
+    Basic datatype for data passing to the next vertex/vertices.
+
+    Args:
+        payload: data in bytes
+        offset: the offset of the datum.
+        event_time: event time of the message, usually extracted from the payload.
+        keys: list of string keys for the vertex (optional)
+    """
+
+    __slots__ = ("_payload", "_offset", "_event_time", "_keys")
+
+    _payload: bytes
+    _offset: Offset
+    _event_time: datetime
+    _keys: list[str]
+
+    def __init__(
+        self, payload: bytes, offset: Offset, event_time: datetime, keys: list[str] = None
+    ):
+        """
+        Creates a Message object to send value to a vertex.
+        """
+        self._payload = payload
+        self._offset = offset
+        self._event_time = event_time
+        self._keys = keys or []
+
+    @property
+    def payload(self) -> bytes:
+        return self._payload
+
+    @property
+    def keys(self) -> list[str]:
+        return self._keys
+
+    @property
+    def offset(self) -> Offset:
+        return self._offset
+
+    @property
+    def event_time(self) -> datetime:
+        return self._event_time
+
+
+@dataclass(init=False)
+class ReadRequest:
+    """
+    Class to define the request for reading datum stream from user defined source.
+    Args:
+        num_records: the number of records to read.
+        timeout_in_ms: the request timeout in milliseconds.
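+    The timeout is carried as uint32 milliseconds on the wire (see ReadRequest
+    in source.proto above).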
+    >>> # Example usage
+    >>> from pynumaflow.sourcer import ReadRequest
+    >>> read_request = ReadRequest(num_records=10, timeout_in_ms=1000)
+    """
+
+    __slots__ = ("_num_records", "_timeout_in_ms")
+
+    _num_records: int
+    _timeout_in_ms: int
+
+    def __init__(
+        self,
+        num_records: int,
+        timeout_in_ms: int,
+    ):
+        if not isinstance(num_records, int):
+            raise TypeError(f"Wrong data type: {type(num_records)} for ReadRequest.num_records")
+        self._num_records = num_records
+        if not isinstance(timeout_in_ms, int):
+            raise TypeError(f"Wrong data type: {type(timeout_in_ms)} for ReadRequest.timeout_in_ms")
+        self._timeout_in_ms = timeout_in_ms
+
+    @property
+    def num_records(self) -> int:
+        """Returns the num_records of the request"""
+        return self._num_records
+
+    @property
+    def timeout_in_ms(self) -> int:
+        """Returns the timeout_in_ms of the request."""
+        return self._timeout_in_ms
+
+
+@dataclass(init=False)
+class AckRequest:
+    """
+    Class for defining the request for acknowledging datum.
+    It takes a list of offsets that need to be acknowledged.
+    Args:
+        offsets: the offsets to be acknowledged.
+    >>> # Example usage
+    >>> from pynumaflow.sourcer import AckRequest, Offset
+    >>> offset = Offset(offset=b"123", partition_id=0)
+    >>> ack_request = AckRequest(offsets=[offset, offset])
+    """
+
+    __slots__ = ("_offsets",)
+    _offsets: list[Offset]
+
+    def __init__(self, offsets: list[Offset]):
+        self._offsets = offsets
+
+    @property
+    def offset(self) -> list[Offset]:
+        """Returns the offsets to be acknowledged."""
+        return self._offsets
+
+
+@dataclass(init=False)
+class PendingResponse:
+    """
+    PendingResponse is the response for the pending request.
+    It indicates the number of pending records at the user defined source.
+    A negative count indicates that the pending information is not available.
+    Args:
+        count: the number of pending records.
+    """
+
+    __slots__ = ("_count",)
+    _count: int
+
+    def __init__(self, count: int):
+        if not isinstance(count, int):
+            raise TypeError(f"Wrong data type: {type(count)} for Pending.count")
+        self._count = count
+
+    @property
+    def count(self) -> int:
+        """Returns the count of pending records"""
+        return self._count
+
+
+@dataclass(init=False)
+class PartitionsResponse:
+    """
+    PartitionsResponse is the response for the partitions request.
+    It holds the list of partition ids at the user defined source.
+    Args:
+        partitions: the list of partition ids.
+    """
+
+    __slots__ = ("_partitions",)
+    _partitions: list[int]
+
+    def __init__(self, partitions: list[int]):
+        if not isinstance(partitions, list):
+            raise TypeError(f"Wrong data type: {type(partitions)} for Partition.partitions")
+        self._partitions = partitions
+
+    @property
+    def partitions(self) -> list[int]:
+        """Returns the list of partitions"""
+        return self._partitions
+
+
+class SourcerClass(metaclass=ABCMeta):
+    """
+    Provides an interface to write a Sourcer
+    which will be exposed over a gRPC server.
+    """
+
+    def __call__(self, *args, **kwargs):
+        """
+        Allows the handler function to be called directly when a class instance is used.
+        """
+        return self.handler(*args, **kwargs)
+
+    @abstractmethod
+    def read_handler(self, datum: ReadRequest) -> Iterable[Message]:
+        """
+        Write a handler function which implements the SourceReadCallable interface.
read_handler is used to read the data from the source and send the data forward.
+        For each read request it should yield at most num_records messages; the offsets
+        of the messages that have been read are expected to be acknowledged later
+        through the ack handler.
+        """
+        pass
+
+    @abstractmethod
+    def ack_handler(self, ack_request: AckRequest):
+        """
+        The ack handler is used to acknowledge the offsets that have been read,
+        so that the source does not deliver them again.
+        """
+        pass
+
+    @abstractmethod
+    def pending_handler(self) -> PendingResponse:
+        """
+        The pending handler returns the number of pending records at the source;
+        a negative count indicates that the pending information is not available.
+        """
+        pass
+
+    @abstractmethod
+    def partitions_handler(self) -> PartitionsResponse:
+        """
+        The partitions handler returns the list of partitions for the source.
+        """
+        pass
+
+
+# Create default partition id from the environment variable "NUMAFLOW_REPLICA"
+DefaultPartitionId = int(os.getenv("NUMAFLOW_REPLICA", "0"))
+SourceReadCallable = Callable[[ReadRequest], Iterable[Message]]
+AsyncSourceReadCallable = Callable[[ReadRequest], AsyncIterable[Message]]
+SourceAckCallable = Callable[[AckRequest], None]
+SourceCallable = SourcerClass
+
+
+def get_default_partitions() -> list[int]:
+    """
+    Returns the default partition ids.
+    """
+    return [DefaultPartitionId]
diff --git a/pynumaflow/sourcer/async_server.py b/pynumaflow/sourcer/async_server.py
new file mode 100644
index 00000000..a867f7fa
--- /dev/null
+++ b/pynumaflow/sourcer/async_server.py
@@ -0,0 +1,170 @@
+import logging
+import os
+
+from collections.abc import AsyncIterable
+from google.protobuf import timestamp_pb2 as _timestamp_pb2
+import grpc
+from google.protobuf import empty_pb2 as _empty_pb2
+
+from pynumaflow import setup_logging
+from pynumaflow.sourcer import ReadRequest
+from pynumaflow.sourcer._dtypes import Offset, AckRequest, SourceCallable
+from pynumaflow.proto.sourcer import source_pb2
+from pynumaflow.proto.sourcer import source_pb2_grpc
+from pynumaflow.types import NumaflowServicerContext
+
+_LOGGER = setup_logging(__name__)
+if os.getenv("PYTHONDEBUG"):
+    _LOGGER.setLevel(logging.DEBUG)
+
+
+class AsyncSourcer(source_pb2_grpc.SourceServicer):
+    """
+    Provides an interface to write an Asynchronous Sourcer
+    which will be exposed over gRPC.
+
+    Args:
+        source_handler: an instance of the SourcerClass type, which implements
+        the read, ack, pending and partitions handlers
+
+    Example invocation:
+    >>> from collections.abc import AsyncIterable
+    >>> from pynumaflow.sourcer import Message, get_default_partitions, \
+    ...     ReadRequest, AsyncSourcer
+    >>> from pynumaflow.sourcer._dtypes import SourcerClass
+    >>> import aiorun
+    >>> class AsyncSource(SourcerClass):
+    ...     async def read_handler(self, datum: ReadRequest) -> AsyncIterable[Message]:
+    ...         payload = b"payload:test_mock_message"
+    ...         keys = ["test_key"]
+    ...         offset = mock_offset()
+    ...         event_time = mock_event_time()
+    ...         for i in range(10):
+    ...             yield Message(payload=payload, keys=keys, offset=offset, event_time=event_time)
+    ...     async def ack_handler(self, ack_request: AckRequest):
+    ...         return
+    ...     async def pending_handler(self) -> PendingResponse:
+    ...         return PendingResponse(count=10)
+    ...     async def partitions_handler(self) -> PartitionsResponse:
+    ...         return PartitionsResponse(partitions=get_default_partitions())
+    >>> grpc_server = AsyncSourcer(source_handler=AsyncSource())
+    >>> # the servicer is attached to an asyncio gRPC server by SourceServer
+    >>> # with server_type=ServerType.Async (see source.py)
+    """
+
+    def __init__(self, source_handler: SourceCallable):
+        self.source_handler = source_handler
+        self.__source_read_handler = source_handler.read_handler
+        self.__source_ack_handler = source_handler.ack_handler
+        self.__source_pending_handler = source_handler.pending_handler
+        self.__source_partitions_handler = source_handler.partitions_handler
+        self.cleanup_coroutines = []
+
+    async def ReadFn(
+        self,
+        request: source_pb2.ReadRequest,
+        context: NumaflowServicerContext,
+    ) -> AsyncIterable[source_pb2.ReadResponse]:
+        """
+        Applies a Read function and returns a stream of datum responses.
+        The pascal case function name comes from the proto source_pb2_grpc.py file.
+        """
+
+        async for res in self.__invoke_source_read_stream(
+            ReadRequest(
+                num_records=request.request.num_records,
+                timeout_in_ms=request.request.timeout_in_ms,
+            )
+        ):
+            yield source_pb2.ReadResponse(result=res)
+
+    async def __invoke_source_read_stream(self, req: ReadRequest):
+        try:
+            async for msg in self.__source_read_handler(req):
+                event_time_timestamp = _timestamp_pb2.Timestamp()
+                event_time_timestamp.FromDatetime(dt=msg.event_time)
+                yield source_pb2.ReadResponse.Result(
+                    payload=msg.payload,
+                    keys=msg.keys,
+                    offset=msg.offset.as_dict,
+                    event_time=event_time_timestamp,
+                )
+        except Exception as err:
+            _LOGGER.critical("User-Defined Source ReadError", exc_info=True)
+            raise err
+
+    async def AckFn(
+        self, request: source_pb2.AckRequest, context: NumaflowServicerContext
+    ) -> source_pb2.AckResponse:
+        """
+        Applies an Ack function in User Defined Source
+        """
+        # proto repeated field(offsets) is of type google._upb._message.RepeatedScalarContainer
+        # we need to explicitly convert it to list
+        offsets = []
+        for offset in request.request.offsets:
+            offsets.append(Offset(offset.offset, offset.partition_id))
+        try:
+            await self.__invoke_ack(ack_req=offsets)
+        except Exception as e:
+            context.set_code(grpc.StatusCode.UNKNOWN)
+            context.set_details(str(e))
+            raise e
+
+        return source_pb2.AckResponse()
+
+    async def __invoke_ack(self, ack_req: list[Offset]):
+        """
+        Invokes the Source Ack Function.
+        """
+        try:
+            await self.__source_ack_handler(AckRequest(offsets=ack_req))
+        except Exception as err:
+            _LOGGER.critical("AckFn Error", exc_info=True)
+            raise err
+        return source_pb2.AckResponse.Result()
+
+    async def IsReady(
+        self, request: _empty_pb2.Empty, context: NumaflowServicerContext
+    ) -> source_pb2.ReadyResponse:
+        """
+        IsReady is the heartbeat endpoint for gRPC.
+        The pascal case function name comes from the proto source_pb2_grpc.py file.
+        """
+        return source_pb2.ReadyResponse(ready=True)
+
+    async def PendingFn(
+        self, request: _empty_pb2.Empty, context: NumaflowServicerContext
+    ) -> source_pb2.PendingResponse:
+        """
+        PendingFn returns the number of pending records
+        at the user defined source.
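+
+        For example, a pending_handler backed by an in-memory buffer (the
+        `self.queue` attribute here is illustrative, not part of the SDK)
+        could look like:
+
+            async def pending_handler(self) -> PendingResponse:
+                # a negative count would signal that pending info is unavailable
+                return PendingResponse(count=len(self.queue))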
+ """ + try: + count = await self.__source_pending_handler() + except Exception as err: + _LOGGER.critical("PendingFn Error", exc_info=True) + raise err + resp = source_pb2.PendingResponse.Result(count=count.count) + return source_pb2.PendingResponse(result=resp) + + async def PartitionsFn( + self, request: _empty_pb2.Empty, context: NumaflowServicerContext + ) -> source_pb2.PartitionsResponse: + """ + PartitionsFn returns the partitions of the user defined source. + """ + try: + partitions = await self.__source_partitions_handler() + except Exception as err: + _LOGGER.critical("PartitionsFn Error", exc_info=True) + raise err + resp = source_pb2.PartitionsResponse.Result(partitions=partitions.partitions) + return source_pb2.PartitionsResponse(result=resp) diff --git a/pynumaflow/sourcer/server.py b/pynumaflow/sourcer/server.py new file mode 100644 index 00000000..ee6463bd --- /dev/null +++ b/pynumaflow/sourcer/server.py @@ -0,0 +1,175 @@ +import logging +import os + +from collections.abc import Iterable + +from google.protobuf import timestamp_pb2 as _timestamp_pb2 +import grpc +from google.protobuf import empty_pb2 as _empty_pb2 + +from pynumaflow import setup_logging +from pynumaflow.sourcer import ReadRequest +from pynumaflow.sourcer._dtypes import ( + SourceReadCallable, + Offset, + AckRequest, + SourceAckCallable, + SourceCallable, +) +from pynumaflow.proto.sourcer import source_pb2 +from pynumaflow.proto.sourcer import source_pb2_grpc +from pynumaflow.types import NumaflowServicerContext + +_LOGGER = setup_logging(__name__) +if os.getenv("PYTHONDEBUG"): + _LOGGER.setLevel(logging.DEBUG) + + +class Sourcer(source_pb2_grpc.SourceServicer): + """ + Provides an interface to write a Sourcer + which will be exposed over gRPC. + + Args: + source_handler: Class of the type SourcerClass which implements the UDS methods + + Example invocation: + >>> from typing import Iterator + >>> from pynumaflow.sourcer import Message, get_default_partitions, PartitionsResponse \ + ... ReadRequest, Sourcer, AckRequest, + ... def read_handler(datum: ReadRequest) -> Iterable[Message]: + ... payload = b"payload:test_mock_message" + ... keys = ["test_key"] + ... offset = mock_offset() + ... event_time = mock_event_time() + ... for i in range(10): + ... yield Message(payload=payload, keys=keys, offset=offset, event_time=event_time) + ... def ack_handler(ack_request: AckRequest): + ... return + ... def pending_handler() -> PendingResponse: + ... PendingResponse(count=10) + ... def partitions_handler() -> PartitionsResponse: + ... return PartitionsResponse(partitions=get_default_partitions()) + >>> grpc_server = Sourcer(read_handler=read_handler, + ... ack_handler=ack_handler, + ... pending_handler=pending_handler, + ... partitions_handler=partition_handler,) + >>> grpc_server.start() + """ + + def __init__( + self, + source_handler: SourceCallable, + ): + self.source_handler = source_handler + self.__source_read_handler: SourceReadCallable = source_handler.read_handler + self.__source_ack_handler: SourceAckCallable = source_handler.ack_handler + self.__source_pending_handler = source_handler.pending_handler + self.__source_partitions_handler = source_handler.partitions_handler + + def ReadFn( + self, + request: source_pb2.ReadRequest, + context: NumaflowServicerContext, + ) -> Iterable[source_pb2.ReadResponse]: + """ + Applies a Read function to a datum stream in streaming mode. + The pascal case function name comes from the proto source_pb2_grpc.py file. 
+ """ + + for res in self.__invoke_source_read_stream( + ReadRequest( + num_records=request.request.num_records, + timeout_in_ms=request.request.timeout_in_ms, + ) + ): + yield source_pb2.ReadResponse(result=res) + + def __invoke_source_read_stream(self, req: ReadRequest): + try: + for msg in self.__source_read_handler(req): + event_time_timestamp = _timestamp_pb2.Timestamp() + event_time_timestamp.FromDatetime(dt=msg.event_time) + yield source_pb2.ReadResponse.Result( + payload=msg.payload, + keys=msg.keys, + offset=msg.offset.as_dict, + event_time=event_time_timestamp, + ) + except Exception as err: + _LOGGER.critical("User-Defined Source ReadError ", exc_info=True) + raise err + + def AckFn( + self, request: source_pb2.AckRequest, context: NumaflowServicerContext + ) -> source_pb2.AckResponse: + """ + Applies an Ack function in User Defined Source + """ + # proto repeated field(offsets) is of type google._upb._message.RepeatedScalarContainer + # we need to explicitly convert it to list + offsets = [] + for offset in request.request.offsets: + offsets.append(Offset(offset.offset, offset.partition_id)) + try: + self.__invoke_ack(ack_req=offsets) + except Exception as e: + context.set_code(grpc.StatusCode.UNKNOWN) + context.set_details(str(e)) + raise e + + return source_pb2.AckResponse() + + def __invoke_ack(self, ack_req: list[Offset]): + """ + Invokes the Source Ack Function. + """ + try: + self.__source_ack_handler(AckRequest(offsets=ack_req)) + except Exception as err: + _LOGGER.critical("AckFn Error", exc_info=True) + raise err + return source_pb2.AckResponse.Result() + + def IsReady( + self, request: _empty_pb2.Empty, context: NumaflowServicerContext + ) -> source_pb2.ReadyResponse: + """ + IsReady is the heartbeat endpoint for gRPC. + The pascal case function name comes from the proto source_pb2_grpc.py file. + """ + return source_pb2.ReadyResponse(ready=True) + + def PendingFn( + self, request: _empty_pb2.Empty, context: NumaflowServicerContext + ) -> source_pb2.PendingResponse: + """ + PendingFn returns the number of pending records + at the user defined source. + """ + try: + count = self.__source_pending_handler() + except Exception as err: + _LOGGER.critical("PendingFn error", exc_info=True) + raise err + resp = source_pb2.PendingResponse.Result(count=count.count) + return source_pb2.PendingResponse(result=resp) + + def PartitionsFn( + self, request: _empty_pb2.Empty, context: NumaflowServicerContext + ) -> source_pb2.PartitionsResponse: + """ + Partitions returns the partitions associated with the source, will be used by + the platform to determine the partitions to which the watermark should be published. + If the source doesn't have partitions, get_default_partitions() can be used to + return the default partitions. In most cases, the get_default_partitions() + should be enough; the cases where we need to implement custom partitions_handler() + is in a case like Kafka, where a reader can read from multiple Kafka partitions. 
+ """ + try: + partitions = self.__source_partitions_handler() + except Exception as err: + _LOGGER.critical("PartitionFn error", exc_info=True) + raise err + resp = source_pb2.PartitionsResponse.Result(partitions=partitions.partitions) + return source_pb2.PartitionsResponse(result=resp) diff --git a/pynumaflow/sourcer/source.py b/pynumaflow/sourcer/source.py new file mode 100644 index 00000000..af755ea0 --- /dev/null +++ b/pynumaflow/sourcer/source.py @@ -0,0 +1,105 @@ +import os + +import aiorun +import grpc +from pynumaflow.sourcer.async_server import AsyncSourcer + +from pynumaflow.sourcer.server import Sourcer + +from pynumaflow._constants import ( + SOURCE_SOCK_PATH, + MAX_MESSAGE_SIZE, + MAX_THREADS, + ServerType, + _LOGGER, + UDFType, +) +from pynumaflow.proto.sourcer import source_pb2_grpc + +from pynumaflow.shared.server import NumaflowServer, sync_server_start, start_async_server +from pynumaflow.sourcer._dtypes import SourceCallable + + +class SourceServer(NumaflowServer): + def __init__( + self, + sourcer_instance: SourceCallable, + sock_path=SOURCE_SOCK_PATH, + max_message_size=MAX_MESSAGE_SIZE, + max_threads=MAX_THREADS, + server_type=ServerType.Sync, + ): + """ + Create a new grpc Server instance. + A new servicer instance is created and attached to the server. + The server instance is returned. + + max_message_size: The max message size in bytes the server can receive and send + max_threads: The max number of threads to be spawned; + defaults to number of processors x4 + """ + self.sock_path = f"unix://{sock_path}" + self.max_threads = min(max_threads, int(os.getenv("MAX_THREADS", "4"))) + self.max_message_size = max_message_size + + self.sourcer_instance = sourcer_instance + self.server_type = server_type + + self._server_options = [ + ("grpc.max_send_message_length", self.max_message_size), + ("grpc.max_receive_message_length", self.max_message_size), + ] + + def start(self): + """ + Starts the gRPC server on the given UNIX socket with given max threads. + """ + if self.server_type == ServerType.Sync: + self.exec() + elif self.server_type == ServerType.Async: + aiorun.run(self.aexec()) + else: + _LOGGER.error("Server type not supported", self.server_type) + raise NotImplementedError + + def exec(self): + """ + Starts the Synchronous gRPC server on the given UNIX socket with given max threads. 
+ """ + source_servicer = self.get_servicer( + sourcer_instance=self.sourcer_instance, server_type=self.server_type + ) + _LOGGER.info( + "Sync GRPC Server listening on: %s with max threads: %s", + self.sock_path, + self.max_threads, + ) + + sync_server_start( + servicer=source_servicer, + bind_address=self.sock_path, + max_threads=self.max_threads, + server_options=self._server_options, + udf_type=UDFType.Sink, + ) + + def aexec(self): + """ + Starts the Async gRPC server on the given UNIX socket with given max threads + """ + server = grpc.aio.server() + server.add_insecure_port(self.sock_path) + source_servicer = self.get_servicer( + sourcer_instance=self.sourcer_instance, server_type=self.server_type + ) + source_pb2_grpc.add_SourceServicer_to_server(source_servicer, server) + await start_async_server(server, self.sock_path, self.max_threads, self._server_options) + + def get_servicer(self, sourcer_instance: SourceCallable, server_type: ServerType): + if server_type == ServerType.Sync: + source_servicer = Sourcer(source_handler=sourcer_instance) + elif server_type == ServerType.Async: + source_servicer = AsyncSourcer(source_handler=sourcer_instance) + else: + raise NotImplementedError + return source_servicer From b517b6c918b4ee61b2067dd544357213500d442e Mon Sep 17 00:00:00 2001 From: Sidhant Kohli Date: Tue, 9 Jan 2024 14:33:47 -0800 Subject: [PATCH 36/78] source Signed-off-by: Sidhant Kohli --- pynumaflow/sourcer/__init__.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/pynumaflow/sourcer/__init__.py b/pynumaflow/sourcer/__init__.py index f846b5f7..9d4fd24d 100644 --- a/pynumaflow/sourcer/__init__.py +++ b/pynumaflow/sourcer/__init__.py @@ -1,3 +1,7 @@ +from pynumaflow._constants import ServerType + +from pynumaflow.sourcer.source import SourceServer + from pynumaflow.sourcer._dtypes import ( Message, ReadRequest, @@ -5,7 +9,7 @@ AckRequest, Offset, PartitionsResponse, - get_default_partitions, + get_default_partitions, SourcerClass, ) from pynumaflow.sourcer.async_server import AsyncSourcer from pynumaflow.sourcer.server import Sourcer @@ -16,8 +20,9 @@ "PendingResponse", "AckRequest", "Offset", - "AsyncSourcer", - "Sourcer", "PartitionsResponse", "get_default_partitions", + "SourceServer", + "SourcerClass", + "ServerType", ] From 9c814c628a268e572aeb160c8a0b16fe0e0ba0e9 Mon Sep 17 00:00:00 2001 From: Sidhant Kohli Date: Tue, 9 Jan 2024 14:45:32 -0800 Subject: [PATCH 37/78] source Signed-off-by: Sidhant Kohli --- pynumaflow/sourcer/source.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pynumaflow/sourcer/source.py b/pynumaflow/sourcer/source.py index af755ea0..62fdfe48 100644 --- a/pynumaflow/sourcer/source.py +++ b/pynumaflow/sourcer/source.py @@ -83,7 +83,7 @@ def exec(self): udf_type=UDFType.Sink, ) - def aexec(self): + async def aexec(self): """ Starts the Async gRPC server on the given UNIX socket with given max threads """ From 111684a598c0c0d7053d8e7de587596cdd6c3d1c Mon Sep 17 00:00:00 2001 From: Sidhant Kohli Date: Tue, 9 Jan 2024 14:58:47 -0800 Subject: [PATCH 38/78] source Signed-off-by: Sidhant Kohli --- pynumaflow/sourcer/async_server.py | 2 +- pynumaflow/sourcer/server.py | 2 +- pynumaflow/sourcer/source.py | 3 +-- 3 files changed, 3 insertions(+), 4 deletions(-) diff --git a/pynumaflow/sourcer/async_server.py b/pynumaflow/sourcer/async_server.py index a867f7fa..8b68b01e 100644 --- a/pynumaflow/sourcer/async_server.py +++ b/pynumaflow/sourcer/async_server.py @@ -7,7 +7,7 @@ from google.protobuf import empty_pb2 as 
_empty_pb2 from pynumaflow import setup_logging -from pynumaflow.sourcer import ReadRequest +from pynumaflow.sourcer._dtypes import ReadRequest from pynumaflow.sourcer._dtypes import Offset, AckRequest, SourceCallable from pynumaflow.proto.sourcer import source_pb2 from pynumaflow.proto.sourcer import source_pb2_grpc diff --git a/pynumaflow/sourcer/server.py b/pynumaflow/sourcer/server.py index ee6463bd..ae4c1cc9 100644 --- a/pynumaflow/sourcer/server.py +++ b/pynumaflow/sourcer/server.py @@ -8,7 +8,7 @@ from google.protobuf import empty_pb2 as _empty_pb2 from pynumaflow import setup_logging -from pynumaflow.sourcer import ReadRequest +from pynumaflow.sourcer._dtypes import ReadRequest from pynumaflow.sourcer._dtypes import ( SourceReadCallable, Offset, diff --git a/pynumaflow/sourcer/source.py b/pynumaflow/sourcer/source.py index 62fdfe48..2f1c203e 100644 --- a/pynumaflow/sourcer/source.py +++ b/pynumaflow/sourcer/source.py @@ -3,7 +3,6 @@ import aiorun import grpc from pynumaflow.sourcer.async_server import AsyncSourcer - from pynumaflow.sourcer.server import Sourcer from pynumaflow._constants import ( @@ -80,7 +79,7 @@ def exec(self): bind_address=self.sock_path, max_threads=self.max_threads, server_options=self._server_options, - udf_type=UDFType.Sink, + udf_type=UDFType.Source, ) async def aexec(self): From 39788cd300d1db413a7e2c34de1c57be532fc73f Mon Sep 17 00:00:00 2001 From: Sidhant Kohli Date: Tue, 9 Jan 2024 15:14:17 -0800 Subject: [PATCH 39/78] source Signed-off-by: Sidhant Kohli --- pynumaflow/shared/server.py | 2 +- pynumaflow/sourcer/__init__.py | 5 ++--- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/pynumaflow/shared/server.py b/pynumaflow/shared/server.py index 216fdb22..de3be5a5 100644 --- a/pynumaflow/shared/server.py +++ b/pynumaflow/shared/server.py @@ -132,7 +132,7 @@ def _run_server( sink_pb2_grpc.add_SinkServicer_to_server(servicer, server) elif udf_type == UDFType.SourceTransformer: transform_pb2_grpc.add_SourceTransformServicer_to_server(servicer, server) - elif udf_type == UDFType.Sink: + elif udf_type == UDFType.Source: source_pb2_grpc.add_SourceServicer_to_server(servicer, server) # bind the server to the UDS/TCP socket diff --git a/pynumaflow/sourcer/__init__.py b/pynumaflow/sourcer/__init__.py index 9d4fd24d..5d558433 100644 --- a/pynumaflow/sourcer/__init__.py +++ b/pynumaflow/sourcer/__init__.py @@ -9,10 +9,9 @@ AckRequest, Offset, PartitionsResponse, - get_default_partitions, SourcerClass, + get_default_partitions, + SourcerClass, ) -from pynumaflow.sourcer.async_server import AsyncSourcer -from pynumaflow.sourcer.server import Sourcer __all__ = [ "Message", From d7fe470e27111242ab48f9f778c03fc9a1c3f26d Mon Sep 17 00:00:00 2001 From: Sidhant Kohli Date: Wed, 10 Jan 2024 11:21:16 -0800 Subject: [PATCH 40/78] tests Signed-off-by: Sidhant Kohli --- pynumaflow-old/__init__.py | 36 --- pynumaflow-old/_constants.py | 19 -- pynumaflow-old/exceptions.py | 6 - pynumaflow-old/info/__init__.py | 0 pynumaflow-old/info/server.py | 59 ---- pynumaflow-old/info/types.py | 49 ---- pynumaflow-old/mapper/__init__.py | 19 -- pynumaflow-old/mapper/_dtypes.py | 166 ----------- pynumaflow-old/mapper/async_server.py | 161 ----------- pynumaflow-old/mapper/multiproc_server.py | 208 -------------- pynumaflow-old/mapper/proto/__init__.py | 0 pynumaflow-old/mapper/proto/map.proto | 43 --- pynumaflow-old/mapper/proto/map_pb2.py | 38 --- pynumaflow-old/mapper/proto/map_pb2_grpc.py | 123 -------- pynumaflow-old/mapper/server.py | 136 --------- 
pynumaflow-old/mapstreamer/__init__.py | 15 - pynumaflow-old/mapstreamer/_dtypes.py | 166 ----------- pynumaflow-old/mapstreamer/async_server.py | 151 ---------- pynumaflow-old/mapstreamer/proto/__init__.py | 0 .../mapstreamer/proto/mapstream.proto | 44 --- .../mapstreamer/proto/mapstream_pb2.py | 38 --- .../mapstreamer/proto/mapstream_pb2_grpc.py | 125 -------- pynumaflow-old/reducer/__init__.py | 19 -- pynumaflow-old/reducer/_dtypes.py | 235 ---------------- pynumaflow-old/reducer/async_server.py | 251 ----------------- pynumaflow-old/reducer/asynciter.py | 23 -- pynumaflow-old/reducer/proto/__init__.py | 0 pynumaflow-old/reducer/proto/reduce.proto | 44 --- pynumaflow-old/reducer/proto/reduce_pb2.py | 38 --- .../reducer/proto/reduce_pb2_grpc.py | 123 -------- pynumaflow-old/sideinput/__init__.py | 4 - pynumaflow-old/sideinput/_dtypes.py | 38 --- pynumaflow-old/sideinput/proto/__init__.py | 0 .../sideinput/proto/sideinput.proto | 40 --- .../sideinput/proto/sideinput_pb2.py | 33 --- .../sideinput/proto/sideinput_pb2_grpc.py | 149 ---------- pynumaflow-old/sideinput/server.py | 115 -------- pynumaflow-old/sinker/__init__.py | 5 - pynumaflow-old/sinker/_dtypes.py | 164 ----------- pynumaflow-old/sinker/async_sink.py | 153 ---------- pynumaflow-old/sinker/proto/__init__.py | 0 pynumaflow-old/sinker/proto/sink.proto | 50 ---- pynumaflow-old/sinker/proto/sink_pb2.py | 39 --- pynumaflow-old/sinker/proto/sink_pb2_grpc.py | 123 -------- pynumaflow-old/sinker/server.py | 139 --------- pynumaflow-old/sourcer/__init__.py | 23 -- pynumaflow-old/sourcer/_dtypes.py | 217 -------------- pynumaflow-old/sourcer/async_server.py | 237 ---------------- pynumaflow-old/sourcer/proto/__init__.py | 0 pynumaflow-old/sourcer/proto/source.proto | 148 ---------- pynumaflow-old/sourcer/proto/source_pb2.py | 58 ---- .../sourcer/proto/source_pb2_grpc.py | 266 ------------------ pynumaflow-old/sourcer/server.py | 231 --------------- pynumaflow-old/sourcetransformer/__init__.py | 12 - pynumaflow-old/types.py | 7 - pynumaflow/sinker/_dtypes.py | 4 +- tests/map/test_async_mapper.py | 10 +- tests/map/test_multiproc_mapper.py | 73 ++--- tests/map/test_sync_mapper.py | 21 +- tests/mapstream/test_async_map_stream.py | 12 +- tests/mapstream/test_async_map_stream_err.py | 18 +- tests/mapstream/utils.py | 2 +- tests/reduce/test_async_reduce.py | 14 +- tests/sideinput/test_side_input_server.py | 2 +- tests/sink/test_async_sink.py | 9 +- tests/sink/test_server.py | 14 +- tests/source/test_async_source.py | 30 +- tests/source/test_async_source_err.py | 19 +- tests/source/test_sync_source.py | 30 +- tests/source/test_sync_source_err.py | 21 +- tests/source/utils.py | 124 ++++---- tests/sourcetransform/test_multiproc.py | 59 ++-- tests/sourcetransform/test_sync_server.py | 34 ++- 73 files changed, 260 insertions(+), 4822 deletions(-) delete mode 100644 pynumaflow-old/__init__.py delete mode 100644 pynumaflow-old/_constants.py delete mode 100644 pynumaflow-old/exceptions.py delete mode 100644 pynumaflow-old/info/__init__.py delete mode 100644 pynumaflow-old/info/server.py delete mode 100644 pynumaflow-old/info/types.py delete mode 100644 pynumaflow-old/mapper/__init__.py delete mode 100644 pynumaflow-old/mapper/_dtypes.py delete mode 100644 pynumaflow-old/mapper/async_server.py delete mode 100644 pynumaflow-old/mapper/multiproc_server.py delete mode 100644 pynumaflow-old/mapper/proto/__init__.py delete mode 100644 pynumaflow-old/mapper/proto/map.proto delete mode 100644 pynumaflow-old/mapper/proto/map_pb2.py delete mode 100644 
pynumaflow-old/mapper/proto/map_pb2_grpc.py delete mode 100644 pynumaflow-old/mapper/server.py delete mode 100644 pynumaflow-old/mapstreamer/__init__.py delete mode 100644 pynumaflow-old/mapstreamer/_dtypes.py delete mode 100644 pynumaflow-old/mapstreamer/async_server.py delete mode 100644 pynumaflow-old/mapstreamer/proto/__init__.py delete mode 100644 pynumaflow-old/mapstreamer/proto/mapstream.proto delete mode 100644 pynumaflow-old/mapstreamer/proto/mapstream_pb2.py delete mode 100644 pynumaflow-old/mapstreamer/proto/mapstream_pb2_grpc.py delete mode 100644 pynumaflow-old/reducer/__init__.py delete mode 100644 pynumaflow-old/reducer/_dtypes.py delete mode 100644 pynumaflow-old/reducer/async_server.py delete mode 100644 pynumaflow-old/reducer/asynciter.py delete mode 100644 pynumaflow-old/reducer/proto/__init__.py delete mode 100644 pynumaflow-old/reducer/proto/reduce.proto delete mode 100644 pynumaflow-old/reducer/proto/reduce_pb2.py delete mode 100644 pynumaflow-old/reducer/proto/reduce_pb2_grpc.py delete mode 100644 pynumaflow-old/sideinput/__init__.py delete mode 100644 pynumaflow-old/sideinput/_dtypes.py delete mode 100644 pynumaflow-old/sideinput/proto/__init__.py delete mode 100644 pynumaflow-old/sideinput/proto/sideinput.proto delete mode 100644 pynumaflow-old/sideinput/proto/sideinput_pb2.py delete mode 100644 pynumaflow-old/sideinput/proto/sideinput_pb2_grpc.py delete mode 100644 pynumaflow-old/sideinput/server.py delete mode 100644 pynumaflow-old/sinker/__init__.py delete mode 100644 pynumaflow-old/sinker/_dtypes.py delete mode 100644 pynumaflow-old/sinker/async_sink.py delete mode 100644 pynumaflow-old/sinker/proto/__init__.py delete mode 100644 pynumaflow-old/sinker/proto/sink.proto delete mode 100644 pynumaflow-old/sinker/proto/sink_pb2.py delete mode 100644 pynumaflow-old/sinker/proto/sink_pb2_grpc.py delete mode 100644 pynumaflow-old/sinker/server.py delete mode 100644 pynumaflow-old/sourcer/__init__.py delete mode 100644 pynumaflow-old/sourcer/_dtypes.py delete mode 100644 pynumaflow-old/sourcer/async_server.py delete mode 100644 pynumaflow-old/sourcer/proto/__init__.py delete mode 100644 pynumaflow-old/sourcer/proto/source.proto delete mode 100644 pynumaflow-old/sourcer/proto/source_pb2.py delete mode 100644 pynumaflow-old/sourcer/proto/source_pb2_grpc.py delete mode 100644 pynumaflow-old/sourcer/server.py delete mode 100644 pynumaflow-old/sourcetransformer/__init__.py delete mode 100644 pynumaflow-old/types.py diff --git a/pynumaflow-old/__init__.py b/pynumaflow-old/__init__.py deleted file mode 100644 index bd60d0b6..00000000 --- a/pynumaflow-old/__init__.py +++ /dev/null @@ -1,36 +0,0 @@ -import logging -import os -import sys - -if os.getenv("PYTHONDEBUG"): - os.environ["PYTHONASYNCIODEBUG"] = "1" - - -class StdoutFilter(logging.Filter): - """ - Filter logs with level less than logging.ERROR so they will go to stdout instead - of default stderr - """ - - def filter(self, record: logging.LogRecord) -> bool: - return record.levelno < logging.ERROR - - -def setup_logging(name): - formatter = logging.Formatter( - fmt="%(asctime)s %(levelname)-8s %(message)s", datefmt="%Y-%m-%d %H:%M:%S" - ) - logger = logging.getLogger(name) - - stdout_handler = logging.StreamHandler(sys.stdout) - stdout_handler.setFormatter(formatter) - stdout_handler.addFilter(StdoutFilter()) - stdout_handler.setLevel(logging.INFO) - logger.addHandler(stdout_handler) - - stderr_handler = logging.StreamHandler(sys.stderr) - stderr_handler.setFormatter(formatter) - stderr_handler.setLevel(logging.ERROR) - 
logger.addHandler(stderr_handler) - - return logger diff --git a/pynumaflow-old/_constants.py b/pynumaflow-old/_constants.py deleted file mode 100644 index 253d0401..00000000 --- a/pynumaflow-old/_constants.py +++ /dev/null @@ -1,19 +0,0 @@ -MAP_SOCK_PATH = "/var/run/numaflow/map.sock" -MAP_STREAM_SOCK_PATH = "/var/run/numaflow/mapstream.sock" -REDUCE_SOCK_PATH = "/var/run/numaflow/reduce.sock" -SOURCE_TRANSFORMER_SOCK_PATH = "/var/run/numaflow/sourcetransform.sock" -SINK_SOCK_PATH = "/var/run/numaflow/sink.sock" -MULTIPROC_MAP_SOCK_PORT = 55551 -MULTIPROC_MAP_SOCK_ADDR = "0.0.0.0" -SIDE_INPUT_SOCK_PATH = "/var/run/numaflow/sideinput.sock" -SOURCE_SOCK_PATH = "/var/run/numaflow/source.sock" - -# TODO: need to make sure the DATUM_KEY value is the same as -# https://github.com/numaproj/numaflow-go/blob/main/pkg/function/configs.go#L6 -WIN_START_TIME = "x-numaflow-win-start-time" -WIN_END_TIME = "x-numaflow-win-end-time" -MAX_MESSAGE_SIZE = 1024 * 1024 * 64 -# TODO: None instead of "EOF" ? -STREAM_EOF = "EOF" -DELIMITER = ":" -DROP = "U+005C__DROP__" diff --git a/pynumaflow-old/exceptions.py b/pynumaflow-old/exceptions.py deleted file mode 100644 index 30dc9a76..00000000 --- a/pynumaflow-old/exceptions.py +++ /dev/null @@ -1,6 +0,0 @@ -class NoPublicConstructorError(TypeError): - """Raise when using ClassName() to create objects while public constructor is not supported""" - - -class SocketError(Exception): - """To raise an error while creating socket or setting its property""" diff --git a/pynumaflow-old/info/__init__.py b/pynumaflow-old/info/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/pynumaflow-old/info/server.py b/pynumaflow-old/info/server.py deleted file mode 100644 index abbb7cad..00000000 --- a/pynumaflow-old/info/server.py +++ /dev/null @@ -1,59 +0,0 @@ -import os -from importlib.metadata import version -from typing import Any - -from pynumaflow import setup_logging -from pynumaflow.info.types import ServerInfo, EOF -import json -import logging - -_LOGGER = setup_logging(__name__) -if os.getenv("PYTHONDEBUG"): - _LOGGER.setLevel(logging.DEBUG) - - -def get_sdk_version() -> str: - """ - Return the pynumaflow SDK version - """ - try: - return version("pynumaflow") - except Exception as e: - # Adding this to handle the case for local test/CI where pynumaflow - # will not be installed as a package - _LOGGER.error("Could not read SDK version %r", e, exc_info=True) - return "" - - -def write(server_info: ServerInfo, info_file: str): - """ - Write the ServerInfo to a file , shared with the client (numa container). 
- - args: - serv: The ServerInfo object to be shared - info_file: the shared file path - """ - try: - data = server_info.__dict__ - with open(info_file, "w+") as f: - json.dump(data, f, ensure_ascii=False) - f.write(EOF) - except Exception as err: - _LOGGER.critical("Could not write data to Info-Server %r", err, exc_info=True) - raise err - - -def get_metadata_env(envs: list[tuple[str, str]]) -> dict[str, Any]: - """ - Extract the environment var value from the provided list, - and assign them to the given key in the metadata - - args: - envs: List of tuples (key, env_var) - """ - meta = {} - for key, val in envs: - res = os.getenv(val, None) - if res: - meta[key] = res - return meta diff --git a/pynumaflow-old/info/types.py b/pynumaflow-old/info/types.py deleted file mode 100644 index 0e640082..00000000 --- a/pynumaflow-old/info/types.py +++ /dev/null @@ -1,49 +0,0 @@ -from dataclasses import dataclass, field -from enum import Enum - -# Constants for using in the info-server -# Need to keep consistent with all SDKs and client -SERVER_INFO_FILE_PATH = "/var/run/numaflow/server-info" -EOF = "U+005C__END__" - -# Env variables to be passed in the info server metadata. -# These need to be accessed in the client using the same key. -# Format - (key, env_var) -METADATA_ENVS = [("CPU_LIMIT", "NUMAFLOW_CPU_LIMIT")] - - -class Protocol(str, Enum): - """ - Enumerate grpc server connection protocol. - """ - - UDS = "uds" - TCP = "tcp" - - -class Language(str, Enum): - """ - Enumerate Numaflow SDK language. - """ - - GO = "go" - PYTHON = "python" - JAVA = "java" - - -@dataclass -class ServerInfo: - """ - ServerInfo is used for the gRPC server to provide the information such as protocol, - sdk version, language, metadata to the client. - Args: - protocol: Protocol to use (UDS or TCP) - language: Language used by the server(Python, Golang, Java) - version: Numaflow sdk version used by the server - metadata: Any additional information to be provided (env vars) - """ - - protocol: Protocol - language: Language - version: str - metadata: dict = field(default_factory=dict) diff --git a/pynumaflow-old/mapper/__init__.py b/pynumaflow-old/mapper/__init__.py deleted file mode 100644 index 374b123f..00000000 --- a/pynumaflow-old/mapper/__init__.py +++ /dev/null @@ -1,19 +0,0 @@ -from pynumaflow.mapper._dtypes import ( - Message, - Messages, - Datum, - DROP, -) -from pynumaflow.mapper.async_server import AsyncMapper -from pynumaflow.mapper.multiproc_server import MultiProcMapper -from pynumaflow.mapper.server import Mapper - -__all__ = [ - "Message", - "Messages", - "Datum", - "DROP", - "Mapper", - "AsyncMapper", - "MultiProcMapper", -] diff --git a/pynumaflow-old/mapper/_dtypes.py b/pynumaflow-old/mapper/_dtypes.py deleted file mode 100644 index 92556a9d..00000000 --- a/pynumaflow-old/mapper/_dtypes.py +++ /dev/null @@ -1,166 +0,0 @@ -from collections.abc import Iterator, Sequence, Awaitable -from dataclasses import dataclass -from datetime import datetime -from typing import TypeVar, Callable -from warnings import warn - -from pynumaflow._constants import DROP - -M = TypeVar("M", bound="Message") -Ms = TypeVar("Ms", bound="Messages") - - -@dataclass(init=False) -class Message: - """ - Basic datatype for data passing to the next vertex/vertices. 
- - Args: - value: data in bytes - keys: []string keys for vertex (optional) - tags: []string tags for conditional forwarding (optional) - """ - - __slots__ = ("_value", "_keys", "_tags") - - _value: bytes - _keys: list[str] - _tags: list[str] - - def __init__(self, value: bytes, keys: list[str] = None, tags: list[str] = None): - """ - Creates a Message object to send value to a vertex. - """ - self._keys = keys or [] - self._tags = tags or [] - self._value = value or b"" - - # returns the Message Object which will be dropped - @classmethod - def to_drop(cls: type[M]) -> M: - return cls(b"", None, [DROP]) - - @property - def value(self) -> bytes: - return self._value - - @property - def keys(self) -> list[str]: - return self._keys - - @property - def tags(self) -> list[str]: - return self._tags - - -class Messages(Sequence[M]): - """ - Class to define a list of Message objects. - - Args: - messages: list of Message objects. - """ - - __slots__ = ("_messages",) - - def __init__(self, *messages: M): - self._messages = list(messages) or [] - - def __str__(self) -> str: - return str(self._messages) - - def __repr__(self) -> str: - return str(self) - - def __len__(self) -> int: - return len(self._messages) - - def __iter__(self) -> Iterator[M]: - return iter(self._messages) - - def __getitem__(self, index: int) -> M: - if isinstance(index, slice): - raise TypeError("Slicing is not supported for Messages") - return self._messages[index] - - def append(self, message: Message) -> None: - self._messages.append(message) - - def items(self) -> list[Message]: - warn( - "Using items is deprecated and will be removed in v0.5. " - "Iterate or index the Messages object instead.", - DeprecationWarning, - stacklevel=2, - ) - return self._messages - - -@dataclass(init=False) -class Datum: - """ - Class to define the important information for the event. - Args: - keys: the keys of the event. - value: the payload of the event. - event_time: the event time of the event. - watermark: the watermark of the event. - >>> # Example usage - >>> from pynumaflow.mapper import Datum - >>> from datetime import datetime, timezone - >>> payload = bytes("test_mock_message", encoding="utf-8") - >>> t1 = datetime.fromtimestamp(1662998400, timezone.utc) - >>> t2 = datetime.fromtimestamp(1662998460, timezone.utc) - >>> d = Datum( - ... keys=["test_key"], - ... value=payload, - ... event_time=t1, - ... watermark=t2, - ... 
) - """ - - __slots__ = ("_keys", "_value", "_event_time", "_watermark") - - _keys: list[str] - _value: bytes - _event_time: datetime - _watermark: datetime - - def __init__( - self, - keys: list[str], - value: bytes, - event_time: datetime, - watermark: datetime, - ): - self._keys = keys or list() - self._value = value or b"" - if not isinstance(event_time, datetime): - raise TypeError(f"Wrong data type: {type(event_time)} for Datum.event_time") - self._event_time = event_time - if not isinstance(watermark, datetime): - raise TypeError(f"Wrong data type: {type(watermark)} for Datum.watermark") - self._watermark = watermark - - def keys(self) -> list[str]: - """Returns the keys of the event""" - return self._keys - - @property - def value(self) -> bytes: - """Returns the value of the event.""" - return self._value - - @property - def event_time(self) -> datetime: - """Returns the event time of the event.""" - return self._event_time - - @property - def watermark(self) -> datetime: - """Returns the watermark of the event.""" - return self._watermark - - -MapCallable = Callable[[list[str], Datum], Messages] -MapAsyncCallable = Callable[[list[str], Datum], Awaitable[Messages]] diff --git a/pynumaflow-old/mapper/async_server.py b/pynumaflow-old/mapper/async_server.py deleted file mode 100644 index 2c479848..00000000 --- a/pynumaflow-old/mapper/async_server.py +++ /dev/null @@ -1,161 +0,0 @@ -import logging -import multiprocessing -import os - - -import grpc -from google.protobuf import empty_pb2 as _empty_pb2 - -from pynumaflow import setup_logging -from pynumaflow._constants import ( - MAX_MESSAGE_SIZE, - MAP_SOCK_PATH, -) -from pynumaflow.mapper import Datum -from pynumaflow.mapper._dtypes import MapAsyncCallable -from pynumaflow.mapper.proto import map_pb2 -from pynumaflow.mapper.proto import map_pb2_grpc -from pynumaflow.types import NumaflowServicerContext -from pynumaflow.info.server import get_sdk_version, write as info_server_write -from pynumaflow.info.types import ServerInfo, Protocol, Language, SERVER_INFO_FILE_PATH - -_LOGGER = setup_logging(__name__) -if os.getenv("PYTHONDEBUG"): - _LOGGER.setLevel(logging.DEBUG) - -_PROCESS_COUNT = multiprocessing.cpu_count() -MAX_THREADS = int(os.getenv("MAX_THREADS", 0)) or (_PROCESS_COUNT * 4) - - -class AsyncMapper(map_pb2_grpc.MapServicer): - """ - Provides an interface to write an Async Mapper - which will be exposed over gRPC. - - Args: - handler: Function callable following the type signature of MapCallable - sock_path: Path to the UNIX Domain Socket - max_message_size: The max message size in bytes the server can receive and send - max_threads: The max number of threads to be spawned; - defaults to number of processors x4 - - Example invocation: - >>> from typing import Iterator - >>> from pynumaflow.mapper import Messages, Message\ - ... Datum, AsyncMapper - ... import aiorun - ... - >>> async def map_handler(key: [str], datum: Datum) -> Messages: - ... val = datum.value - ... _ = datum.event_time - ... _ = datum.watermark - ... messages = Messages(Message(val, keys=keys)) - ... return messages - ... 
- >>> grpc_server = AsyncMapper(handler=map_handler) - >>> aiorun.run(grpc_server.start()) - """ - - def __init__( - self, - handler: MapAsyncCallable, - sock_path=MAP_SOCK_PATH, - max_message_size=MAX_MESSAGE_SIZE, - max_threads=MAX_THREADS, - ): - self.__map_handler: MapAsyncCallable = handler - self.sock_path = f"unix://{sock_path}" - self._max_message_size = max_message_size - self._max_threads = max_threads - self.cleanup_coroutines = [] - # Collection for storing strong references to all running tasks. - # Event loop only keeps a weak reference, which can cause it to - # get lost during execution. - self.background_tasks = set() - - self._server_options = [ - ("grpc.max_send_message_length", self._max_message_size), - ("grpc.max_receive_message_length", self._max_message_size), - ] - - async def MapFn( - self, request: map_pb2.MapRequest, context: NumaflowServicerContext - ) -> map_pb2.MapResponse: - """ - Applies a function to each datum element. - The pascal case function name comes from the proto map_pb2_grpc.py file. - """ - # proto repeated field(keys) is of type google._upb._message.RepeatedScalarContainer - # we need to explicitly convert it to list - try: - res = await self.__invoke_map( - list(request.keys), - Datum( - keys=list(request.keys), - value=request.value, - event_time=request.event_time.ToDatetime(), - watermark=request.watermark.ToDatetime(), - ), - ) - except Exception as e: - context.set_code(grpc.StatusCode.UNKNOWN) - context.set_details(str(e)) - return map_pb2.MapResponse(results=[]) - - return map_pb2.MapResponse(results=res) - - async def __invoke_map(self, keys: list[str], req: Datum): - """ - Invokes the user defined function. - """ - try: - msgs = await self.__map_handler(keys, req) - except Exception as err: - _LOGGER.critical("UDFError, re-raising the error", exc_info=True) - raise err - datums = [] - for msg in msgs: - datums.append(map_pb2.MapResponse.Result(keys=msg.keys, value=msg.value, tags=msg.tags)) - - return datums - - async def IsReady( - self, request: _empty_pb2.Empty, context: NumaflowServicerContext - ) -> map_pb2.ReadyResponse: - """ - IsReady is the heartbeat endpoint for gRPC. - The pascal case function name comes from the proto map_pb2_grpc.py file. - """ - return map_pb2.ReadyResponse(ready=True) - - async def __serve_async(self, server) -> None: - map_pb2_grpc.add_MapServicer_to_server( - AsyncMapper(handler=self.__map_handler), - server, - ) - server.add_insecure_port(self.sock_path) - _LOGGER.info("gRPC Async Map Server listening on: %s", self.sock_path) - await server.start() - serv_info = ServerInfo( - protocol=Protocol.UDS, - language=Language.PYTHON, - version=get_sdk_version(), - ) - info_server_write(server_info=serv_info, info_file=SERVER_INFO_FILE_PATH) - - async def server_graceful_shutdown(): - """ - Shuts down the server with 5 seconds of grace period. During the - grace period, the server won't accept new connections and allow - existing RPCs to continue within the grace period. 
- """ - _LOGGER.info("Starting graceful shutdown...") - await server.stop(5) - - self.cleanup_coroutines.append(server_graceful_shutdown()) - await server.wait_for_termination() - - async def start(self) -> None: - """Starts the Async gRPC mapper on the given UNIX socket.""" - server = grpc.aio.server(options=self._server_options) - await self.__serve_async(server) diff --git a/pynumaflow-old/mapper/multiproc_server.py b/pynumaflow-old/mapper/multiproc_server.py deleted file mode 100644 index d14fde93..00000000 --- a/pynumaflow-old/mapper/multiproc_server.py +++ /dev/null @@ -1,208 +0,0 @@ -import contextlib -import logging -import multiprocessing -import os -import socket -from concurrent import futures -from collections.abc import Iterator - -import grpc -from google.protobuf import empty_pb2 as _empty_pb2 - -from pynumaflow import setup_logging -from pynumaflow._constants import ( - MAX_MESSAGE_SIZE, -) -from pynumaflow._constants import MULTIPROC_MAP_SOCK_ADDR -from pynumaflow.exceptions import SocketError -from pynumaflow.mapper import Datum -from pynumaflow.mapper._dtypes import MapCallable -from pynumaflow.mapper.proto import map_pb2 -from pynumaflow.mapper.proto import map_pb2_grpc -from pynumaflow.types import NumaflowServicerContext -from pynumaflow.info.server import ( - get_sdk_version, - write as info_server_write, - get_metadata_env, -) -from pynumaflow.info.types import ( - ServerInfo, - Protocol, - Language, - SERVER_INFO_FILE_PATH, - METADATA_ENVS, -) - -_LOGGER = setup_logging(__name__) -if os.getenv("PYTHONDEBUG"): - _LOGGER.setLevel(logging.DEBUG) - - -class MultiProcMapper(map_pb2_grpc.MapServicer): - """ - Provides an interface to write a Multi Proc Mapper - which will be exposed over gRPC. - - Args: - handler: Function callable following the type signature of MapCallable - max_message_size: The max message size in bytes the server can receive and send - - Example invocation: - >>> from typing import Iterator - >>> from pynumaflow.mapper import Messages, Message \ - ... Datum, MultiProcMapper - ... - >>> def map_handler(keys: list[str], datum: Datum) -> Messages: - ... val = datum.value - ... _ = datum.event_time - ... _ = datum.watermark - ... messages = Messages(Message(val, keys=keys)) - ... return messages - ... - >>> grpc_server = MultiProcMapper(handler=map_handler) - >>> grpc_server.start() - """ - - __slots__ = ( - "__map_handler", - "_max_message_size", - "_server_options", - "_process_count", - "_threads_per_proc", - ) - - def __init__( - self, - handler: MapCallable, - max_message_size=MAX_MESSAGE_SIZE, - ): - self.__map_handler: MapCallable = handler - self._max_message_size = max_message_size - - self._server_options = [ - ("grpc.max_send_message_length", self._max_message_size), - ("grpc.max_receive_message_length", self._max_message_size), - ("grpc.so_reuseport", 1), - ("grpc.so_reuseaddr", 1), - ] - # Set the number of processes to be spawned to the number of CPUs or - # the value of the env var NUM_CPU_MULTIPROC defined by the user - # Setting the max value to 2 * CPU count - self._process_count = min( - int(os.getenv("NUM_CPU_MULTIPROC", str(os.cpu_count()))), 2 * os.cpu_count() - ) - self._threads_per_proc = int(os.getenv("MAX_THREADS", "4")) - - def MapFn( - self, request: map_pb2.MapRequest, context: NumaflowServicerContext - ) -> map_pb2.MapResponse: - """ - Applies a function to each datum element. - The pascal case function name comes from the proto map_pb2_grpc.py file. 
- """ - # proto repeated field(keys) is of type google._upb._message.RepeatedScalarContainer - # we need to explicitly convert it to list - try: - msgs = self.__map_handler( - list(request.keys), - Datum( - keys=list(request.keys), - value=request.value, - event_time=request.event_time.ToDatetime(), - watermark=request.watermark.ToDatetime(), - ), - ) - except Exception as err: - _LOGGER.critical("UDFError, re-raising the error", exc_info=True) - context.set_code(grpc.StatusCode.UNKNOWN) - context.set_details(str(err)) - return map_pb2.MapResponse(results=[]) - - datums = [] - - for msg in msgs: - datums.append(map_pb2.MapResponse.Result(keys=msg.keys, value=msg.value, tags=msg.tags)) - - return map_pb2.MapResponse(results=datums) - - def IsReady( - self, request: _empty_pb2.Empty, context: NumaflowServicerContext - ) -> map_pb2.ReadyResponse: - """ - IsReady is the heartbeat endpoint for gRPC. - The pascal case function name comes from the proto map_pb2_grpc.py file. - """ - return map_pb2.ReadyResponse(ready=True) - - def _run_server(self, bind_address: str) -> None: - """Start a server in a subprocess.""" - _LOGGER.info( - "Starting new server with num_procs: %s, num_threads/proc: %s", - self._process_count, - self._threads_per_proc, - ) - server = grpc.server( - futures.ThreadPoolExecutor( - max_workers=self._threads_per_proc, - ), - options=self._server_options, - ) - map_pb2_grpc.add_MapServicer_to_server(self, server) - server.add_insecure_port(bind_address) - server.start() - _LOGGER.info("GRPC Multi-Processor Server listening on: %s %d", bind_address, os.getpid()) - server.wait_for_termination() - - @contextlib.contextmanager - def _reserve_port(self, port_num: int) -> Iterator[int]: - """Find and reserve a port for all subprocesses to use.""" - sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) - sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1) - if sock.getsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR) == 0: - raise SocketError("Failed to set SO_REUSEADDR.") - try: - sock.bind(("", port_num)) - yield sock.getsockname()[1] - finally: - sock.close() - - def start(self) -> None: - """ - Start N grpc servers in different processes where N = The number of CPUs or the - value of the env var NUM_CPU_MULTIPROC defined by the user. The max value - is set to 2 * CPU count. - Each server will be bound to a different port, and we will create equal number of - workers to handle each server. - On the client side there will be same number of connections as the number of servers. - """ - workers = [] - server_ports = [] - for _ in range(self._process_count): - # Find a port to bind to for each server, thus sending the port number = 0 - # to the _reserve_port function so that kernel can find and return a free port - with self._reserve_port(0) as port: - bind_address = f"{MULTIPROC_MAP_SOCK_ADDR}:{port}" - _LOGGER.info("Starting server on port: %s", port) - # NOTE: It is imperative that the worker subprocesses be forked before - # any gRPC servers start up. See - # https://github.com/grpc/grpc/issues/16001 for more details. 
- worker = multiprocessing.Process(target=self._run_server, args=(bind_address,)) - worker.start() - workers.append(worker) - server_ports.append(port) - - # Convert the available ports to a comma separated string - ports = ",".join(map(str, server_ports)) - - serv_info = ServerInfo( - protocol=Protocol.TCP, - language=Language.PYTHON, - version=get_sdk_version(), - metadata=get_metadata_env(envs=METADATA_ENVS), - ) - # Add the PORTS metadata using the available ports - serv_info.metadata["SERV_PORTS"] = ports - info_server_write(server_info=serv_info, info_file=SERVER_INFO_FILE_PATH) - - for worker in workers: - worker.join() diff --git a/pynumaflow-old/mapper/proto/__init__.py b/pynumaflow-old/mapper/proto/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/pynumaflow-old/mapper/proto/map.proto b/pynumaflow-old/mapper/proto/map.proto deleted file mode 100644 index a8ab49be..00000000 --- a/pynumaflow-old/mapper/proto/map.proto +++ /dev/null @@ -1,43 +0,0 @@ -syntax = "proto3"; - -import "google/protobuf/empty.proto"; -import "google/protobuf/timestamp.proto"; - -package map.v1; - -service Map { - // MapFn applies a function to each map request element. - rpc MapFn(MapRequest) returns (MapResponse); - - // IsReady is the heartbeat endpoint for gRPC. - rpc IsReady(google.protobuf.Empty) returns (ReadyResponse); -} - -/** - * MapRequest represents a request element. - */ -message MapRequest { - repeated string keys = 1; - bytes value = 2; - google.protobuf.Timestamp event_time = 3; - google.protobuf.Timestamp watermark = 4; -} - -/** - * MapResponse represents a response element. - */ -message MapResponse { - message Result { - repeated string keys = 1; - bytes value = 2; - repeated string tags = 3; - } - repeated Result results = 1; -} - -/** - * ReadyResponse is the health check result. - */ -message ReadyResponse { - bool ready = 1; -} \ No newline at end of file diff --git a/pynumaflow-old/mapper/proto/map_pb2.py b/pynumaflow-old/mapper/proto/map_pb2.py deleted file mode 100644 index ddb812df..00000000 --- a/pynumaflow-old/mapper/proto/map_pb2.py +++ /dev/null @@ -1,38 +0,0 @@ -# -*- coding: utf-8 -*- -# Generated by the protocol buffer compiler. DO NOT EDIT! 
-# source: map.proto -"""Generated protocol buffer code.""" -from google.protobuf import descriptor as _descriptor -from google.protobuf import descriptor_pool as _descriptor_pool -from google.protobuf import symbol_database as _symbol_database -from google.protobuf.internal import builder as _builder - -# @@protoc_insertion_point(imports) - -_sym_db = _symbol_database.Default() - - -from google.protobuf import empty_pb2 as google_dot_protobuf_dot_empty__pb2 -from google.protobuf import timestamp_pb2 as google_dot_protobuf_dot_timestamp__pb2 - - -DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile( - b'\n\tmap.proto\x12\x06map.v1\x1a\x1bgoogle/protobuf/empty.proto\x1a\x1fgoogle/protobuf/timestamp.proto"\x88\x01\n\nMapRequest\x12\x0c\n\x04keys\x18\x01 \x03(\t\x12\r\n\x05value\x18\x02 \x01(\x0c\x12.\n\nevent_time\x18\x03 \x01(\x0b\x32\x1a.google.protobuf.Timestamp\x12-\n\twatermark\x18\x04 \x01(\x0b\x32\x1a.google.protobuf.Timestamp"o\n\x0bMapResponse\x12+\n\x07results\x18\x01 \x03(\x0b\x32\x1a.map.v1.MapResponse.Result\x1a\x33\n\x06Result\x12\x0c\n\x04keys\x18\x01 \x03(\t\x12\r\n\x05value\x18\x02 \x01(\x0c\x12\x0c\n\x04tags\x18\x03 \x03(\t"\x1e\n\rReadyResponse\x12\r\n\x05ready\x18\x01 \x01(\x08\x32q\n\x03Map\x12\x30\n\x05MapFn\x12\x12.map.v1.MapRequest\x1a\x13.map.v1.MapResponse\x12\x38\n\x07IsReady\x12\x16.google.protobuf.Empty\x1a\x15.map.v1.ReadyResponseb\x06proto3' -) - -_globals = globals() -_builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, _globals) -_builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, "map_pb2", _globals) -if _descriptor._USE_C_DESCRIPTORS == False: - DESCRIPTOR._options = None - _globals["_MAPREQUEST"]._serialized_start = 84 - _globals["_MAPREQUEST"]._serialized_end = 220 - _globals["_MAPRESPONSE"]._serialized_start = 222 - _globals["_MAPRESPONSE"]._serialized_end = 333 - _globals["_MAPRESPONSE_RESULT"]._serialized_start = 282 - _globals["_MAPRESPONSE_RESULT"]._serialized_end = 333 - _globals["_READYRESPONSE"]._serialized_start = 335 - _globals["_READYRESPONSE"]._serialized_end = 365 - _globals["_MAP"]._serialized_start = 367 - _globals["_MAP"]._serialized_end = 480 -# @@protoc_insertion_point(module_scope) diff --git a/pynumaflow-old/mapper/proto/map_pb2_grpc.py b/pynumaflow-old/mapper/proto/map_pb2_grpc.py deleted file mode 100644 index da8edc68..00000000 --- a/pynumaflow-old/mapper/proto/map_pb2_grpc.py +++ /dev/null @@ -1,123 +0,0 @@ -# Generated by the gRPC Python protocol compiler plugin. DO NOT EDIT! -"""Client and server classes corresponding to protobuf-defined services.""" -import grpc - -from google.protobuf import empty_pb2 as google_dot_protobuf_dot_empty__pb2 -from . import map_pb2 as map__pb2 - - -class MapStub(object): - """Missing associated documentation comment in .proto file.""" - - def __init__(self, channel): - """Constructor. - - Args: - channel: A grpc.Channel. 
- """ - self.MapFn = channel.unary_unary( - "/map.v1.Map/MapFn", - request_serializer=map__pb2.MapRequest.SerializeToString, - response_deserializer=map__pb2.MapResponse.FromString, - ) - self.IsReady = channel.unary_unary( - "/map.v1.Map/IsReady", - request_serializer=google_dot_protobuf_dot_empty__pb2.Empty.SerializeToString, - response_deserializer=map__pb2.ReadyResponse.FromString, - ) - - -class MapServicer(object): - """Missing associated documentation comment in .proto file.""" - - def MapFn(self, request, context): - """MapFn applies a function to each map request element.""" - context.set_code(grpc.StatusCode.UNIMPLEMENTED) - context.set_details("Method not implemented!") - raise NotImplementedError("Method not implemented!") - - def IsReady(self, request, context): - """IsReady is the heartbeat endpoint for gRPC.""" - context.set_code(grpc.StatusCode.UNIMPLEMENTED) - context.set_details("Method not implemented!") - raise NotImplementedError("Method not implemented!") - - -def add_MapServicer_to_server(servicer, server): - rpc_method_handlers = { - "MapFn": grpc.unary_unary_rpc_method_handler( - servicer.MapFn, - request_deserializer=map__pb2.MapRequest.FromString, - response_serializer=map__pb2.MapResponse.SerializeToString, - ), - "IsReady": grpc.unary_unary_rpc_method_handler( - servicer.IsReady, - request_deserializer=google_dot_protobuf_dot_empty__pb2.Empty.FromString, - response_serializer=map__pb2.ReadyResponse.SerializeToString, - ), - } - generic_handler = grpc.method_handlers_generic_handler("map.v1.Map", rpc_method_handlers) - server.add_generic_rpc_handlers((generic_handler,)) - - -# This class is part of an EXPERIMENTAL API. -class Map(object): - """Missing associated documentation comment in .proto file.""" - - @staticmethod - def MapFn( - request, - target, - options=(), - channel_credentials=None, - call_credentials=None, - insecure=False, - compression=None, - wait_for_ready=None, - timeout=None, - metadata=None, - ): - return grpc.experimental.unary_unary( - request, - target, - "/map.v1.Map/MapFn", - map__pb2.MapRequest.SerializeToString, - map__pb2.MapResponse.FromString, - options, - channel_credentials, - insecure, - call_credentials, - compression, - wait_for_ready, - timeout, - metadata, - ) - - @staticmethod - def IsReady( - request, - target, - options=(), - channel_credentials=None, - call_credentials=None, - insecure=False, - compression=None, - wait_for_ready=None, - timeout=None, - metadata=None, - ): - return grpc.experimental.unary_unary( - request, - target, - "/map.v1.Map/IsReady", - google_dot_protobuf_dot_empty__pb2.Empty.SerializeToString, - map__pb2.ReadyResponse.FromString, - options, - channel_credentials, - insecure, - call_credentials, - compression, - wait_for_ready, - timeout, - metadata, - ) diff --git a/pynumaflow-old/mapper/server.py b/pynumaflow-old/mapper/server.py deleted file mode 100644 index 0ae779ee..00000000 --- a/pynumaflow-old/mapper/server.py +++ /dev/null @@ -1,136 +0,0 @@ -import logging -import multiprocessing -import os -from concurrent.futures import ThreadPoolExecutor - -import grpc -from google.protobuf import empty_pb2 as _empty_pb2 -from pynumaflow.info.server import get_sdk_version, write as info_server_write -from pynumaflow.info.types import ServerInfo, Protocol, Language, SERVER_INFO_FILE_PATH - -from pynumaflow import setup_logging -from pynumaflow._constants import ( - MAX_MESSAGE_SIZE, - MAP_SOCK_PATH, -) -from pynumaflow.mapper import Datum -from pynumaflow.mapper._dtypes import MapCallable -from 
pynumaflow.mapper.proto import map_pb2 -from pynumaflow.mapper.proto import map_pb2_grpc -from pynumaflow.types import NumaflowServicerContext - -_LOGGER = setup_logging(__name__) -if os.getenv("PYTHONDEBUG"): - _LOGGER.setLevel(logging.DEBUG) - - -_PROCESS_COUNT = multiprocessing.cpu_count() -MAX_THREADS = int(os.getenv("MAX_THREADS", 0)) or (_PROCESS_COUNT * 4) - - -class Mapper(map_pb2_grpc.MapServicer): - """ - Provides an interface to write a Mapper - which will be exposed over a Synchronous gRPC server. - - Args: - handler: Function callable following the type signature of MapCallable - max_message_size: The max message size in bytes the server can receive and send - max_threads: The max number of threads to be spawned; - defaults to number of processors x4 - - Example invocation: - >>> from typing import Iterator - >>> from pynumaflow.mapper import Messages, Message\ - ... Datum, Mapper - ... - >>> def map_handler(key: [str], datum: Datum) -> Messages: - ... val = datum.value - ... _ = datum.event_time - ... _ = datum.watermark - ... messages = Messages(Message(val, keys=keys)) - ... return messages - ... - >>> grpc_server = Mapper(handler=map_handler) - >>> grpc_server.start() - """ - - def __init__( - self, - handler: MapCallable, - sock_path=MAP_SOCK_PATH, - max_message_size=MAX_MESSAGE_SIZE, - max_threads=MAX_THREADS, - ): - self.__map_handler: MapCallable = handler - self.sock_path = f"unix://{sock_path}" - self._max_message_size = max_message_size - self._max_threads = max_threads - self.cleanup_coroutines = [] - - self._server_options = [ - ("grpc.max_send_message_length", self._max_message_size), - ("grpc.max_receive_message_length", self._max_message_size), - ] - - def MapFn( - self, request: map_pb2.MapRequest, context: NumaflowServicerContext - ) -> map_pb2.MapResponse: - """ - Applies a function to each datum element. - The pascal case function name comes from the proto map_pb2_grpc.py file. - """ - # proto repeated field(keys) is of type google._upb._message.RepeatedScalarContainer - # we need to explicitly convert it to list - try: - msgs = self.__map_handler( - list(request.keys), - Datum( - keys=list(request.keys), - value=request.value, - event_time=request.event_time.ToDatetime(), - watermark=request.watermark.ToDatetime(), - ), - ) - except Exception as err: - _LOGGER.critical("UDFError, re-raising the error", exc_info=True) - context.set_code(grpc.StatusCode.UNKNOWN) - context.set_details(str(err)) - return map_pb2.MapResponse(results=[]) - - datums = [] - - for msg in msgs: - datums.append(map_pb2.MapResponse.Result(keys=msg.keys, value=msg.value, tags=msg.tags)) - - return map_pb2.MapResponse(results=datums) - - def IsReady( - self, request: _empty_pb2.Empty, context: NumaflowServicerContext - ) -> map_pb2.ReadyResponse: - """ - IsReady is the heartbeat endpoint for gRPC. - The pascal case function name comes from the proto map_pb2_grpc.py file. - """ - return map_pb2.ReadyResponse(ready=True) - - def start(self) -> None: - """ - Starts the gRPC server on the given UNIX socket with given max threads. 
- """ - server = grpc.server( - ThreadPoolExecutor(max_workers=self._max_threads), options=self._server_options - ) - map_pb2_grpc.add_MapServicer_to_server(self, server) - server.add_insecure_port(self.sock_path) - server.start() - serv_info = ServerInfo( - protocol=Protocol.UDS, - language=Language.PYTHON, - version=get_sdk_version(), - ) - info_server_write(server_info=serv_info, info_file=SERVER_INFO_FILE_PATH) - _LOGGER.info( - "GRPC Server listening on: %s with max threads: %s", self.sock_path, self._max_threads - ) - server.wait_for_termination() diff --git a/pynumaflow-old/mapstreamer/__init__.py b/pynumaflow-old/mapstreamer/__init__.py deleted file mode 100644 index 2896a903..00000000 --- a/pynumaflow-old/mapstreamer/__init__.py +++ /dev/null @@ -1,15 +0,0 @@ -from pynumaflow.mapstreamer._dtypes import ( - Message, - Messages, - Datum, - DROP, -) -from pynumaflow.mapstreamer.async_server import AsyncMapStreamer - -__all__ = [ - "Message", - "Messages", - "Datum", - "DROP", - "AsyncMapStreamer", -] diff --git a/pynumaflow-old/mapstreamer/_dtypes.py b/pynumaflow-old/mapstreamer/_dtypes.py deleted file mode 100644 index 27a1fb14..00000000 --- a/pynumaflow-old/mapstreamer/_dtypes.py +++ /dev/null @@ -1,166 +0,0 @@ -from collections.abc import Iterator, Sequence -from dataclasses import dataclass -from datetime import datetime -from typing import TypeVar, Callable -from collections.abc import AsyncIterable -from warnings import warn - -from pynumaflow._constants import DROP - -M = TypeVar("M", bound="Message") -Ms = TypeVar("Ms", bound="Messages") - - -@dataclass(init=False) -class Message: - """ - Basic datatype for data passing to the next vertex/vertices. - - Args: - value: data in bytes - keys: []string keys for vertex (optional) - tags: []string tags for conditional forwarding (optional) - """ - - __slots__ = ("_value", "_keys", "_tags") - - _value: bytes - _keys: list[str] - _tags: list[str] - - def __init__(self, value: bytes, keys: list[str] = None, tags: list[str] = None): - """ - Creates a Message object to send value to a vertex. - """ - self._keys = keys or [] - self._tags = tags or [] - self._value = value or b"" - - # returns the Message Object which will be dropped - @classmethod - def to_drop(cls: type[M]) -> M: - return cls(b"", None, [DROP]) - - @property - def value(self) -> bytes: - return self._value - - @property - def keys(self) -> list[str]: - return self._keys - - @property - def tags(self) -> list[str]: - return self._tags - - -class Messages(Sequence[M]): - """ - Class to define a list of Message objects. - - Args: - messages: list of Message objects. - """ - - __slots__ = ("_messages",) - - def __init__(self, *messages: M): - self._messages = list(messages) or [] - - def __str__(self) -> str: - return str(self._messages) - - def __repr__(self) -> str: - return str(self) - - def __len__(self) -> int: - return len(self._messages) - - def __iter__(self) -> Iterator[M]: - return iter(self._messages) - - def __getitem__(self, index: int) -> M: - if isinstance(index, slice): - raise TypeError("Slicing is not supported for Messages") - return self._messages[index] - - def append(self, message: Message) -> None: - self._messages.append(message) - - def items(self) -> list[Message]: - warn( - "Using items is deprecated and will be removed in v0.5. 
" - "Iterate or index the Messages object instead.", - DeprecationWarning, - stacklevel=2, - ) - return self._messages - - -@dataclass(init=False) -class Datum: - """ - Class to define the important information for the event. - Args: - keys: the keys of the event. - value: the payload of the event. - event_time: the event time of the event. - watermark: the watermark of the event. - >>> # Example usage - >>> from pynumaflow.mapstreamer import Datum - >>> from datetime import datetime, timezone - >>> payload = bytes("test_mock_message", encoding="utf-8") - >>> t1 = datetime.fromtimestamp(1662998400, timezone.utc) - >>> t2 = datetime.fromtimestamp(1662998460, timezone.utc) - >>> d = Datum( - ... keys=["test_key"], - ... value=payload, - ... event_time=t1, - ... watermark=t2, - ... ) - """ - - __slots__ = ("_keys", "_value", "_event_time", "_watermark") - - _keys: list[str] - _value: bytes - _event_time: datetime - _watermark: datetime - - def __init__( - self, - keys: list[str], - value: bytes, - event_time: datetime, - watermark: datetime, - ): - self._keys = keys or list() - self._value = value or b"" - if not isinstance(event_time, datetime): - raise TypeError(f"Wrong data type: {type(event_time)} for Datum.event_time") - self._event_time = event_time - if not isinstance(watermark, datetime): - raise TypeError(f"Wrong data type: {type(watermark)} for Datum.watermark") - self._watermark = watermark - - def keys(self) -> list[str]: - """Returns the keys of the event""" - return self._keys - - @property - def value(self) -> bytes: - """Returns the value of the event.""" - return self._value - - @property - def event_time(self) -> datetime: - """Returns the event time of the event.""" - return self._event_time - - @property - def watermark(self) -> datetime: - """Returns the watermark of the event.""" - return self._watermark - - -MapStreamCallable = Callable[[list[str], Datum], AsyncIterable[Message]] diff --git a/pynumaflow-old/mapstreamer/async_server.py b/pynumaflow-old/mapstreamer/async_server.py deleted file mode 100644 index 284517ee..00000000 --- a/pynumaflow-old/mapstreamer/async_server.py +++ /dev/null @@ -1,151 +0,0 @@ -import logging -import multiprocessing -import os - -from collections.abc import AsyncIterable - -import grpc -from google.protobuf import empty_pb2 as _empty_pb2 - -from pynumaflow import setup_logging -from pynumaflow._constants import ( - MAX_MESSAGE_SIZE, - MAP_STREAM_SOCK_PATH, -) -from pynumaflow.mapstreamer import Datum -from pynumaflow.mapstreamer._dtypes import MapStreamCallable -from pynumaflow.mapstreamer.proto import mapstream_pb2 -from pynumaflow.mapstreamer.proto import mapstream_pb2_grpc -from pynumaflow.types import NumaflowServicerContext -from pynumaflow.info.server import get_sdk_version, write as info_server_write -from pynumaflow.info.types import ServerInfo, Protocol, Language, SERVER_INFO_FILE_PATH - -_LOGGER = setup_logging(__name__) -if os.getenv("PYTHONDEBUG"): - _LOGGER.setLevel(logging.DEBUG) - -_PROCESS_COUNT = multiprocessing.cpu_count() -MAX_THREADS = int(os.getenv("MAX_THREADS", 0)) or (_PROCESS_COUNT * 4) - - -class AsyncMapStreamer(mapstream_pb2_grpc.MapStreamServicer): - """ - Provides an interface to write a Map Streamer - which will be exposed over gRPC. 
-
-    Args:
-        handler: Function callable following the type signature of MapStreamCallable
-        sock_path: Path to the UNIX Domain Socket
-        max_message_size: The max message size in bytes the server can receive and send
-        max_threads: The max number of threads to be spawned;
-                     defaults to number of processors x4
-
-    Example invocation:
-    >>> from typing import Iterator
-    >>> from pynumaflow.mapstreamer import Messages, Message, \
-    ...     Datum, AsyncMapStreamer
-    >>> import aiorun
-    >>> async def map_stream_handler(keys: list[str], datum: Datum) -> AsyncIterable[Message]:
-    ...     val = datum.value
-    ...     _ = datum.event_time
-    ...     _ = datum.watermark
-    ...     for i in range(10):
-    ...         yield Message(val, keys=keys)
-    ...
-    >>> grpc_server = AsyncMapStreamer(handler=map_stream_handler)
-    >>> aiorun.run(grpc_server.start())
-    """
-
-    def __init__(
-        self,
-        handler: MapStreamCallable,
-        sock_path=MAP_STREAM_SOCK_PATH,
-        max_message_size=MAX_MESSAGE_SIZE,
-        max_threads=MAX_THREADS,
-    ):
-        self.__map_stream_handler: MapStreamCallable = handler
-        self.sock_path = f"unix://{sock_path}"
-        self._max_message_size = max_message_size
-        self._max_threads = max_threads
-        self.cleanup_coroutines = []
-        # Collection for storing strong references to all running tasks.
-        # Event loop only keeps a weak reference, which can cause it to
-        # get lost during execution.
-        self.background_tasks = set()
-
-        self._server_options = [
-            ("grpc.max_send_message_length", self._max_message_size),
-            ("grpc.max_receive_message_length", self._max_message_size),
-        ]
-
-    async def MapStreamFn(
-        self,
-        request: mapstream_pb2.MapStreamRequest,
-        context: NumaflowServicerContext,
-    ) -> AsyncIterable[mapstream_pb2.MapStreamResponse]:
-        """
-        Applies a map function to a datum stream in streaming mode.
-        The pascal case function name comes from the proto mapstream_pb2_grpc.py file.
-        """
-
-        async for res in self.__invoke_map_stream(
-            list(request.keys),
-            Datum(
-                keys=list(request.keys),
-                value=request.value,
-                event_time=request.event_time.ToDatetime(),
-                watermark=request.watermark.ToDatetime(),
-            ),
-        ):
-            yield mapstream_pb2.MapStreamResponse(result=res)
-
-    async def __invoke_map_stream(self, keys: list[str], req: Datum):
-        try:
-            async for msg in self.__map_stream_handler(keys, req):
-                yield mapstream_pb2.MapStreamResponse.Result(
-                    keys=msg.keys, value=msg.value, tags=msg.tags
-                )
-        except Exception as err:
-            _LOGGER.critical("UDFError, re-raising the error", exc_info=True)
-            raise err
-
-    async def IsReady(
-        self, request: _empty_pb2.Empty, context: NumaflowServicerContext
-    ) -> mapstream_pb2.ReadyResponse:
-        """
-        IsReady is the heartbeat endpoint for gRPC.
-        The pascal case function name comes from the proto mapstream_pb2_grpc.py file.
-        """
-        return mapstream_pb2.ReadyResponse(ready=True)
-
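A hedged, runnable sketch of the streaming pattern documented above, assuming the `pynumaflow.mapstreamer` exports shown earlier in this patch; the line-splitting handler is illustrative only:

```python
import aiorun

from pynumaflow.mapstreamer import AsyncMapStreamer, Datum, Message


async def split_handler(keys: list[str], datum: Datum):
    # Emit one message per line of the payload; an async generator
    # like this satisfies MapStreamCallable.
    for line in datum.value.splitlines():
        yield Message(line, keys=keys)


if __name__ == "__main__":
    grpc_server = AsyncMapStreamer(handler=split_handler)
    aiorun.run(grpc_server.start())
```

-    async def __serve_async(self, server) -> None:
-        mapstream_pb2_grpc.add_MapStreamServicer_to_server(
-            AsyncMapStreamer(handler=self.__map_stream_handler),
-            server,
-        )
-        server.add_insecure_port(self.sock_path)
-        _LOGGER.info("GRPC Async Server listening on: %s", self.sock_path)
-        await server.start()
-        serv_info = ServerInfo(
-            protocol=Protocol.UDS,
-            language=Language.PYTHON,
-            version=get_sdk_version(),
-        )
-        info_server_write(server_info=serv_info, info_file=SERVER_INFO_FILE_PATH)
-
-        async def server_graceful_shutdown():
-            """
-            Shuts down the server with 5 seconds of grace period. During the
-            grace period, the server won't accept new connections and allow
-            existing RPCs to continue within the grace period.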
- """ - _LOGGER.info("Starting graceful shutdown...") - await server.stop(5) - - self.cleanup_coroutines.append(server_graceful_shutdown()) - await server.wait_for_termination() - - async def start(self) -> None: - """Starts the Async gRPC server on the given UNIX socket.""" - server = grpc.aio.server(options=self._server_options) - await self.__serve_async(server) diff --git a/pynumaflow-old/mapstreamer/proto/__init__.py b/pynumaflow-old/mapstreamer/proto/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/pynumaflow-old/mapstreamer/proto/mapstream.proto b/pynumaflow-old/mapstreamer/proto/mapstream.proto deleted file mode 100644 index 45605169..00000000 --- a/pynumaflow-old/mapstreamer/proto/mapstream.proto +++ /dev/null @@ -1,44 +0,0 @@ -syntax = "proto3"; - -import "google/protobuf/empty.proto"; -import "google/protobuf/timestamp.proto"; - - -package mapstream.v1; - -service MapStream { - // MapStreamFn applies a function to each request element and returns a stream. - rpc MapStreamFn(MapStreamRequest) returns (stream MapStreamResponse); - - // IsReady is the heartbeat endpoint for gRPC. - rpc IsReady(google.protobuf.Empty) returns (ReadyResponse); -} - -/** - * MapStreamRequest represents a request element. - */ -message MapStreamRequest { - repeated string keys = 1; - bytes value = 2; - google.protobuf.Timestamp event_time = 3; - google.protobuf.Timestamp watermark = 4; -} - -/** - * MapStreamResponse represents a response element. - */ -message MapStreamResponse { - message Result { - repeated string keys = 1; - bytes value = 2; - repeated string tags = 3; - } - Result result = 1; -} - -/** - * ReadyResponse is the health check result. - */ -message ReadyResponse { - bool ready = 1; -} \ No newline at end of file diff --git a/pynumaflow-old/mapstreamer/proto/mapstream_pb2.py b/pynumaflow-old/mapstreamer/proto/mapstream_pb2.py deleted file mode 100644 index f1c2c169..00000000 --- a/pynumaflow-old/mapstreamer/proto/mapstream_pb2.py +++ /dev/null @@ -1,38 +0,0 @@ -# -*- coding: utf-8 -*- -# Generated by the protocol buffer compiler. DO NOT EDIT! 
-# source: mapstream.proto -"""Generated protocol buffer code.""" -from google.protobuf import descriptor as _descriptor -from google.protobuf import descriptor_pool as _descriptor_pool -from google.protobuf import symbol_database as _symbol_database -from google.protobuf.internal import builder as _builder - -# @@protoc_insertion_point(imports) - -_sym_db = _symbol_database.Default() - - -from google.protobuf import empty_pb2 as google_dot_protobuf_dot_empty__pb2 -from google.protobuf import timestamp_pb2 as google_dot_protobuf_dot_timestamp__pb2 - - -DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile( - b'\n\x0fmapstream.proto\x12\x0cmapstream.v1\x1a\x1bgoogle/protobuf/empty.proto\x1a\x1fgoogle/protobuf/timestamp.proto"\x8e\x01\n\x10MapStreamRequest\x12\x0c\n\x04keys\x18\x01 \x03(\t\x12\r\n\x05value\x18\x02 \x01(\x0c\x12.\n\nevent_time\x18\x03 \x01(\x0b\x32\x1a.google.protobuf.Timestamp\x12-\n\twatermark\x18\x04 \x01(\x0b\x32\x1a.google.protobuf.Timestamp"\x80\x01\n\x11MapStreamResponse\x12\x36\n\x06result\x18\x01 \x01(\x0b\x32&.mapstream.v1.MapStreamResponse.Result\x1a\x33\n\x06Result\x12\x0c\n\x04keys\x18\x01 \x03(\t\x12\r\n\x05value\x18\x02 \x01(\x0c\x12\x0c\n\x04tags\x18\x03 \x03(\t"\x1e\n\rReadyResponse\x12\r\n\x05ready\x18\x01 \x01(\x08\x32\x9d\x01\n\tMapStream\x12P\n\x0bMapStreamFn\x12\x1e.mapstream.v1.MapStreamRequest\x1a\x1f.mapstream.v1.MapStreamResponse0\x01\x12>\n\x07IsReady\x12\x16.google.protobuf.Empty\x1a\x1b.mapstream.v1.ReadyResponseb\x06proto3' -) - -_globals = globals() -_builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, _globals) -_builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, "mapstream_pb2", _globals) -if _descriptor._USE_C_DESCRIPTORS == False: - DESCRIPTOR._options = None - _globals["_MAPSTREAMREQUEST"]._serialized_start = 96 - _globals["_MAPSTREAMREQUEST"]._serialized_end = 238 - _globals["_MAPSTREAMRESPONSE"]._serialized_start = 241 - _globals["_MAPSTREAMRESPONSE"]._serialized_end = 369 - _globals["_MAPSTREAMRESPONSE_RESULT"]._serialized_start = 318 - _globals["_MAPSTREAMRESPONSE_RESULT"]._serialized_end = 369 - _globals["_READYRESPONSE"]._serialized_start = 371 - _globals["_READYRESPONSE"]._serialized_end = 401 - _globals["_MAPSTREAM"]._serialized_start = 404 - _globals["_MAPSTREAM"]._serialized_end = 561 -# @@protoc_insertion_point(module_scope) diff --git a/pynumaflow-old/mapstreamer/proto/mapstream_pb2_grpc.py b/pynumaflow-old/mapstreamer/proto/mapstream_pb2_grpc.py deleted file mode 100644 index 305c8e05..00000000 --- a/pynumaflow-old/mapstreamer/proto/mapstream_pb2_grpc.py +++ /dev/null @@ -1,125 +0,0 @@ -# Generated by the gRPC Python protocol compiler plugin. DO NOT EDIT! -"""Client and server classes corresponding to protobuf-defined services.""" -import grpc - -from google.protobuf import empty_pb2 as google_dot_protobuf_dot_empty__pb2 -from . import mapstream_pb2 as mapstream__pb2 - - -class MapStreamStub(object): - """Missing associated documentation comment in .proto file.""" - - def __init__(self, channel): - """Constructor. - - Args: - channel: A grpc.Channel. 
- """ - self.MapStreamFn = channel.unary_stream( - "/mapstream.v1.MapStream/MapStreamFn", - request_serializer=mapstream__pb2.MapStreamRequest.SerializeToString, - response_deserializer=mapstream__pb2.MapStreamResponse.FromString, - ) - self.IsReady = channel.unary_unary( - "/mapstream.v1.MapStream/IsReady", - request_serializer=google_dot_protobuf_dot_empty__pb2.Empty.SerializeToString, - response_deserializer=mapstream__pb2.ReadyResponse.FromString, - ) - - -class MapStreamServicer(object): - """Missing associated documentation comment in .proto file.""" - - def MapStreamFn(self, request, context): - """MapStreamFn applies a function to each request element and returns a stream.""" - context.set_code(grpc.StatusCode.UNIMPLEMENTED) - context.set_details("Method not implemented!") - raise NotImplementedError("Method not implemented!") - - def IsReady(self, request, context): - """IsReady is the heartbeat endpoint for gRPC.""" - context.set_code(grpc.StatusCode.UNIMPLEMENTED) - context.set_details("Method not implemented!") - raise NotImplementedError("Method not implemented!") - - -def add_MapStreamServicer_to_server(servicer, server): - rpc_method_handlers = { - "MapStreamFn": grpc.unary_stream_rpc_method_handler( - servicer.MapStreamFn, - request_deserializer=mapstream__pb2.MapStreamRequest.FromString, - response_serializer=mapstream__pb2.MapStreamResponse.SerializeToString, - ), - "IsReady": grpc.unary_unary_rpc_method_handler( - servicer.IsReady, - request_deserializer=google_dot_protobuf_dot_empty__pb2.Empty.FromString, - response_serializer=mapstream__pb2.ReadyResponse.SerializeToString, - ), - } - generic_handler = grpc.method_handlers_generic_handler( - "mapstream.v1.MapStream", rpc_method_handlers - ) - server.add_generic_rpc_handlers((generic_handler,)) - - -# This class is part of an EXPERIMENTAL API. 
-class MapStream(object): - """Missing associated documentation comment in .proto file.""" - - @staticmethod - def MapStreamFn( - request, - target, - options=(), - channel_credentials=None, - call_credentials=None, - insecure=False, - compression=None, - wait_for_ready=None, - timeout=None, - metadata=None, - ): - return grpc.experimental.unary_stream( - request, - target, - "/mapstream.v1.MapStream/MapStreamFn", - mapstream__pb2.MapStreamRequest.SerializeToString, - mapstream__pb2.MapStreamResponse.FromString, - options, - channel_credentials, - insecure, - call_credentials, - compression, - wait_for_ready, - timeout, - metadata, - ) - - @staticmethod - def IsReady( - request, - target, - options=(), - channel_credentials=None, - call_credentials=None, - insecure=False, - compression=None, - wait_for_ready=None, - timeout=None, - metadata=None, - ): - return grpc.experimental.unary_unary( - request, - target, - "/mapstream.v1.MapStream/IsReady", - google_dot_protobuf_dot_empty__pb2.Empty.SerializeToString, - mapstream__pb2.ReadyResponse.FromString, - options, - channel_credentials, - insecure, - call_credentials, - compression, - wait_for_ready, - timeout, - metadata, - ) diff --git a/pynumaflow-old/reducer/__init__.py b/pynumaflow-old/reducer/__init__.py deleted file mode 100644 index 36fe4a9f..00000000 --- a/pynumaflow-old/reducer/__init__.py +++ /dev/null @@ -1,19 +0,0 @@ -from pynumaflow.reducer._dtypes import ( - Message, - Messages, - Datum, - IntervalWindow, - Metadata, - DROP, -) -from pynumaflow.reducer.async_server import AsyncReducer - -__all__ = [ - "Message", - "Messages", - "Datum", - "IntervalWindow", - "Metadata", - "DROP", - "AsyncReducer", -] diff --git a/pynumaflow-old/reducer/_dtypes.py b/pynumaflow-old/reducer/_dtypes.py deleted file mode 100644 index 534d4e28..00000000 --- a/pynumaflow-old/reducer/_dtypes.py +++ /dev/null @@ -1,235 +0,0 @@ -from asyncio import Task -from collections.abc import Iterator, Sequence, Awaitable -from dataclasses import dataclass -from datetime import datetime -from typing import TypeVar, Callable -from collections.abc import AsyncIterable -from warnings import warn - -from pynumaflow.reducer.asynciter import NonBlockingIterator -from pynumaflow._constants import DROP - -M = TypeVar("M", bound="Message") -Ms = TypeVar("Ms", bound="Messages") - - -@dataclass(init=False) -class Message: - """ - Basic datatype for data passing to the next vertex/vertices. - - Args: - value: data in bytes - keys: []string keys for vertex (optional) - tags: []string tags for conditional forwarding (optional) - """ - - __slots__ = ("_value", "_keys", "_tags") - - _value: bytes - _keys: list[str] - _tags: list[str] - - def __init__(self, value: bytes, keys: list[str] = None, tags: list[str] = None): - """ - Creates a Message object to send value to a vertex. - """ - self._keys = keys or [] - self._tags = tags or [] - self._value = value or b"" - - # returns the Message Object which will be dropped - @classmethod - def to_drop(cls: type[M]) -> M: - return cls(b"", None, [DROP]) - - @property - def value(self) -> bytes: - return self._value - - @property - def keys(self) -> list[str]: - return self._keys - - @property - def tags(self) -> list[str]: - return self._tags - - -class Messages(Sequence[M]): - """ - Class to define a list of Message objects. - - Args: - messages: list of Message objects. 
- """ - - __slots__ = ("_messages",) - - def __init__(self, *messages: M): - self._messages = list(messages) or [] - - def __str__(self) -> str: - return str(self._messages) - - def __repr__(self) -> str: - return str(self) - - def __len__(self) -> int: - return len(self._messages) - - def __iter__(self) -> Iterator[M]: - return iter(self._messages) - - def __getitem__(self, index: int) -> M: - if isinstance(index, slice): - raise TypeError("Slicing is not supported for Messages") - return self._messages[index] - - def append(self, message: Message) -> None: - self._messages.append(message) - - def items(self) -> list[Message]: - warn( - "Using items is deprecated and will be removed in v0.5. " - "Iterate or index the Messages object instead.", - DeprecationWarning, - stacklevel=2, - ) - return self._messages - - -@dataclass(init=False) -class Datum: - """ - Class to define the important information for the event. - Args: - keys: the keys of the event. - value: the payload of the event. - event_time: the event time of the event. - watermark: the watermark of the event. - >>> # Example usage - >>> from pynumaflow.reducer import Datum - >>> from datetime import datetime, timezone - >>> payload = bytes("test_mock_message", encoding="utf-8") - >>> t1 = datetime.fromtimestamp(1662998400, timezone.utc) - >>> t2 = datetime.fromtimestamp(1662998460, timezone.utc) - >>> d = Datum( - ... keys=["test_key"], - ... value=payload, - ... event_time=t1, - ... watermark=t2, - ... ) - """ - - __slots__ = ("_keys", "_value", "_event_time", "_watermark") - - _keys: list[str] - _value: bytes - _event_time: datetime - _watermark: datetime - - def __init__( - self, - keys: list[str], - value: bytes, - event_time: datetime, - watermark: datetime, - ): - self._keys = keys or list() - self._value = value or b"" - if not isinstance(event_time, datetime): - raise TypeError(f"Wrong data type: {type(event_time)} for Datum.event_time") - self._event_time = event_time - if not isinstance(watermark, datetime): - raise TypeError(f"Wrong data type: {type(watermark)} for Datum.watermark") - self._watermark = watermark - - def keys(self) -> list[str]: - """Returns the keys of the event""" - return self._keys - - @property - def value(self) -> bytes: - """Returns the value of the event.""" - return self._value - - @property - def event_time(self) -> datetime: - """Returns the event time of the event.""" - return self._event_time - - @property - def watermark(self) -> datetime: - """Returns the watermark of the event.""" - return self._watermark - - -@dataclass(init=False) -class IntervalWindow: - """Defines the start and end of the interval window for the event.""" - - __slots__ = ("_start", "_end") - - _start: datetime - _end: datetime - - def __init__(self, start: datetime, end: datetime): - self._start = start - self._end = end - - @property - def start(self): - """Returns the start point of the interval window.""" - return self._start - - @property - def end(self): - """Returns the end point of the interval window.""" - return self._end - - -@dataclass(init=False) -class Metadata: - """Defines the metadata for the event.""" - - __slots__ = ("_interval_window",) - - _interval_window: IntervalWindow - - def __init__(self, interval_window: IntervalWindow): - self._interval_window = interval_window - - @property - def interval_window(self): - """Returns the interval window for the event.""" - return self._interval_window - - -@dataclass -class ReduceResult: - """Defines the object to hold the result of reduce computation.""" - 
-    __slots__ = ("_future", "_iterator", "_key")
-
-    _future: Task
-    _iterator: NonBlockingIterator
-    _key: list[str]
-
-    @property
-    def future(self):
-        """Returns the future result of computation."""
-        return self._future
-
-    @property
-    def iterator(self):
-        """Returns the handle to the producer queue."""
-        return self._iterator
-
-    @property
-    def keys(self) -> list[str]:
-        """Returns the keys of the partition."""
-        return self._key
-
-
-ReduceCallable = Callable[[list[str], AsyncIterable[Datum], Metadata], Awaitable[Messages]]
diff --git a/pynumaflow-old/reducer/async_server.py b/pynumaflow-old/reducer/async_server.py
deleted file mode 100644
index 90d83e1e..00000000
--- a/pynumaflow-old/reducer/async_server.py
+++ /dev/null
@@ -1,251 +0,0 @@
-import asyncio
-import logging
-import multiprocessing
-import os
-
-from datetime import datetime, timezone
-from collections.abc import AsyncIterable
-
-import grpc
-from google.protobuf import empty_pb2 as _empty_pb2
-
-from pynumaflow import setup_logging
-from pynumaflow._constants import (
-    WIN_START_TIME,
-    WIN_END_TIME,
-    MAX_MESSAGE_SIZE,
-    STREAM_EOF,
-    DELIMITER,
-    REDUCE_SOCK_PATH,
-)
-from pynumaflow.reducer import Datum, IntervalWindow, Metadata
-from pynumaflow.reducer._dtypes import ReduceResult, ReduceCallable
-from pynumaflow.reducer.asynciter import NonBlockingIterator
-from pynumaflow.reducer.proto import reduce_pb2
-from pynumaflow.reducer.proto import reduce_pb2_grpc
-from pynumaflow.types import NumaflowServicerContext
-from pynumaflow.info.server import get_sdk_version, write as info_server_write
-from pynumaflow.info.types import ServerInfo, Protocol, Language, SERVER_INFO_FILE_PATH
-
-_LOGGER = setup_logging(__name__)
-if os.getenv("PYTHONDEBUG"):
-    _LOGGER.setLevel(logging.DEBUG)
-
-_PROCESS_COUNT = multiprocessing.cpu_count()
-MAX_THREADS = int(os.getenv("MAX_THREADS", 0)) or (_PROCESS_COUNT * 4)
-
-
-async def datum_generator(
-    request_iterator: AsyncIterable[reduce_pb2.ReduceRequest],
-) -> AsyncIterable[Datum]:
-    async for d in request_iterator:
-        datum = Datum(
-            keys=list(d.keys),
-            value=d.value,
-            event_time=d.event_time.ToDatetime(),
-            watermark=d.watermark.ToDatetime(),
-        )
-        yield datum
-
-
-class AsyncReducer(reduce_pb2_grpc.ReduceServicer):
-    """
-    Provides an interface to write a Reduce Function
-    which will be exposed over gRPC.
-
-    Args:
-        handler: Function callable following the type signature of ReduceCallable
-        sock_path: Path to the UNIX Domain Socket
-        max_message_size: The max message size in bytes the server can receive and send
-        max_threads: The max number of threads to be spawned;
-                     defaults to number of processors x4
-
-    Example invocation:
-    >>> from typing import Iterator
-    >>> from pynumaflow.reducer import Messages, Message, \
-    ...     Datum, Metadata, AsyncReducer
-    >>> import aiorun
-    ...
-    >>> async def reduce_handler(keys: list[str], datums: AsyncIterable[Datum],
-    ...                          md: Metadata) -> Messages:
-    ...     interval_window = md.interval_window
-    ...     counter = 0
-    ...     async for _ in datums:
-    ...         counter += 1
-    ...     msg = (
-    ...         f"counter:{counter} interval_window_start:{interval_window.start} "
-    ...         f"interval_window_end:{interval_window.end}"
-    ...     )
-    ...     return Messages(Message(value=str.encode(msg), keys=keys))
-    ...
-    >>> grpc_server = AsyncReducer(handler=reduce_handler)
-    >>> aiorun.run(grpc_server.start())
-    """
-
-    def __init__(
-        self,
-        handler: ReduceCallable,
-        sock_path=REDUCE_SOCK_PATH,
-        max_message_size=MAX_MESSAGE_SIZE,
-        max_threads=MAX_THREADS,
-    ):
-        self.__reduce_handler: ReduceCallable = handler
-        self.sock_path = f"unix://{sock_path}"
-        self._max_message_size = max_message_size
-        self._max_threads = max_threads
-        self.cleanup_coroutines = []
-        # Collection for storing strong references to all running tasks.
-        # Event loop only keeps a weak reference, which can cause it to
-        # get lost during execution.
-        self.background_tasks = set()
-
-        self._server_options = [
-            ("grpc.max_send_message_length", self._max_message_size),
-            ("grpc.max_receive_message_length", self._max_message_size),
-        ]
-
-    async def ReduceFn(
-        self,
-        request_iterator: AsyncIterable[reduce_pb2.ReduceRequest],
-        context: NumaflowServicerContext,
-    ) -> reduce_pb2.ReduceResponse:
-        """
-        Applies a reduce function to a datum stream.
-        The pascal case function name comes from the proto reduce_pb2_grpc.py file.
-        """
-
-        start, end = None, None
-        for metadata_key, metadata_value in context.invocation_metadata():
-            if metadata_key == WIN_START_TIME:
-                start = metadata_value
-            elif metadata_key == WIN_END_TIME:
-                end = metadata_value
-        # both window boundaries are required; "or" here would let a single
-        # missing value slip through to int(None) below
-        if not (start and end):
-            context.set_code(grpc.StatusCode.INVALID_ARGUMENT)
-            context.set_details(
-                f"Expected to have all key/window_start_time/window_end_time; "
-                f"got start: {start}, end: {end}."
-            )
-            yield reduce_pb2.ReduceResponse(results=[])
-            return
-
-        start_dt = datetime.fromtimestamp(int(start) / 1e3, timezone.utc)
-        end_dt = datetime.fromtimestamp(int(end) / 1e3, timezone.utc)
-        interval_window = IntervalWindow(start=start_dt, end=end_dt)
-
-        datum_iterator = datum_generator(request_iterator=request_iterator)
-
-        response_task = asyncio.create_task(
-            self.__async_reduce_handler(interval_window, datum_iterator)
-        )
-
-        # Save a reference to the result of this function, to avoid a
-        # task disappearing mid-execution.
-        self.background_tasks.add(response_task)
-        response_task.add_done_callback(lambda t: self.background_tasks.remove(t))
-
-        await response_task
-        results_futures = response_task.result()
-
-        try:
-            for fut in results_futures:
-                await fut
-                yield reduce_pb2.ReduceResponse(results=fut.result())
-        except Exception as e:
-            context.set_code(grpc.StatusCode.UNKNOWN)
-            context.set_details(e.__str__())
-            yield reduce_pb2.ReduceResponse(results=[])
-
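Before the per-key fan-out helper below, a self-contained sketch of the pattern it implements: one queue plus one consumer task per key, closed with an EOF sentinel. The sentinel object and the counting consumer here are stand-ins for illustration, not SDK API:

```python
import asyncio

STREAM_EOF = object()  # stand-in for pynumaflow's STREAM_EOF sentinel


async def count_per_key(key: str, queue: asyncio.Queue) -> str:
    # Consumer: drain the queue until the EOF sentinel arrives.
    counter = 0
    while True:
        item = await queue.get()
        if item is STREAM_EOF:
            break
        counter += 1
    return f"{key}:{counter}"


async def main() -> None:
    # Producer: route each element to its key's queue, creating the
    # queue and its consumer task on first sight of the key.
    queues: dict[str, asyncio.Queue] = {}
    tasks: list[asyncio.Task] = []
    for key in ["a", "b", "a", "a", "b"]:
        if key not in queues:
            queues[key] = asyncio.Queue()
            tasks.append(asyncio.create_task(count_per_key(key, queues[key])))
        await queues[key].put(b"payload")
    for queue in queues.values():
        await queue.put(STREAM_EOF)
    print(await asyncio.gather(*tasks))  # ['a:3', 'b:2']


asyncio.run(main())
```

-    async def __async_reduce_handler(self, interval_window, datum_iterator: AsyncIterable[Datum]):
-        callable_dict = {}
-        # iterate through all the values
-        async for d in datum_iterator:
-            keys = d.keys()
-            unified_key = DELIMITER.join(keys)
-            result = callable_dict.get(unified_key, None)
-
-            if not result:
-                niter = NonBlockingIterator()
-                riter = niter.read_iterator()
-                # schedule an async task for consumer
-                # returns a future that will give the results later.
-                task = asyncio.create_task(
-                    self.__invoke_reduce(keys, riter, Metadata(interval_window=interval_window))
-                )
-                # Save a reference to the result of this function, to avoid a
-                # task disappearing mid-execution.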
-                self.background_tasks.add(task)
-                task.add_done_callback(lambda t: self.background_tasks.remove(t))
-                result = ReduceResult(task, niter, keys)
-
-                callable_dict[unified_key] = result
-
-            await result.iterator.put(d)
-
-        for unified_key in callable_dict:
-            await callable_dict[unified_key].iterator.put(STREAM_EOF)
-
-        tasks = []
-        for unified_key in callable_dict:
-            fut = callable_dict[unified_key].future
-            tasks.append(fut)
-
-        return tasks
-
-    async def __invoke_reduce(
-        self, keys: list[str], request_iterator: AsyncIterable[Datum], md: Metadata
-    ):
-        try:
-            msgs = await self.__reduce_handler(keys, request_iterator, md)
-        except Exception as err:
-            _LOGGER.critical("UDFError, re-raising the error", exc_info=True)
-            raise err
-
-        datum_responses = []
-        for msg in msgs:
-            datum_responses.append(
-                reduce_pb2.ReduceResponse.Result(keys=msg.keys, value=msg.value, tags=msg.tags)
-            )
-
-        return datum_responses
-
-    async def IsReady(
-        self, request: _empty_pb2.Empty, context: NumaflowServicerContext
-    ) -> reduce_pb2.ReadyResponse:
-        """
-        IsReady is the heartbeat endpoint for gRPC.
-        The pascal case function name comes from the proto reduce_pb2_grpc.py file.
-        """
-        return reduce_pb2.ReadyResponse(ready=True)
-
-    async def __serve_async(self, server) -> None:
-        reduce_pb2_grpc.add_ReduceServicer_to_server(
-            AsyncReducer(handler=self.__reduce_handler),
-            server,
-        )
-        server.add_insecure_port(self.sock_path)
-        _LOGGER.info("GRPC Async Server listening on: %s", self.sock_path)
-        await server.start()
-        serv_info = ServerInfo(
-            protocol=Protocol.UDS,
-            language=Language.PYTHON,
-            version=get_sdk_version(),
-        )
-        info_server_write(server_info=serv_info, info_file=SERVER_INFO_FILE_PATH)
-
-        async def server_graceful_shutdown():
-            """
-            Shuts down the server with 5 seconds of grace period. During the
-            grace period, the server won't accept new connections and allow
-            existing RPCs to continue within the grace period.
-            """
-            _LOGGER.info("Starting graceful shutdown...")
-            await server.stop(5)
-
-        self.cleanup_coroutines.append(server_graceful_shutdown())
-        await server.wait_for_termination()
-
-    async def start(self) -> None:
-        """Starts the Async gRPC server on the given UNIX socket."""
-        server = grpc.aio.server(options=self._server_options)
-        await self.__serve_async(server)
diff --git a/pynumaflow-old/reducer/asynciter.py b/pynumaflow-old/reducer/asynciter.py
deleted file mode 100644
index 3ab6135b..00000000
--- a/pynumaflow-old/reducer/asynciter.py
+++ /dev/null
@@ -1,23 +0,0 @@
-import asyncio
-
-from pynumaflow._constants import STREAM_EOF
-
-
-class NonBlockingIterator:
-    """An async iterator backed by a queue"""
-
-    __slots__ = "_queue"
-
-    def __init__(self):
-        self._queue = asyncio.Queue()
-
-    async def read_iterator(self):
-        item = await self._queue.get()
-        while True:
-            if item == STREAM_EOF:
-                break
-            yield item
-            item = await self._queue.get()
-
-    async def put(self, item):
-        await self._queue.put(item)
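A small usage sketch of the `NonBlockingIterator` removed above, showing the producer/consumer handshake with `STREAM_EOF`; the driver script itself is illustrative and assumes the old package layout:

```python
import asyncio

from pynumaflow._constants import STREAM_EOF
from pynumaflow.reducer.asynciter import NonBlockingIterator


async def main() -> None:
    niter = NonBlockingIterator()

    async def produce() -> None:
        for i in range(3):
            await niter.put(i)
        await niter.put(STREAM_EOF)  # unblocks and terminates the reader

    producer = asyncio.create_task(produce())
    async for item in niter.read_iterator():
        print(item)  # 0, 1, 2
    await producer


asyncio.run(main())
```

diff --git a/pynumaflow-old/reducer/proto/__init__.py b/pynumaflow-old/reducer/proto/__init__.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/pynumaflow-old/reducer/proto/reduce.proto b/pynumaflow-old/reducer/proto/reduce.proto
deleted file mode 100644
index 81571e14..00000000
--- a/pynumaflow-old/reducer/proto/reduce.proto
+++ /dev/null
@@ -1,44 +0,0 @@
-syntax = "proto3";
-
-import "google/protobuf/empty.proto";
-import "google/protobuf/timestamp.proto";
-
-
-package reduce.v1;
-
-service Reduce {
-  // ReduceFn applies a reduce function to a request stream.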
- rpc ReduceFn(stream ReduceRequest) returns (stream ReduceResponse); - - // IsReady is the heartbeat endpoint for gRPC. - rpc IsReady(google.protobuf.Empty) returns (ReadyResponse); -} - -/** - * ReduceRequest represents a request element. - */ -message ReduceRequest { - repeated string keys = 1; - bytes value = 2; - google.protobuf.Timestamp event_time = 3; - google.protobuf.Timestamp watermark = 4; -} - -/** - * ReduceResponse represents a response element. - */ -message ReduceResponse { - message Result { - repeated string keys = 1; - bytes value = 2; - repeated string tags = 3; - } - repeated Result results = 1; -} - -/** - * ReadyResponse is the health check result. - */ -message ReadyResponse { - bool ready = 1; -} \ No newline at end of file diff --git a/pynumaflow-old/reducer/proto/reduce_pb2.py b/pynumaflow-old/reducer/proto/reduce_pb2.py deleted file mode 100644 index f61b8887..00000000 --- a/pynumaflow-old/reducer/proto/reduce_pb2.py +++ /dev/null @@ -1,38 +0,0 @@ -# -*- coding: utf-8 -*- -# Generated by the protocol buffer compiler. DO NOT EDIT! -# source: reduce.proto -"""Generated protocol buffer code.""" -from google.protobuf import descriptor as _descriptor -from google.protobuf import descriptor_pool as _descriptor_pool -from google.protobuf import symbol_database as _symbol_database -from google.protobuf.internal import builder as _builder - -# @@protoc_insertion_point(imports) - -_sym_db = _symbol_database.Default() - - -from google.protobuf import empty_pb2 as google_dot_protobuf_dot_empty__pb2 -from google.protobuf import timestamp_pb2 as google_dot_protobuf_dot_timestamp__pb2 - - -DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile( - b'\n\x0creduce.proto\x12\treduce.v1\x1a\x1bgoogle/protobuf/empty.proto\x1a\x1fgoogle/protobuf/timestamp.proto"\x8b\x01\n\rReduceRequest\x12\x0c\n\x04keys\x18\x01 \x03(\t\x12\r\n\x05value\x18\x02 \x01(\x0c\x12.\n\nevent_time\x18\x03 \x01(\x0b\x32\x1a.google.protobuf.Timestamp\x12-\n\twatermark\x18\x04 \x01(\x0b\x32\x1a.google.protobuf.Timestamp"x\n\x0eReduceResponse\x12\x31\n\x07results\x18\x01 \x03(\x0b\x32 .reduce.v1.ReduceResponse.Result\x1a\x33\n\x06Result\x12\x0c\n\x04keys\x18\x01 \x03(\t\x12\r\n\x05value\x18\x02 \x01(\x0c\x12\x0c\n\x04tags\x18\x03 \x03(\t"\x1e\n\rReadyResponse\x12\r\n\x05ready\x18\x01 \x01(\x08\x32\x8a\x01\n\x06Reduce\x12\x43\n\x08ReduceFn\x12\x18.reduce.v1.ReduceRequest\x1a\x19.reduce.v1.ReduceResponse(\x01\x30\x01\x12;\n\x07IsReady\x12\x16.google.protobuf.Empty\x1a\x18.reduce.v1.ReadyResponseb\x06proto3' -) - -_globals = globals() -_builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, _globals) -_builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, "reduce_pb2", _globals) -if _descriptor._USE_C_DESCRIPTORS == False: - DESCRIPTOR._options = None - _globals["_REDUCEREQUEST"]._serialized_start = 90 - _globals["_REDUCEREQUEST"]._serialized_end = 229 - _globals["_REDUCERESPONSE"]._serialized_start = 231 - _globals["_REDUCERESPONSE"]._serialized_end = 351 - _globals["_REDUCERESPONSE_RESULT"]._serialized_start = 300 - _globals["_REDUCERESPONSE_RESULT"]._serialized_end = 351 - _globals["_READYRESPONSE"]._serialized_start = 353 - _globals["_READYRESPONSE"]._serialized_end = 383 - _globals["_REDUCE"]._serialized_start = 386 - _globals["_REDUCE"]._serialized_end = 524 -# @@protoc_insertion_point(module_scope) diff --git a/pynumaflow-old/reducer/proto/reduce_pb2_grpc.py b/pynumaflow-old/reducer/proto/reduce_pb2_grpc.py deleted file mode 100644 index 5a0a15f6..00000000 --- a/pynumaflow-old/reducer/proto/reduce_pb2_grpc.py 
+++ /dev/null @@ -1,123 +0,0 @@ -# Generated by the gRPC Python protocol compiler plugin. DO NOT EDIT! -"""Client and server classes corresponding to protobuf-defined services.""" -import grpc - -from google.protobuf import empty_pb2 as google_dot_protobuf_dot_empty__pb2 -from . import reduce_pb2 as reduce__pb2 - - -class ReduceStub(object): - """Missing associated documentation comment in .proto file.""" - - def __init__(self, channel): - """Constructor. - - Args: - channel: A grpc.Channel. - """ - self.ReduceFn = channel.stream_stream( - "/reduce.v1.Reduce/ReduceFn", - request_serializer=reduce__pb2.ReduceRequest.SerializeToString, - response_deserializer=reduce__pb2.ReduceResponse.FromString, - ) - self.IsReady = channel.unary_unary( - "/reduce.v1.Reduce/IsReady", - request_serializer=google_dot_protobuf_dot_empty__pb2.Empty.SerializeToString, - response_deserializer=reduce__pb2.ReadyResponse.FromString, - ) - - -class ReduceServicer(object): - """Missing associated documentation comment in .proto file.""" - - def ReduceFn(self, request_iterator, context): - """ReduceFn applies a reduce function to a request stream.""" - context.set_code(grpc.StatusCode.UNIMPLEMENTED) - context.set_details("Method not implemented!") - raise NotImplementedError("Method not implemented!") - - def IsReady(self, request, context): - """IsReady is the heartbeat endpoint for gRPC.""" - context.set_code(grpc.StatusCode.UNIMPLEMENTED) - context.set_details("Method not implemented!") - raise NotImplementedError("Method not implemented!") - - -def add_ReduceServicer_to_server(servicer, server): - rpc_method_handlers = { - "ReduceFn": grpc.stream_stream_rpc_method_handler( - servicer.ReduceFn, - request_deserializer=reduce__pb2.ReduceRequest.FromString, - response_serializer=reduce__pb2.ReduceResponse.SerializeToString, - ), - "IsReady": grpc.unary_unary_rpc_method_handler( - servicer.IsReady, - request_deserializer=google_dot_protobuf_dot_empty__pb2.Empty.FromString, - response_serializer=reduce__pb2.ReadyResponse.SerializeToString, - ), - } - generic_handler = grpc.method_handlers_generic_handler("reduce.v1.Reduce", rpc_method_handlers) - server.add_generic_rpc_handlers((generic_handler,)) - - -# This class is part of an EXPERIMENTAL API. 
-class Reduce(object): - """Missing associated documentation comment in .proto file.""" - - @staticmethod - def ReduceFn( - request_iterator, - target, - options=(), - channel_credentials=None, - call_credentials=None, - insecure=False, - compression=None, - wait_for_ready=None, - timeout=None, - metadata=None, - ): - return grpc.experimental.stream_stream( - request_iterator, - target, - "/reduce.v1.Reduce/ReduceFn", - reduce__pb2.ReduceRequest.SerializeToString, - reduce__pb2.ReduceResponse.FromString, - options, - channel_credentials, - insecure, - call_credentials, - compression, - wait_for_ready, - timeout, - metadata, - ) - - @staticmethod - def IsReady( - request, - target, - options=(), - channel_credentials=None, - call_credentials=None, - insecure=False, - compression=None, - wait_for_ready=None, - timeout=None, - metadata=None, - ): - return grpc.experimental.unary_unary( - request, - target, - "/reduce.v1.Reduce/IsReady", - google_dot_protobuf_dot_empty__pb2.Empty.SerializeToString, - reduce__pb2.ReadyResponse.FromString, - options, - channel_credentials, - insecure, - call_credentials, - compression, - wait_for_ready, - timeout, - metadata, - ) diff --git a/pynumaflow-old/sideinput/__init__.py b/pynumaflow-old/sideinput/__init__.py deleted file mode 100644 index 8a3c36f3..00000000 --- a/pynumaflow-old/sideinput/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -from pynumaflow.sideinput._dtypes import Response -from pynumaflow.sideinput.server import SideInput - -__all__ = ["Response", "SideInput"] diff --git a/pynumaflow-old/sideinput/_dtypes.py b/pynumaflow-old/sideinput/_dtypes.py deleted file mode 100644 index 86826578..00000000 --- a/pynumaflow-old/sideinput/_dtypes.py +++ /dev/null @@ -1,38 +0,0 @@ -from dataclasses import dataclass -from typing import TypeVar - -R = TypeVar("R", bound="Response") - - -@dataclass -class Response: - """ - Class to define the important information for the event. - Args: - value: the payload of the event. - no_broadcast: the flag to indicate whether the event should be broadcasted. - >>> # Example usage - >>> Response.broadcast_message(b"hello") - >>> Response.no_broadcast_message() - """ - - __slots__ = ("value", "no_broadcast") - - value: bytes - no_broadcast: bool - - @classmethod - def broadcast_message(cls: type[R], value: bytes) -> R: - """ - Returns a SideInputResponse object with the given value, - and the No broadcast flag set to False. This event will be broadcasted. - """ - return Response(value=value, no_broadcast=False) - - @classmethod - def no_broadcast_message(cls: type[R]) -> R: - """ - Returns a SideInputResponse object with the No broadcast flag set to True. - This event will not be broadcasted. - """ - return Response(value=b"", no_broadcast=True) diff --git a/pynumaflow-old/sideinput/proto/__init__.py b/pynumaflow-old/sideinput/proto/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/pynumaflow-old/sideinput/proto/sideinput.proto b/pynumaflow-old/sideinput/proto/sideinput.proto deleted file mode 100644 index c53f055e..00000000 --- a/pynumaflow-old/sideinput/proto/sideinput.proto +++ /dev/null @@ -1,40 +0,0 @@ -syntax = "proto3"; - -import "google/protobuf/empty.proto"; - -package sideinput.v1; - -// SideInput is the gRPC service for user-defined Side Inputs. -// It is used to propagate changes in the values of the provided Side Inputs -// which allows access to slow updated data or configuration without needing to retrieve -// it during each message processing. 
-// Through this service we should be able to:
-// 1) Invoke retrieval request for a single Side Input parameter, which in turn should
-// check for updates and return its latest value.
-// 2) Provide a health check endpoint to indicate whether the service is ready to be used.
-service SideInput {
-  // RetrieveSideInput is the endpoint to retrieve the latest value of a given Side Input.
-  rpc RetrieveSideInput(google.protobuf.Empty) returns (SideInputResponse);
-
-  // IsReady is the health check endpoint to indicate whether the service is ready to be used.
-  rpc IsReady(google.protobuf.Empty) returns (ReadyResponse);
-}
-
-/**
- * SideInputResponse represents a response to a given side input retrieval request.
- */
-message SideInputResponse {
-  // value represents the latest value of the side input payload
-  bytes value = 1;
-  // noBroadcast indicates whether the side input value should be broadcasted to all
-  // True if value should not be broadcasted
-  // False if value should be broadcasted
-  bool no_broadcast = 2;
-}
-
-/**
- * ReadyResponse is the health check result.
- */
-message ReadyResponse {
-  bool ready = 1;
-}
\ No newline at end of file
diff --git a/pynumaflow-old/sideinput/proto/sideinput_pb2.py b/pynumaflow-old/sideinput/proto/sideinput_pb2.py
deleted file mode 100644
index 8278c1df..00000000
--- a/pynumaflow-old/sideinput/proto/sideinput_pb2.py
+++ /dev/null
@@ -1,33 +0,0 @@
-# -*- coding: utf-8 -*-
-# Generated by the protocol buffer compiler. DO NOT EDIT!
-# source: sideinput.proto
-"""Generated protocol buffer code."""
-from google.protobuf import descriptor as _descriptor
-from google.protobuf import descriptor_pool as _descriptor_pool
-from google.protobuf import symbol_database as _symbol_database
-from google.protobuf.internal import builder as _builder
-
-# @@protoc_insertion_point(imports)
-
-_sym_db = _symbol_database.Default()
-
-
-from google.protobuf import empty_pb2 as google_dot_protobuf_dot_empty__pb2
-
-
-DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(
-    b'\n\x0fsideinput.proto\x12\x0csideinput.v1\x1a\x1bgoogle/protobuf/empty.proto"8\n\x11SideInputResponse\x12\r\n\x05value\x18\x01 \x01(\x0c\x12\x14\n\x0cno_broadcast\x18\x02 \x01(\x08"\x1e\n\rReadyResponse\x12\r\n\x05ready\x18\x01 \x01(\x08\x32\x99\x01\n\tSideInput\x12L\n\x11RetrieveSideInput\x12\x16.google.protobuf.Empty\x1a\x1f.sideinput.v1.SideInputResponse\x12>\n\x07IsReady\x12\x16.google.protobuf.Empty\x1a\x1b.sideinput.v1.ReadyResponseb\x06proto3'
-)
-
-_globals = globals()
-_builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, _globals)
-_builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, "sideinput_pb2", _globals)
-if _descriptor._USE_C_DESCRIPTORS == False:
-    DESCRIPTOR._options = None
-    _globals["_SIDEINPUTRESPONSE"]._serialized_start = 62
-    _globals["_SIDEINPUTRESPONSE"]._serialized_end = 118
-    _globals["_READYRESPONSE"]._serialized_start = 120
-    _globals["_READYRESPONSE"]._serialized_end = 150
-    _globals["_SIDEINPUT"]._serialized_start = 153
-    _globals["_SIDEINPUT"]._serialized_end = 306
-# @@protoc_insertion_point(module_scope)
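As a debugging aid, a sketch of exercising the generated stub directly over the default UNIX domain socket; this assumes a side-input server is already listening on that path and is not an SDK-supported entry point:

```python
import grpc
from google.protobuf import empty_pb2

from pynumaflow.sideinput.proto import sideinput_pb2_grpc

# The socket path mirrors the SDK's SIDE_INPUT_SOCK_PATH constant.
with grpc.insecure_channel("unix:///var/run/numaflow/sideinput.sock") as channel:
    stub = sideinput_pb2_grpc.SideInputStub(channel)
    if stub.IsReady(empty_pb2.Empty()).ready:
        response = stub.RetrieveSideInput(empty_pb2.Empty())
        print(response.value, response.no_broadcast)
```

diff --git a/pynumaflow-old/sideinput/proto/sideinput_pb2_grpc.py b/pynumaflow-old/sideinput/proto/sideinput_pb2_grpc.py
deleted file mode 100644
index 72ea87ed..00000000
--- a/pynumaflow-old/sideinput/proto/sideinput_pb2_grpc.py
+++ /dev/null
@@ -1,149 +0,0 @@
-# Generated by the gRPC Python protocol compiler plugin. DO NOT EDIT!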
-"""Client and server classes corresponding to protobuf-defined services.""" -import grpc - -from google.protobuf import empty_pb2 as google_dot_protobuf_dot_empty__pb2 -from . import sideinput_pb2 as sideinput__pb2 - - -class SideInputStub(object): - """SideInput is the gRPC service for user-defined Side Inputs. - It is used to propagate changes in the values of the provided Side Inputs - which allows access to slow updated data or configuration without needing to retrieve - it during each message processing. - Through this service we should should be able to:- - 1) Invoke retrieval request for a single Side Input parameter, which in turn should - check for updates and return its latest value. - 2) Provide a health check endpoint to indicate whether the service is ready to be used. - """ - - def __init__(self, channel): - """Constructor. - - Args: - channel: A grpc.Channel. - """ - self.RetrieveSideInput = channel.unary_unary( - "/sideinput.v1.SideInput/RetrieveSideInput", - request_serializer=google_dot_protobuf_dot_empty__pb2.Empty.SerializeToString, - response_deserializer=sideinput__pb2.SideInputResponse.FromString, - ) - self.IsReady = channel.unary_unary( - "/sideinput.v1.SideInput/IsReady", - request_serializer=google_dot_protobuf_dot_empty__pb2.Empty.SerializeToString, - response_deserializer=sideinput__pb2.ReadyResponse.FromString, - ) - - -class SideInputServicer(object): - """SideInput is the gRPC service for user-defined Side Inputs. - It is used to propagate changes in the values of the provided Side Inputs - which allows access to slow updated data or configuration without needing to retrieve - it during each message processing. - Through this service we should should be able to:- - 1) Invoke retrieval request for a single Side Input parameter, which in turn should - check for updates and return its latest value. - 2) Provide a health check endpoint to indicate whether the service is ready to be used. - """ - - def RetrieveSideInput(self, request, context): - """RetrieveSideInput is the endpoint to retrieve the latest value of a given Side Input.""" - context.set_code(grpc.StatusCode.UNIMPLEMENTED) - context.set_details("Method not implemented!") - raise NotImplementedError("Method not implemented!") - - def IsReady(self, request, context): - """IsReady is the health check endpoint to indicate whether the service is ready to be used.""" - context.set_code(grpc.StatusCode.UNIMPLEMENTED) - context.set_details("Method not implemented!") - raise NotImplementedError("Method not implemented!") - - -def add_SideInputServicer_to_server(servicer, server): - rpc_method_handlers = { - "RetrieveSideInput": grpc.unary_unary_rpc_method_handler( - servicer.RetrieveSideInput, - request_deserializer=google_dot_protobuf_dot_empty__pb2.Empty.FromString, - response_serializer=sideinput__pb2.SideInputResponse.SerializeToString, - ), - "IsReady": grpc.unary_unary_rpc_method_handler( - servicer.IsReady, - request_deserializer=google_dot_protobuf_dot_empty__pb2.Empty.FromString, - response_serializer=sideinput__pb2.ReadyResponse.SerializeToString, - ), - } - generic_handler = grpc.method_handlers_generic_handler( - "sideinput.v1.SideInput", rpc_method_handlers - ) - server.add_generic_rpc_handlers((generic_handler,)) - - -# This class is part of an EXPERIMENTAL API. -class SideInput(object): - """SideInput is the gRPC service for user-defined Side Inputs. 
-    It is used to propagate changes in the values of the provided Side Inputs
-    which allows access to slow updated data or configuration without needing to retrieve
-    it during each message processing.
-    Through this service we should be able to:
-    1) Invoke retrieval request for a single Side Input parameter, which in turn should
-    check for updates and return its latest value.
-    2) Provide a health check endpoint to indicate whether the service is ready to be used.
-    """
-
-    @staticmethod
-    def RetrieveSideInput(
-        request,
-        target,
-        options=(),
-        channel_credentials=None,
-        call_credentials=None,
-        insecure=False,
-        compression=None,
-        wait_for_ready=None,
-        timeout=None,
-        metadata=None,
-    ):
-        return grpc.experimental.unary_unary(
-            request,
-            target,
-            "/sideinput.v1.SideInput/RetrieveSideInput",
-            google_dot_protobuf_dot_empty__pb2.Empty.SerializeToString,
-            sideinput__pb2.SideInputResponse.FromString,
-            options,
-            channel_credentials,
-            insecure,
-            call_credentials,
-            compression,
-            wait_for_ready,
-            timeout,
-            metadata,
-        )
-
-    @staticmethod
-    def IsReady(
-        request,
-        target,
-        options=(),
-        channel_credentials=None,
-        call_credentials=None,
-        insecure=False,
-        compression=None,
-        wait_for_ready=None,
-        timeout=None,
-        metadata=None,
-    ):
-        return grpc.experimental.unary_unary(
-            request,
-            target,
-            "/sideinput.v1.SideInput/IsReady",
-            google_dot_protobuf_dot_empty__pb2.Empty.SerializeToString,
-            sideinput__pb2.ReadyResponse.FromString,
-            options,
-            channel_credentials,
-            insecure,
-            call_credentials,
-            compression,
-            wait_for_ready,
-            timeout,
-            metadata,
-        )
diff --git a/pynumaflow-old/sideinput/server.py b/pynumaflow-old/sideinput/server.py
deleted file mode 100644
index d786f0d7..00000000
--- a/pynumaflow-old/sideinput/server.py
+++ /dev/null
@@ -1,115 +0,0 @@
-import logging
-import multiprocessing
-import os
-from concurrent.futures import ThreadPoolExecutor
-from typing import Callable
-
-import grpc
-from google.protobuf import empty_pb2 as _empty_pb2
-
-from pynumaflow import setup_logging
-from pynumaflow._constants import (
-    MAX_MESSAGE_SIZE,
-    SIDE_INPUT_SOCK_PATH,
-)
-from pynumaflow.sideinput import Response
-from pynumaflow.sideinput.proto import sideinput_pb2, sideinput_pb2_grpc
-from pynumaflow.types import NumaflowServicerContext
-
-_LOGGER = setup_logging(__name__)
-if os.getenv("PYTHONDEBUG"):
-    _LOGGER.setLevel(logging.DEBUG)
-
-RetrieverCallable = Callable[[], Response]
-_PROCESS_COUNT = multiprocessing.cpu_count()
-MAX_THREADS = int(os.getenv("MAX_THREADS", 0)) or (_PROCESS_COUNT * 4)
-
-
-class SideInput(sideinput_pb2_grpc.SideInputServicer):
-    """
-    Provides an interface to write a User Defined Side Input (UDSideInput)
-    which will be exposed over gRPC.
-
-    Args:
-        handler: Function callable following the type signature of RetrieverCallable
-        sock_path: Path to the UNIX Domain Socket
-        max_message_size: The max message size in bytes the server can receive and send
-        max_threads: The max number of threads to be spawned;
-                     defaults to number of processors x 4
-
-    Example invocation:
-    >>> from typing import List
-    >>> from pynumaflow.sideinput import Response, SideInput
-    >>> def my_handler() -> Response:
-    ...     response = Response.broadcast_message(b"hello")
-    ...     return response
-    >>> grpc_server = SideInput(my_handler)
-    >>> grpc_server.start()
-    """
-
-    SIDE_INPUT_DIR_PATH = "/var/numaflow/side-inputs"
-
-    def __init__(
-        self,
-        handler: RetrieverCallable,
-        sock_path=SIDE_INPUT_SOCK_PATH,
-        max_message_size=MAX_MESSAGE_SIZE,
-        max_threads=MAX_THREADS,
-    ):
-        self.__retrieve_handler: RetrieverCallable = handler
-        self.sock_path = f"unix://{sock_path}"
-        self._max_message_size = max_message_size
-        self._max_threads = max_threads
-        self.cleanup_coroutines = []
-
-        self._server_options = [
-            ("grpc.max_send_message_length", self._max_message_size),
-            ("grpc.max_receive_message_length", self._max_message_size),
-        ]
-
-    def RetrieveSideInput(
-        self, request: _empty_pb2.Empty, context: NumaflowServicerContext
-    ) -> sideinput_pb2.SideInputResponse:
-        """
-        Applies a sideinput function for a retrieval request.
-        The pascal case function name comes from the proto sideinput_pb2_grpc.py file.
-        """
-        # if there is an exception, we will mark all the responses as a failure
-        try:
-            rspn = self.__retrieve_handler()
-        except Exception as err:
-            err_msg = "RetrieveSideInputErr: %r" % err
-            _LOGGER.critical(err_msg, exc_info=True)
-            context.set_code(grpc.StatusCode.UNKNOWN)
-            context.set_details(str(err))
-            return sideinput_pb2.SideInputResponse(value=None, no_broadcast=True)
-
-        return sideinput_pb2.SideInputResponse(value=rspn.value, no_broadcast=rspn.no_broadcast)
-
-    def IsReady(
-        self, request: _empty_pb2.Empty, context: NumaflowServicerContext
-    ) -> sideinput_pb2.ReadyResponse:
-        """
-        IsReady is the heartbeat endpoint for gRPC.
-        The pascal case function name comes from the proto sideinput_pb2_grpc.py file.
-        """
-        return sideinput_pb2.ReadyResponse(ready=True)
-
-    def start(self) -> None:
-        """
-        Starts the gRPC server on the given UNIX socket with given max threads.
-        """
-        server = grpc.server(
-            ThreadPoolExecutor(max_workers=self._max_threads), options=self._server_options
-        )
-        sideinput_pb2_grpc.add_SideInputServicer_to_server(
-            SideInput(self.__retrieve_handler), server
-        )
-        server.add_insecure_port(self.sock_path)
-        server.start()
-        _LOGGER.info(
-            "Side Input gRPC Server listening on: %s with max threads: %s",
-            self.sock_path,
-            self._max_threads,
-        )
-        server.wait_for_termination()
diff --git a/pynumaflow-old/sinker/__init__.py b/pynumaflow-old/sinker/__init__.py
deleted file mode 100644
index c6b5e679..00000000
--- a/pynumaflow-old/sinker/__init__.py
+++ /dev/null
@@ -1,5 +0,0 @@
-from pynumaflow.sinker._dtypes import Response, Responses, Datum
-from pynumaflow.sinker.async_sink import AsyncSinker
-from pynumaflow.sinker.server import Sinker
-
-__all__ = ["Response", "Responses", "Datum", "Sinker", "AsyncSinker"]
diff --git a/pynumaflow-old/sinker/_dtypes.py b/pynumaflow-old/sinker/_dtypes.py
deleted file mode 100644
index 1a020ac7..00000000
--- a/pynumaflow-old/sinker/_dtypes.py
+++ /dev/null
@@ -1,164 +0,0 @@
-from dataclasses import dataclass
-from datetime import datetime
-from typing import TypeVar, Optional, Callable
-from collections.abc import Sequence, Iterator
-from warnings import warn
-
-R = TypeVar("R", bound="Response")
-Rs = TypeVar("Rs", bound="Responses")
-
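Before the `Response` type below, a sketch of how a sink handler might compose these responses; the handler signature is assumed from the `SinkCallable` alias truncated at the end of this hunk, and the iteration over `Datum` objects plus the UTF-8 check are illustrative only:

```python
from pynumaflow.sinker import Datum, Response, Responses


def my_sink_handler(datums) -> Responses:
    # Ack each datum by id; report decode failures instead of raising.
    responses = Responses()
    for datum in datums:
        try:
            datum.value.decode("utf-8")
            responses.append(Response.as_success(datum.id))
        except UnicodeDecodeError as err:
            responses.append(Response.as_failure(datum.id, str(err)))
    return responses
```

-
-@dataclass
-class Response:
-    """
-    Basic datatype for UDSink response.
-
-    Args:
-        id: the id of the event.
-        success: boolean indicating whether the event was successfully processed.
-        err: error message if the event was not successfully processed.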
- """ - - id: str - success: bool - err: Optional[str] - - __slots__ = ("id", "success", "err") - - @classmethod - def as_success(cls: type[R], id_: str) -> R: - return Response(id=id_, success=True, err=None) - - @classmethod - def as_failure(cls: type[R], id_: str, err_msg: str) -> R: - return Response(id=id_, success=False, err=err_msg) - - -class Responses(Sequence[R]): - """ - Container to hold a list of Response instances. - - Args: - responses: list of Response instances. - """ - - __slots__ = ("_responses",) - - def __init__(self, *responses: R): - self._responses = list(responses) or [] - - def __str__(self) -> str: - return str(self._responses) - - def __repr__(self) -> str: - return str(self) - - def __len__(self) -> int: - return len(self._responses) - - def __iter__(self) -> Iterator[R]: - return iter(self._responses) - - def __getitem__(self, index: int) -> R: - if isinstance(index, slice): - raise TypeError("Slicing is not supported for Responses") - return self._responses[index] - - def append(self, response: R) -> None: - self._responses.append(response) - - def items(self) -> list[R]: - warn( - "Using items is deprecated and will be removed in v0.5. " - "Iterate or index the Responses object instead.", - DeprecationWarning, - stacklevel=2, - ) - return self._responses - - -@dataclass(init=False, repr=False) -class Datum: - """ - Class to define the important information for the event. - Args: - keys: the keys of the event. - value: the payload of the event. - event_time: the event time of the event. - watermark: the watermark of the event. - >>> # Example usage - >>> from pynumaflow.sinker import Datum - >>> from datetime import datetime, timezone - >>> payload = bytes("test_mock_message", encoding="utf-8") - >>> t1 = datetime.fromtimestamp(1662998400, timezone.utc) - >>> t2 = datetime.fromtimestamp(1662998460, timezone.utc) - >>> msg_id = "test_id" - >>> output_keys = ["test_key"] - >>> d = Datum(keys=output_keys, sink_msg_id=msg_id, value=payload, event_time=t1, watermark=t2) - """ - - __slots__ = ("_keys", "_id", "_value", "_event_time", "_watermark") - - _keys: list[str] - _id: str - _value: bytes - _event_time: datetime - _watermark: datetime - - def __init__( - self, - keys: list[str], - sink_msg_id: str, - value: bytes, - event_time: datetime, - watermark: datetime, - ): - self._keys = keys - self._id = sink_msg_id or "" - self._value = value or b"" - if not isinstance(event_time, datetime): - raise TypeError(f"Wrong data type: {type(event_time)} for Datum.event_time") - self._event_time = event_time - if not isinstance(watermark, datetime): - raise TypeError(f"Wrong data type: {type(watermark)} for Datum.watermark") - self._watermark = watermark - - def __str__(self): - value_string = self._value.decode("utf-8") - return ( - f"keys: {self._keys}, " - f"id: {self._id}, value: {value_string}, " - f"event_time: {str(self._event_time)}, " - f"watermark: {str(self._watermark)}" - ) - - def __repr__(self): - return str(self) - - @property - def id(self) -> str: - """Returns the id of the event.""" - return self._id - - @property - def keys(self) -> list[str]: - """Returns the keys of the event.""" - return self._keys - - @property - def value(self) -> bytes: - """Returns the value of the event.""" - return self._value - - @property - def event_time(self) -> datetime: - """Returns the event time of the event.""" - return self._event_time - - @property - def watermark(self) -> datetime: - """Returns the watermark of the event.""" - return self._watermark - - -SinkCallable 
= Callable[[Iterator[Datum]], Responses]
diff --git a/pynumaflow-old/sinker/async_sink.py b/pynumaflow-old/sinker/async_sink.py
deleted file mode 100644
index 8333710c..00000000
--- a/pynumaflow-old/sinker/async_sink.py
+++ /dev/null
@@ -1,153 +0,0 @@
-import logging
-import multiprocessing
-import os
-from collections.abc import AsyncIterable
-
-import grpc
-from google.protobuf import empty_pb2 as _empty_pb2
-
-from pynumaflow import setup_logging
-from pynumaflow._constants import (
- SINK_SOCK_PATH,
- MAX_MESSAGE_SIZE,
-)
-from pynumaflow.info.server import get_sdk_version, write as info_server_write
-from pynumaflow.info.types import ServerInfo, Protocol, Language, SERVER_INFO_FILE_PATH
-from pynumaflow.sinker import Responses, Datum, Response
-from pynumaflow.sinker._dtypes import SinkCallable
-from pynumaflow.sinker.proto import sink_pb2_grpc, sink_pb2
-from pynumaflow.types import NumaflowServicerContext
-
-_LOGGER = setup_logging(__name__)
-if os.getenv("PYTHONDEBUG"):
- _LOGGER.setLevel(logging.DEBUG)
-
-_PROCESS_COUNT = multiprocessing.cpu_count()
-MAX_THREADS = int(os.getenv("MAX_THREADS", 0)) or (_PROCESS_COUNT * 4)
-
-
-async def datum_generator(
- request_iterator: AsyncIterable[sink_pb2.SinkRequest],
-) -> AsyncIterable[Datum]:
- async for d in request_iterator:
- datum = Datum(
- keys=list(d.keys),
- sink_msg_id=d.id,
- value=d.value,
- event_time=d.event_time.ToDatetime(),
- watermark=d.watermark.ToDatetime(),
- )
- yield datum
-
-
-class AsyncSinker(sink_pb2_grpc.SinkServicer):
- """
- Provides an interface to write an Async Sinker
- which will be exposed over an asynchronous gRPC server.
-
- Args:
- handler: Function callable following the type signature of SinkCallable
- sock_path: Path to the UNIX Domain Socket
- max_message_size: The max message size in bytes the server can receive and send
- max_threads: The max number of threads to be spawned;
- defaults to number of processors x 4
-
- Example invocation:
- >>> import aiorun
- >>> from pynumaflow.sinker import Datum, Responses, Response, AsyncSinker
- >>> async def my_handler(datums: AsyncIterable[Datum]) -> Responses:
- ... responses = Responses()
- ... async for msg in datums:
- ... responses.append(Response.as_success(msg.id))
- ... return responses
- >>> grpc_server = AsyncSinker(handler=my_handler)
- >>> aiorun.run(grpc_server.start())
- """
-
- def __init__(
- self,
- handler: SinkCallable,
- sock_path=SINK_SOCK_PATH,
- max_message_size=MAX_MESSAGE_SIZE,
- max_threads=MAX_THREADS,
- ):
- self.background_tasks = set()
- self.__sink_handler: SinkCallable = handler
- self.sock_path = f"unix://{sock_path}"
- self._max_message_size = max_message_size
- self._max_threads = max_threads
- self.cleanup_coroutines = []
-
- self._server_options = [
- ("grpc.max_send_message_length", self._max_message_size),
- ("grpc.max_receive_message_length", self._max_message_size),
- ]
-
- async def SinkFn(
- self,
- request_iterator: AsyncIterable[sink_pb2.SinkRequest],
- context: NumaflowServicerContext,
- ) -> sink_pb2.SinkResponse:
- """
- Applies a sink function to a list of datum elements.
- The pascal case function name comes from the proto sink_pb2_grpc.py file.
- """ - # if there is an exception, we will mark all the responses as a failure - datum_iterator = datum_generator(request_iterator=request_iterator) - results = await self.__invoke_sink(datum_iterator) - - return sink_pb2.SinkResponse(results=results) - - async def __invoke_sink(self, datum_iterator: AsyncIterable[Datum]): - try: - rspns = await self.__sink_handler(datum_iterator) - except Exception as err: - err_msg = "UDSinkError: %r" % err - _LOGGER.critical(err_msg, exc_info=True) - rspns = Responses() - async for _datum in datum_iterator: - rspns.append(Response.as_failure(_datum.id, err_msg)) - responses = [] - for rspn in rspns: - responses.append( - sink_pb2.SinkResponse.Result(id=rspn.id, success=rspn.success, err_msg=rspn.err) - ) - return responses - - async def IsReady( - self, request: _empty_pb2.Empty, context: NumaflowServicerContext - ) -> sink_pb2.ReadyResponse: - """ - IsReady is the heartbeat endpoint for gRPC. - The pascal case function name comes from the proto sink_pb2_grpc.py file. - """ - return sink_pb2.ReadyResponse(ready=True) - - async def __serve_async(self, server) -> None: - sink_pb2_grpc.add_SinkServicer_to_server(AsyncSinker(self.__sink_handler), server) - server.add_insecure_port(self.sock_path) - _LOGGER.info("GRPC Async Server listening on: %s", self.sock_path) - await server.start() - serv_info = ServerInfo( - protocol=Protocol.UDS, - language=Language.PYTHON, - version=get_sdk_version(), - ) - info_server_write(server_info=serv_info, info_file=SERVER_INFO_FILE_PATH) - - async def server_graceful_shutdown(): - _LOGGER.info("Starting graceful shutdown...") - """ - Shuts down the server with 5 seconds of grace period. During the - grace period, the server won't accept new connections and allow - existing RPCs to continue within the grace period. - await server.stop(5) - """ - - self.cleanup_coroutines.append(server_graceful_shutdown()) - await server.wait_for_termination() - - async def start(self) -> None: - """Starts the Async gRPC server on the given UNIX socket.""" - server = grpc.aio.server(options=self._server_options) - await self.__serve_async(server) diff --git a/pynumaflow-old/sinker/proto/__init__.py b/pynumaflow-old/sinker/proto/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/pynumaflow-old/sinker/proto/sink.proto b/pynumaflow-old/sinker/proto/sink.proto deleted file mode 100644 index 1c97077a..00000000 --- a/pynumaflow-old/sinker/proto/sink.proto +++ /dev/null @@ -1,50 +0,0 @@ -syntax = "proto3"; - -option go_package = "github.com/numaproj/numaflow-go/pkg/apis/proto/sink/v1"; - -import "google/protobuf/empty.proto"; -import "google/protobuf/timestamp.proto"; - - -package sink.v1; - -service Sink { - // SinkFn writes the request to a user defined sink. - rpc SinkFn(stream SinkRequest) returns (SinkResponse); - - // IsReady is the heartbeat endpoint for gRPC. - rpc IsReady(google.protobuf.Empty) returns (ReadyResponse); -} - -/** - * SinkRequest represents a request element. - */ -message SinkRequest { - repeated string keys = 1; - bytes value = 2; - google.protobuf.Timestamp event_time = 3; - google.protobuf.Timestamp watermark = 4; - string id = 5; -} - -/** - * ReadyResponse is the health check result. - */ -message ReadyResponse { - bool ready = 1; -} - -/** - * SinkResponse is the individual response of each message written to the sink. - */ -message SinkResponse { - message Result { - // id is the ID of the message, can be used to uniquely identify the message. 
- string id = 1; - // success denotes the status of persisting to disk. if set to false, it means writing to sink for the message failed. - bool success = 2; - // err_msg is the error message, set it if success is set to false. - string err_msg = 3; - } - repeated Result results = 1; -} \ No newline at end of file diff --git a/pynumaflow-old/sinker/proto/sink_pb2.py b/pynumaflow-old/sinker/proto/sink_pb2.py deleted file mode 100644 index b6182a45..00000000 --- a/pynumaflow-old/sinker/proto/sink_pb2.py +++ /dev/null @@ -1,39 +0,0 @@ -# -*- coding: utf-8 -*- -# Generated by the protocol buffer compiler. DO NOT EDIT! -# source: sink.proto -"""Generated protocol buffer code.""" -from google.protobuf import descriptor as _descriptor -from google.protobuf import descriptor_pool as _descriptor_pool -from google.protobuf import symbol_database as _symbol_database -from google.protobuf.internal import builder as _builder - -# @@protoc_insertion_point(imports) - -_sym_db = _symbol_database.Default() - - -from google.protobuf import empty_pb2 as google_dot_protobuf_dot_empty__pb2 -from google.protobuf import timestamp_pb2 as google_dot_protobuf_dot_timestamp__pb2 - - -DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile( - b'\n\nsink.proto\x12\x07sink.v1\x1a\x1bgoogle/protobuf/empty.proto\x1a\x1fgoogle/protobuf/timestamp.proto"\x95\x01\n\x0bSinkRequest\x12\x0c\n\x04keys\x18\x01 \x03(\t\x12\r\n\x05value\x18\x02 \x01(\x0c\x12.\n\nevent_time\x18\x03 \x01(\x0b\x32\x1a.google.protobuf.Timestamp\x12-\n\twatermark\x18\x04 \x01(\x0b\x32\x1a.google.protobuf.Timestamp\x12\n\n\x02id\x18\x05 \x01(\t"\x1e\n\rReadyResponse\x12\r\n\x05ready\x18\x01 \x01(\x08"u\n\x0cSinkResponse\x12-\n\x07results\x18\x01 \x03(\x0b\x32\x1c.sink.v1.SinkResponse.Result\x1a\x36\n\x06Result\x12\n\n\x02id\x18\x01 \x01(\t\x12\x0f\n\x07success\x18\x02 \x01(\x08\x12\x0f\n\x07\x65rr_msg\x18\x03 \x01(\t2z\n\x04Sink\x12\x37\n\x06SinkFn\x12\x14.sink.v1.SinkRequest\x1a\x15.sink.v1.SinkResponse(\x01\x12\x39\n\x07IsReady\x12\x16.google.protobuf.Empty\x1a\x16.sink.v1.ReadyResponseB8Z6github.com/numaproj/numaflow-go/pkg/apis/proto/sink/v1b\x06proto3' -) - -_globals = globals() -_builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, _globals) -_builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, "sink_pb2", _globals) -if _descriptor._USE_C_DESCRIPTORS == False: - DESCRIPTOR._options = None - DESCRIPTOR._serialized_options = b"Z6github.com/numaproj/numaflow-go/pkg/apis/proto/sink/v1" - _globals["_SINKREQUEST"]._serialized_start = 86 - _globals["_SINKREQUEST"]._serialized_end = 235 - _globals["_READYRESPONSE"]._serialized_start = 237 - _globals["_READYRESPONSE"]._serialized_end = 267 - _globals["_SINKRESPONSE"]._serialized_start = 269 - _globals["_SINKRESPONSE"]._serialized_end = 386 - _globals["_SINKRESPONSE_RESULT"]._serialized_start = 332 - _globals["_SINKRESPONSE_RESULT"]._serialized_end = 386 - _globals["_SINK"]._serialized_start = 388 - _globals["_SINK"]._serialized_end = 510 -# @@protoc_insertion_point(module_scope) diff --git a/pynumaflow-old/sinker/proto/sink_pb2_grpc.py b/pynumaflow-old/sinker/proto/sink_pb2_grpc.py deleted file mode 100644 index ef673e9d..00000000 --- a/pynumaflow-old/sinker/proto/sink_pb2_grpc.py +++ /dev/null @@ -1,123 +0,0 @@ -# Generated by the gRPC Python protocol compiler plugin. DO NOT EDIT! -"""Client and server classes corresponding to protobuf-defined services.""" -import grpc - -from google.protobuf import empty_pb2 as google_dot_protobuf_dot_empty__pb2 -from . 
import sink_pb2 as sink__pb2 - - -class SinkStub(object): - """Missing associated documentation comment in .proto file.""" - - def __init__(self, channel): - """Constructor. - - Args: - channel: A grpc.Channel. - """ - self.SinkFn = channel.stream_unary( - "/sink.v1.Sink/SinkFn", - request_serializer=sink__pb2.SinkRequest.SerializeToString, - response_deserializer=sink__pb2.SinkResponse.FromString, - ) - self.IsReady = channel.unary_unary( - "/sink.v1.Sink/IsReady", - request_serializer=google_dot_protobuf_dot_empty__pb2.Empty.SerializeToString, - response_deserializer=sink__pb2.ReadyResponse.FromString, - ) - - -class SinkServicer(object): - """Missing associated documentation comment in .proto file.""" - - def SinkFn(self, request_iterator, context): - """SinkFn writes the request to a user defined sink.""" - context.set_code(grpc.StatusCode.UNIMPLEMENTED) - context.set_details("Method not implemented!") - raise NotImplementedError("Method not implemented!") - - def IsReady(self, request, context): - """IsReady is the heartbeat endpoint for gRPC.""" - context.set_code(grpc.StatusCode.UNIMPLEMENTED) - context.set_details("Method not implemented!") - raise NotImplementedError("Method not implemented!") - - -def add_SinkServicer_to_server(servicer, server): - rpc_method_handlers = { - "SinkFn": grpc.stream_unary_rpc_method_handler( - servicer.SinkFn, - request_deserializer=sink__pb2.SinkRequest.FromString, - response_serializer=sink__pb2.SinkResponse.SerializeToString, - ), - "IsReady": grpc.unary_unary_rpc_method_handler( - servicer.IsReady, - request_deserializer=google_dot_protobuf_dot_empty__pb2.Empty.FromString, - response_serializer=sink__pb2.ReadyResponse.SerializeToString, - ), - } - generic_handler = grpc.method_handlers_generic_handler("sink.v1.Sink", rpc_method_handlers) - server.add_generic_rpc_handlers((generic_handler,)) - - -# This class is part of an EXPERIMENTAL API. 
-class Sink(object): - """Missing associated documentation comment in .proto file.""" - - @staticmethod - def SinkFn( - request_iterator, - target, - options=(), - channel_credentials=None, - call_credentials=None, - insecure=False, - compression=None, - wait_for_ready=None, - timeout=None, - metadata=None, - ): - return grpc.experimental.stream_unary( - request_iterator, - target, - "/sink.v1.Sink/SinkFn", - sink__pb2.SinkRequest.SerializeToString, - sink__pb2.SinkResponse.FromString, - options, - channel_credentials, - insecure, - call_credentials, - compression, - wait_for_ready, - timeout, - metadata, - ) - - @staticmethod - def IsReady( - request, - target, - options=(), - channel_credentials=None, - call_credentials=None, - insecure=False, - compression=None, - wait_for_ready=None, - timeout=None, - metadata=None, - ): - return grpc.experimental.unary_unary( - request, - target, - "/sink.v1.Sink/IsReady", - google_dot_protobuf_dot_empty__pb2.Empty.SerializeToString, - sink__pb2.ReadyResponse.FromString, - options, - channel_credentials, - insecure, - call_credentials, - compression, - wait_for_ready, - timeout, - metadata, - ) diff --git a/pynumaflow-old/sinker/server.py b/pynumaflow-old/sinker/server.py deleted file mode 100644 index 195cee10..00000000 --- a/pynumaflow-old/sinker/server.py +++ /dev/null @@ -1,139 +0,0 @@ -import logging -import multiprocessing -import os -from concurrent.futures import ThreadPoolExecutor -from collections.abc import Iterator, Iterable - -import grpc -from google.protobuf import empty_pb2 as _empty_pb2 - -from pynumaflow import setup_logging -from pynumaflow._constants import ( - SINK_SOCK_PATH, - MAX_MESSAGE_SIZE, -) -from pynumaflow.info.server import get_sdk_version, write as info_server_write -from pynumaflow.info.types import ServerInfo, Protocol, Language, SERVER_INFO_FILE_PATH -from pynumaflow.sinker import Responses, Datum, Response -from pynumaflow.sinker._dtypes import SinkCallable -from pynumaflow.sinker.proto import sink_pb2_grpc, sink_pb2 -from pynumaflow.types import NumaflowServicerContext - -_LOGGER = setup_logging(__name__) -if os.getenv("PYTHONDEBUG"): - _LOGGER.setLevel(logging.DEBUG) - - -_PROCESS_COUNT = multiprocessing.cpu_count() -MAX_THREADS = int(os.getenv("MAX_THREADS", 0)) or (_PROCESS_COUNT * 4) - - -def datum_generator(request_iterator: Iterable[sink_pb2.SinkRequest]) -> Iterable[Datum]: - for d in request_iterator: - datum = Datum( - keys=list(d.keys), - sink_msg_id=d.id, - value=d.value, - event_time=d.event_time.ToDatetime(), - watermark=d.watermark.ToDatetime(), - ) - yield datum - - -class Sinker(sink_pb2_grpc.SinkServicer): - """ - Provides an interface to write a Sinker - which will be exposed over gRPC. - - Args: - handler: Function callable following the type signature of SinkCallable - sock_path: Path to the UNIX Domain Socket - max_message_size: The max message size in bytes the server can receive and send - max_threads: The max number of threads to be spawned; - defaults to number of processors x 4 - - Example invocation: - >>> from typing import List - >>> from pynumaflow.sinker import Datum, Responses, Response, Sinker - >>> def my_handler(datums: Iterator[Datum]) -> Responses: - ... responses = Responses() - ... for msg in datums: - ... responses.append(Response.as_success(msg.id)) - ... 
return responses - >>> grpc_server = Sinker(handler=my_handler) - >>> grpc_server.start() - """ - - def __init__( - self, - handler: SinkCallable, - sock_path=SINK_SOCK_PATH, - max_message_size=MAX_MESSAGE_SIZE, - max_threads=MAX_THREADS, - ): - self.__sink_handler: SinkCallable = handler - self.sock_path = f"unix://{sock_path}" - self._max_message_size = max_message_size - self._max_threads = max_threads - - self._server_options = [ - ("grpc.max_send_message_length", self._max_message_size), - ("grpc.max_receive_message_length", self._max_message_size), - ] - - def SinkFn( - self, request_iterator: Iterator[sink_pb2.SinkRequest], context: NumaflowServicerContext - ) -> sink_pb2.SinkResponse: - """ - Applies a sink function to a list of datum elements. - The pascal case function name comes from the proto sink_pb2_grpc.py file. - """ - # if there is an exception, we will mark all the responses as a failure - datum_iterator = datum_generator(request_iterator) - try: - rspns = self.__sink_handler(datum_iterator) - except Exception as err: - err_msg = "UDSinkError: %r" % err - _LOGGER.critical(err_msg, exc_info=True) - rspns = Responses() - for _datum in datum_iterator: - rspns.append(Response.as_failure(_datum.id, err_msg)) - - responses = [] - for rspn in rspns: - responses.append( - sink_pb2.SinkResponse.Result(id=rspn.id, success=rspn.success, err_msg=rspn.err) - ) - - return sink_pb2.SinkResponse(results=responses) - - def IsReady( - self, request: _empty_pb2.Empty, context: NumaflowServicerContext - ) -> sink_pb2.ReadyResponse: - """ - IsReady is the heartbeat endpoint for gRPC. - The pascal case function name comes from the proto sink_pb2_grpc.py file. - """ - return sink_pb2.ReadyResponse(ready=True) - - def start(self) -> None: - """ - Starts the gRPC server on the given UNIX socket with given max threads. 
- """ - server = grpc.server( - ThreadPoolExecutor(max_workers=self._max_threads), options=self._server_options - ) - sink_pb2_grpc.add_SinkServicer_to_server(Sinker(self.__sink_handler), server) - server.add_insecure_port(self.sock_path) - server.start() - serv_info = ServerInfo( - protocol=Protocol.UDS, - language=Language.PYTHON, - version=get_sdk_version(), - ) - info_server_write(server_info=serv_info, info_file=SERVER_INFO_FILE_PATH) - - _LOGGER.info( - "GRPC Server listening on: %s with max threads: %s", self.sock_path, self._max_threads - ) - server.wait_for_termination() diff --git a/pynumaflow-old/sourcer/__init__.py b/pynumaflow-old/sourcer/__init__.py deleted file mode 100644 index f846b5f7..00000000 --- a/pynumaflow-old/sourcer/__init__.py +++ /dev/null @@ -1,23 +0,0 @@ -from pynumaflow.sourcer._dtypes import ( - Message, - ReadRequest, - PendingResponse, - AckRequest, - Offset, - PartitionsResponse, - get_default_partitions, -) -from pynumaflow.sourcer.async_server import AsyncSourcer -from pynumaflow.sourcer.server import Sourcer - -__all__ = [ - "Message", - "ReadRequest", - "PendingResponse", - "AckRequest", - "Offset", - "AsyncSourcer", - "Sourcer", - "PartitionsResponse", - "get_default_partitions", -] diff --git a/pynumaflow-old/sourcer/_dtypes.py b/pynumaflow-old/sourcer/_dtypes.py deleted file mode 100644 index 8e042b28..00000000 --- a/pynumaflow-old/sourcer/_dtypes.py +++ /dev/null @@ -1,217 +0,0 @@ -import os -from collections.abc import Iterable -from dataclasses import dataclass -from datetime import datetime -from typing import Callable -from collections.abc import AsyncIterable - - -@dataclass(init=False) -class Offset: - """ - Args: - offset: the offset of the datum. - partition_id: partition_id indicates which partition of the source the datum belongs to. - """ - - __slots__ = ("_offset", "_partition_id") - - _offset: bytes - _partition_id: int - - def __init__(self, offset: bytes, partition_id: int): - self._offset = offset - self._partition_id = partition_id - - @classmethod - def offset_with_default_partition_id(cls, offset: bytes): - """ - Returns an Offset object with the given offset and default partition id. - """ - return Offset(offset=offset, partition_id=get_default_partitions()[0]) - - @property - def as_dict(self): - return {"offset": self._offset, "partition_id": self._partition_id} - - @property - def offset(self) -> bytes: - return self._offset - - @property - def partition_id(self) -> int: - return self._partition_id - - -@dataclass(init=False) -class Message: - """ - Basic datatype for data passing to the next vertex/vertices. - - Args: - payload: data in bytes - offset: the offset of the datum. - event_time: event time of the message, usually extracted from the payload. - keys: []string keys for vertex (optional) - """ - - __slots__ = ("_payload", "_offset", "_event_time", "_keys") - - _payload: bytes - _offset: Offset - _event_time: datetime - _keys: list[str] - - def __init__( - self, payload: bytes, offset: Offset, event_time: datetime, keys: list[str] = None - ): - """ - Creates a Message object to send value to a vertex. 
- """ - self._payload = payload - self._offset = offset - self._event_time = event_time - self._keys = keys or [] - - @property - def payload(self) -> bytes: - return self._payload - - @property - def keys(self) -> list[str]: - return self._keys - - @property - def offset(self) -> Offset: - return self._offset - - @property - def event_time(self) -> datetime: - return self._event_time - - -@dataclass(init=False) -class ReadRequest: - """ - Class to define the request for reading datum stream from user defined source. - Args: - num_records: the number of records to read. - timeout_in_ms: the request timeout in milliseconds. - >>> # Example usage - >>> from pynumaflow.sourcer import ReadRequest - >>> read_request = ReadRequest(num_records=10, timeout_in_ms=1000) - """ - - __slots__ = ("_num_records", "_timeout_in_ms") - - _num_records: int - _timeout_in_ms: int - - def __init__( - self, - num_records: int, - timeout_in_ms: int, - ): - if not isinstance(num_records, int): - raise TypeError(f"Wrong data type: {type(num_records)} for ReadRequest.num_records") - self._num_records = num_records - if not isinstance(timeout_in_ms, int): - raise TypeError(f"Wrong data type: {type(timeout_in_ms)} for ReadRequest.timeout_in_ms") - self._timeout_in_ms = timeout_in_ms - - @property - def num_records(self) -> int: - """Returns the num_records of the request""" - return self._num_records - - @property - def timeout_in_ms(self) -> int: - """Returns the timeout_in_ms of the request.""" - return self._timeout_in_ms - - -@dataclass(init=False) -class AckRequest: - """ - Class for defining the request for acknowledging datum. - It takes a list of offsets that need to be acknowledged. - Args: - offsets: the offsets to be acknowledged. - >>> # Example usage - >>> from pynumaflow.sourcer import AckRequest, Offset - >>> offset = Offset(offset=b"123", partition_id="0") - >>> ack_request = AckRequest(offsets=[offset, offset]) - """ - - __slots__ = ("_offsets",) - _offsets: list[Offset] - - def __init__(self, offsets: list[Offset]): - self._offsets = offsets - - @property - def offset(self) -> list[Offset]: - """Returns the offsets to be acknowledged.""" - return self._offsets - - -@dataclass(init=False) -class PendingResponse: - """ - PendingResponse is the response for the pending request. - It indicates the number of pending records at the user defined source. - A negative count indicates that the pending information is not available. - Args: - count: the number of pending records. - """ - - __slots__ = ("_count",) - _count: int - - def __init__(self, count: int): - if not isinstance(count, int): - raise TypeError(f"Wrong data type: {type(count)} for Pending.count") - self._count = count - - @property - def count(self) -> int: - """Returns the count of pending records""" - return self._count - - -@dataclass(init=False) -class PartitionsResponse: - """ - PartitionsResponse is the response for the partition request. - It indicates the number of partitions at the user defined source. - A negative count indicates that the partition information is not available. - Args: - count: the number of partitions. 
- """ - - __slots__ = ("_partitions",) - _partitions: list[int] - - def __init__(self, partitions: list[int]): - if not isinstance(partitions, list): - raise TypeError(f"Wrong data type: {type(partitions)} for Partition.partitions") - self._partitions = partitions - - @property - def partitions(self) -> list[int]: - """Returns the list of partitions""" - return self._partitions - - -# Create default partition id from the environment variable "NUMAFLOW_REPLICA" -DefaultPartitionId = int(os.getenv("NUMAFLOW_REPLICA", "0")) -SourceReadCallable = Callable[[ReadRequest], Iterable[Message]] -AsyncSourceReadCallable = Callable[[ReadRequest], AsyncIterable[Message]] -SourceAckCallable = Callable[[AckRequest], None] - - -def get_default_partitions() -> list[int]: - """ - Returns the default partition ids. - """ - return [DefaultPartitionId] diff --git a/pynumaflow-old/sourcer/async_server.py b/pynumaflow-old/sourcer/async_server.py deleted file mode 100644 index 411349a8..00000000 --- a/pynumaflow-old/sourcer/async_server.py +++ /dev/null @@ -1,237 +0,0 @@ -import logging -import multiprocessing -import os - -from collections.abc import AsyncIterable -from google.protobuf import timestamp_pb2 as _timestamp_pb2 -import grpc -from google.protobuf import empty_pb2 as _empty_pb2 - -from pynumaflow import setup_logging -from pynumaflow._constants import ( - MAX_MESSAGE_SIZE, - SOURCE_SOCK_PATH, -) -from pynumaflow.sourcer import ReadRequest -from pynumaflow.sourcer._dtypes import AsyncSourceReadCallable, Offset, AckRequest -from pynumaflow.sourcer.proto import source_pb2 -from pynumaflow.sourcer.proto import source_pb2_grpc -from pynumaflow.types import NumaflowServicerContext -from pynumaflow.info.server import get_sdk_version, write as info_server_write -from pynumaflow.info.types import ServerInfo, Protocol, Language, SERVER_INFO_FILE_PATH - -_LOGGER = setup_logging(__name__) -if os.getenv("PYTHONDEBUG"): - _LOGGER.setLevel(logging.DEBUG) - -_PROCESS_COUNT = multiprocessing.cpu_count() -MAX_THREADS = int(os.getenv("MAX_THREADS", "4")) - - -class AsyncSourcer(source_pb2_grpc.SourceServicer): - """ - Provides an interface to write an Asynchronous Sourcer - which will be exposed over gRPC. - - Args: - read_handler: Function callable following the type signature of AsyncSourceReadCallable - ack_handler: Function handler for AckFn - pending_handler: Function handler for PendingFn - partitions_handler: Function handler for PartitionsFn - - sock_path: Path to the UNIX Domain Socket - max_message_size: The max message size in bytes the server can receive and send - max_threads: The max number of threads to be spawned; - defaults to number of processors x4 - - Example invocation: - >>> from typing import Iterator - >>> from pynumaflow.sourcer import Message, get_default_partitions \ - ... ReadRequest, AsyncSourcer, - ... import aiorun - ... async def read_handler(datum: ReadRequest) -> AsyncIterable[Message]: - ... payload = b"payload:test_mock_message" - ... keys = ["test_key"] - ... offset = mock_offset() - ... event_time = mock_event_time() - ... for i in range(10): - ... yield Message(payload=payload, keys=keys, offset=offset, event_time=event_time) - ... async def ack_handler(ack_request: AckRequest): - ... return - ... async def pending_handler() -> PendingResponse: - ... PendingResponse(count=10) - ... async def partitions_handler() -> PartitionsResponse: - ... return PartitionsResponse(partitions=get_default_partitions()) - >>> grpc_server = AsyncSourcer(read_handler=read_handler, - ... 
ack_handler=ack_handler, - ... pending_handler=pending_handler, - ... partitions_handler=partitions_handler) - >>> aiorun.run(grpc_server.start()) - """ - - def __init__( - self, - read_handler: AsyncSourceReadCallable, - ack_handler, - pending_handler, - partitions_handler, - sock_path=SOURCE_SOCK_PATH, - max_message_size=MAX_MESSAGE_SIZE, - max_threads=MAX_THREADS, - ): - self.__source_read_handler: AsyncSourceReadCallable = read_handler - self.__source_ack_handler = ack_handler - self.__source_pending_handler = pending_handler - self.__source_partitions_handler = partitions_handler - self.sock_path = f"unix://{sock_path}" - self._max_message_size = max_message_size - self._max_threads = max_threads - self.cleanup_coroutines = [] - # Collection for storing strong references to all running tasks. - # Event loop only keeps a weak reference, which can cause it to - # get lost during execution. - self.background_tasks = set() - - self._server_options = [ - ("grpc.max_send_message_length", self._max_message_size), - ("grpc.max_receive_message_length", self._max_message_size), - ] - - async def ReadFn( - self, - request: source_pb2.ReadRequest, - context: NumaflowServicerContext, - ) -> AsyncIterable[source_pb2.ReadResponse]: - """ - Applies a Read function and returns a stream of datum responses. - The pascal case function name comes from the proto source_pb2_grpc.py file. - """ - - async for res in self.__invoke_source_read_stream( - ReadRequest( - num_records=request.request.num_records, - timeout_in_ms=request.request.timeout_in_ms, - ) - ): - yield source_pb2.ReadResponse(result=res) - - async def __invoke_source_read_stream(self, req: ReadRequest): - try: - async for msg in self.__source_read_handler(req): - event_time_timestamp = _timestamp_pb2.Timestamp() - event_time_timestamp.FromDatetime(dt=msg.event_time) - yield source_pb2.ReadResponse.Result( - payload=msg.payload, - keys=msg.keys, - offset=msg.offset.as_dict, - event_time=event_time_timestamp, - ) - except Exception as err: - _LOGGER.critical("User-Defined Source ReadError ", exc_info=True) - raise err - - async def AckFn( - self, request: source_pb2.AckRequest, context: NumaflowServicerContext - ) -> source_pb2.AckResponse: - """ - Applies an Ack function in User Defined Source - """ - # proto repeated field(offsets) is of type google._upb._message.RepeatedScalarContainer - # we need to explicitly convert it to list - offsets = [] - for offset in request.request.offsets: - offsets.append(Offset(offset.offset, offset.partition_id)) - try: - await self.__invoke_ack(ack_req=offsets) - except Exception as e: - context.set_code(grpc.StatusCode.UNKNOWN) - context.set_details(str(e)) - raise e - - return source_pb2.AckResponse() - - async def __invoke_ack(self, ack_req: list[Offset]): - """ - Invokes the Source Ack Function. - """ - try: - await self.__source_ack_handler(AckRequest(offsets=ack_req)) - except Exception as err: - _LOGGER.critical("AckFn Error", exc_info=True) - raise err - return source_pb2.AckResponse.Result() - - async def IsReady( - self, request: _empty_pb2.Empty, context: NumaflowServicerContext - ) -> source_pb2.ReadyResponse: - """ - IsReady is the heartbeat endpoint for gRPC. - The pascal case function name comes from the proto source_pb2_grpc.py file. 
- """ - return source_pb2.ReadyResponse(ready=True) - - async def PendingFn( - self, request: _empty_pb2.Empty, context: NumaflowServicerContext - ) -> source_pb2.PendingResponse: - """ - PendingFn returns the number of pending records - at the user defined source. - """ - try: - count = await self.__source_pending_handler() - except Exception as err: - _LOGGER.critical("PendingFn Error", exc_info=True) - raise err - resp = source_pb2.PendingResponse.Result(count=count.count) - return source_pb2.PendingResponse(result=resp) - - async def PartitionsFn( - self, request: _empty_pb2.Empty, context: NumaflowServicerContext - ) -> source_pb2.PartitionsResponse: - """ - PartitionsFn returns the partitions of the user defined source. - """ - try: - partitions = await self.__source_partitions_handler() - except Exception as err: - _LOGGER.critical("PartitionsFn Error", exc_info=True) - raise err - resp = source_pb2.PartitionsResponse.Result(partitions=partitions.partitions) - return source_pb2.PartitionsResponse(result=resp) - - async def __serve_async(self, server) -> None: - source_pb2_grpc.add_SourceServicer_to_server( - AsyncSourcer( - read_handler=self.__source_read_handler, - ack_handler=self.__source_ack_handler, - pending_handler=self.__source_pending_handler, - partitions_handler=self.__source_partitions_handler, - ), - server, - ) - server.add_insecure_port(self.sock_path) - _LOGGER.info("GRPC Async Server listening on: %s", self.sock_path) - await server.start() - serv_info = ServerInfo( - protocol=Protocol.UDS, - language=Language.PYTHON, - version=get_sdk_version(), - ) - info_server_write(server_info=serv_info, info_file=SERVER_INFO_FILE_PATH) - - async def server_graceful_shutdown(): - """ - Shuts down the server with 5 seconds of grace period. During the - grace period, the server won't accept new connections and allow - existing RPCs to continue within the grace period. - """ - _LOGGER.info("Starting graceful shutdown...") - await server.stop(5) - - self.cleanup_coroutines.append(server_graceful_shutdown()) - await server.wait_for_termination() - - async def start(self) -> None: - """Starts the Async gRPC server on the given UNIX socket.""" - server = grpc.aio.server(options=self._server_options) - await self.__serve_async(server) diff --git a/pynumaflow-old/sourcer/proto/__init__.py b/pynumaflow-old/sourcer/proto/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/pynumaflow-old/sourcer/proto/source.proto b/pynumaflow-old/sourcer/proto/source.proto deleted file mode 100644 index 87c6ff72..00000000 --- a/pynumaflow-old/sourcer/proto/source.proto +++ /dev/null @@ -1,148 +0,0 @@ -syntax = "proto3"; -import "google/protobuf/timestamp.proto"; -import "google/protobuf/empty.proto"; - -package source.v1; - -service Source { - // Read returns a stream of datum responses. - // The size of the returned ReadResponse is less than or equal to the num_records specified in ReadRequest. - // If the request timeout is reached on server side, the returned ReadResponse will contain all the datum that have been read (which could be an empty list). - rpc ReadFn(ReadRequest) returns (stream ReadResponse); - - // AckFn acknowledges a list of datum offsets. - // When AckFn is called, it implicitly indicates that the datum stream has been processed by the source vertex. - // The caller (numa) expects the AckFn to be successful, and it does not expect any errors. 
- // If there are some irrecoverable errors when the callee (UDSource) is processing the AckFn request, - // then it is best to crash because there are no other retry mechanisms possible. - rpc AckFn(AckRequest) returns (AckResponse); - - // PendingFn returns the number of pending records at the user defined source. - rpc PendingFn(google.protobuf.Empty) returns (PendingResponse); - - // PartitionsFn returns the list of partitions for the user defined source. - rpc PartitionsFn(google.protobuf.Empty) returns (PartitionsResponse); - - // IsReady is the heartbeat endpoint for user defined source gRPC. - rpc IsReady(google.protobuf.Empty) returns (ReadyResponse); -} - -/* - * ReadRequest is the request for reading datum stream from user defined source. - */ -message ReadRequest { - message Request { - // Required field indicating the number of records to read. - uint64 num_records = 1; - // Required field indicating the request timeout in milliseconds. - // uint32 can represent 2^32 milliseconds, which is about 49 days. - // We don't use uint64 because time.Duration takes int64 as nano seconds. Using uint64 for milli will cause overflow. - uint32 timeout_in_ms = 2; - } - // Required field indicating the request. - Request request = 1; -} - -/* - * ReadResponse is the response for reading datum stream from user defined source. - */ -message ReadResponse { - message Result { - // Required field holding the payload of the datum. - bytes payload = 1; - // Required field indicating the offset information of the datum. - Offset offset = 2; - // Required field representing the time associated with each datum. It is used for watermarking. - google.protobuf.Timestamp event_time = 3; - // Optional list of keys associated with the datum. - // Key is the "key" attribute in (key,value) as in the map-reduce paradigm. - // We add this optional field to support the use case where the user defined source can provide keys for the datum. - // e.g. Kafka and Redis Stream message usually include information about the keys. - repeated string keys = 4; - } - // Required field holding the result. - Result result = 1; -} - -/* - * AckRequest is the request for acknowledging datum. - * It takes a list of offsets to be acknowledged. - */ -message AckRequest { - message Request { - // Required field holding a list of offsets to be acknowledged. - // The offsets must be strictly corresponding to the previously read batch, - // meaning the offsets must be in the same order as the datum responses in the ReadResponse. - // By enforcing ordering, we can save deserialization effort on the server side, assuming the server keeps a local copy of the raw/un-serialized offsets. - repeated Offset offsets = 1; - } - // Required field holding the request. The list will be ordered and will have the same order as the original Read response. - Request request = 1; -} - -/* - * AckResponse is the response for acknowledging datum. It contains one empty field confirming - * the batch of offsets that have been successfully acknowledged. The contract between client and server - * is that the server will only return the AckResponse if the ack request is successful. - * If the server hangs during the ack request, the client can decide to timeout and error out the data forwarder. - * The reason why we define such contract is that we always expect the server to be able to process the ack request. - * Client is expected to send the AckRequest to the server with offsets that are strictly - * corresponding to the previously read batch. 
If the client sends the AckRequest with offsets that are not, - * it is considered as a client error and the server will not return the AckResponse. - */ -message AckResponse { - message Result { - // Required field indicating the ack request is successful. - google.protobuf.Empty success = 1; - } - // Required field holding the result. - Result result = 1; -} - -/* - * ReadyResponse is the health check result for user defined source. - */ -message ReadyResponse { - // Required field holding the health check result. - bool ready = 1; -} - -/* - * PendingResponse is the response for the pending request. - */ -message PendingResponse { - message Result { - // Required field holding the number of pending records at the user defined source. - // A negative count indicates that the pending information is not available. - int64 count = 1; - } - // Required field holding the result. - Result result = 1; -} - -/* - * PartitionsResponse is the response for the partitions request. - */ -message PartitionsResponse { - message Result { - // Required field holding the list of partitions. - repeated int32 partitions = 1; - } - // Required field holding the result. - Result result = 1; -} - -/* - * Offset is the offset of the datum. - */ -message Offset { - // offset is the offset of the datum. This field is required. - // We define Offset as a byte array because different input data sources can have different representations for Offset. - // The only way to generalize it is to define it as a byte array, - // Such that we can let the UDSource to de-serialize the offset using its own interpretation logics. - bytes offset = 1; - // Optional partition_id indicates which partition of the source the datum belongs to. - // It is useful for sources that have multiple partitions. e.g. Kafka. - // If the partition_id is not specified, it is assumed that the source has a single partition. - int32 partition_id = 2; -} \ No newline at end of file diff --git a/pynumaflow-old/sourcer/proto/source_pb2.py b/pynumaflow-old/sourcer/proto/source_pb2.py deleted file mode 100644 index 73c282e1..00000000 --- a/pynumaflow-old/sourcer/proto/source_pb2.py +++ /dev/null @@ -1,58 +0,0 @@ -# -*- coding: utf-8 -*- -# Generated by the protocol buffer compiler. DO NOT EDIT! 
-# source: source.proto -"""Generated protocol buffer code.""" -from google.protobuf import descriptor as _descriptor -from google.protobuf import descriptor_pool as _descriptor_pool -from google.protobuf import symbol_database as _symbol_database -from google.protobuf.internal import builder as _builder - -# @@protoc_insertion_point(imports) - -_sym_db = _symbol_database.Default() - - -from google.protobuf import timestamp_pb2 as google_dot_protobuf_dot_timestamp__pb2 -from google.protobuf import empty_pb2 as google_dot_protobuf_dot_empty__pb2 - - -DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile( - b'\n\x0csource.proto\x12\tsource.v1\x1a\x1fgoogle/protobuf/timestamp.proto\x1a\x1bgoogle/protobuf/empty.proto"u\n\x0bReadRequest\x12/\n\x07request\x18\x01 \x01(\x0b\x32\x1e.source.v1.ReadRequest.Request\x1a\x35\n\x07Request\x12\x13\n\x0bnum_records\x18\x01 \x01(\x04\x12\x15\n\rtimeout_in_ms\x18\x02 \x01(\r"\xba\x01\n\x0cReadResponse\x12.\n\x06result\x18\x01 \x01(\x0b\x32\x1e.source.v1.ReadResponse.Result\x1az\n\x06Result\x12\x0f\n\x07payload\x18\x01 \x01(\x0c\x12!\n\x06offset\x18\x02 \x01(\x0b\x32\x11.source.v1.Offset\x12.\n\nevent_time\x18\x03 \x01(\x0b\x32\x1a.google.protobuf.Timestamp\x12\x0c\n\x04keys\x18\x04 \x03(\t"k\n\nAckRequest\x12.\n\x07request\x18\x01 \x01(\x0b\x32\x1d.source.v1.AckRequest.Request\x1a-\n\x07Request\x12"\n\x07offsets\x18\x01 \x03(\x0b\x32\x11.source.v1.Offset"o\n\x0b\x41\x63kResponse\x12-\n\x06result\x18\x01 \x01(\x0b\x32\x1d.source.v1.AckResponse.Result\x1a\x31\n\x06Result\x12\'\n\x07success\x18\x01 \x01(\x0b\x32\x16.google.protobuf.Empty"\x1e\n\rReadyResponse\x12\r\n\x05ready\x18\x01 \x01(\x08"]\n\x0fPendingResponse\x12\x31\n\x06result\x18\x01 \x01(\x0b\x32!.source.v1.PendingResponse.Result\x1a\x17\n\x06Result\x12\r\n\x05\x63ount\x18\x01 \x01(\x03"h\n\x12PartitionsResponse\x12\x34\n\x06result\x18\x01 \x01(\x0b\x32$.source.v1.PartitionsResponse.Result\x1a\x1c\n\x06Result\x12\x12\n\npartitions\x18\x01 \x03(\x05".\n\x06Offset\x12\x0e\n\x06offset\x18\x01 \x01(\x0c\x12\x14\n\x0cpartition_id\x18\x02 \x01(\x05\x32\xc2\x02\n\x06Source\x12;\n\x06ReadFn\x12\x16.source.v1.ReadRequest\x1a\x17.source.v1.ReadResponse0\x01\x12\x36\n\x05\x41\x63kFn\x12\x15.source.v1.AckRequest\x1a\x16.source.v1.AckResponse\x12?\n\tPendingFn\x12\x16.google.protobuf.Empty\x1a\x1a.source.v1.PendingResponse\x12\x45\n\x0cPartitionsFn\x12\x16.google.protobuf.Empty\x1a\x1d.source.v1.PartitionsResponse\x12;\n\x07IsReady\x12\x16.google.protobuf.Empty\x1a\x18.source.v1.ReadyResponseb\x06proto3' -) - -_globals = globals() -_builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, _globals) -_builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, "source_pb2", _globals) -if _descriptor._USE_C_DESCRIPTORS == False: - DESCRIPTOR._options = None - _globals["_READREQUEST"]._serialized_start = 89 - _globals["_READREQUEST"]._serialized_end = 206 - _globals["_READREQUEST_REQUEST"]._serialized_start = 153 - _globals["_READREQUEST_REQUEST"]._serialized_end = 206 - _globals["_READRESPONSE"]._serialized_start = 209 - _globals["_READRESPONSE"]._serialized_end = 395 - _globals["_READRESPONSE_RESULT"]._serialized_start = 273 - _globals["_READRESPONSE_RESULT"]._serialized_end = 395 - _globals["_ACKREQUEST"]._serialized_start = 397 - _globals["_ACKREQUEST"]._serialized_end = 504 - _globals["_ACKREQUEST_REQUEST"]._serialized_start = 459 - _globals["_ACKREQUEST_REQUEST"]._serialized_end = 504 - _globals["_ACKRESPONSE"]._serialized_start = 506 - _globals["_ACKRESPONSE"]._serialized_end = 617 - 
_globals["_ACKRESPONSE_RESULT"]._serialized_start = 568 - _globals["_ACKRESPONSE_RESULT"]._serialized_end = 617 - _globals["_READYRESPONSE"]._serialized_start = 619 - _globals["_READYRESPONSE"]._serialized_end = 649 - _globals["_PENDINGRESPONSE"]._serialized_start = 651 - _globals["_PENDINGRESPONSE"]._serialized_end = 744 - _globals["_PENDINGRESPONSE_RESULT"]._serialized_start = 721 - _globals["_PENDINGRESPONSE_RESULT"]._serialized_end = 744 - _globals["_PARTITIONSRESPONSE"]._serialized_start = 746 - _globals["_PARTITIONSRESPONSE"]._serialized_end = 850 - _globals["_PARTITIONSRESPONSE_RESULT"]._serialized_start = 822 - _globals["_PARTITIONSRESPONSE_RESULT"]._serialized_end = 850 - _globals["_OFFSET"]._serialized_start = 852 - _globals["_OFFSET"]._serialized_end = 898 - _globals["_SOURCE"]._serialized_start = 901 - _globals["_SOURCE"]._serialized_end = 1223 -# @@protoc_insertion_point(module_scope) diff --git a/pynumaflow-old/sourcer/proto/source_pb2_grpc.py b/pynumaflow-old/sourcer/proto/source_pb2_grpc.py deleted file mode 100644 index 3a132eea..00000000 --- a/pynumaflow-old/sourcer/proto/source_pb2_grpc.py +++ /dev/null @@ -1,266 +0,0 @@ -# Generated by the gRPC Python protocol compiler plugin. DO NOT EDIT! -"""Client and server classes corresponding to protobuf-defined services.""" -import grpc - -from google.protobuf import empty_pb2 as google_dot_protobuf_dot_empty__pb2 -from . import source_pb2 as source__pb2 - - -class SourceStub(object): - """Missing associated documentation comment in .proto file.""" - - def __init__(self, channel): - """Constructor. - - Args: - channel: A grpc.Channel. - """ - self.ReadFn = channel.unary_stream( - "/source.v1.Source/ReadFn", - request_serializer=source__pb2.ReadRequest.SerializeToString, - response_deserializer=source__pb2.ReadResponse.FromString, - ) - self.AckFn = channel.unary_unary( - "/source.v1.Source/AckFn", - request_serializer=source__pb2.AckRequest.SerializeToString, - response_deserializer=source__pb2.AckResponse.FromString, - ) - self.PendingFn = channel.unary_unary( - "/source.v1.Source/PendingFn", - request_serializer=google_dot_protobuf_dot_empty__pb2.Empty.SerializeToString, - response_deserializer=source__pb2.PendingResponse.FromString, - ) - self.PartitionsFn = channel.unary_unary( - "/source.v1.Source/PartitionsFn", - request_serializer=google_dot_protobuf_dot_empty__pb2.Empty.SerializeToString, - response_deserializer=source__pb2.PartitionsResponse.FromString, - ) - self.IsReady = channel.unary_unary( - "/source.v1.Source/IsReady", - request_serializer=google_dot_protobuf_dot_empty__pb2.Empty.SerializeToString, - response_deserializer=source__pb2.ReadyResponse.FromString, - ) - - -class SourceServicer(object): - """Missing associated documentation comment in .proto file.""" - - def ReadFn(self, request, context): - """Read returns a stream of datum responses. - The size of the returned ReadResponse is less than or equal to the num_records specified in ReadRequest. - If the request timeout is reached on server side, the returned ReadResponse will contain all the datum that have been read (which could be an empty list). - """ - context.set_code(grpc.StatusCode.UNIMPLEMENTED) - context.set_details("Method not implemented!") - raise NotImplementedError("Method not implemented!") - - def AckFn(self, request, context): - """AckFn acknowledges a list of datum offsets. - When AckFn is called, it implicitly indicates that the datum stream has been processed by the source vertex. 
- The caller (numa) expects the AckFn to be successful, and it does not expect any errors. - If there are some irrecoverable errors when the callee (UDSource) is processing the AckFn request, - then it is best to crash because there are no other retry mechanisms possible. - """ - context.set_code(grpc.StatusCode.UNIMPLEMENTED) - context.set_details("Method not implemented!") - raise NotImplementedError("Method not implemented!") - - def PendingFn(self, request, context): - """PendingFn returns the number of pending records at the user defined source.""" - context.set_code(grpc.StatusCode.UNIMPLEMENTED) - context.set_details("Method not implemented!") - raise NotImplementedError("Method not implemented!") - - def PartitionsFn(self, request, context): - """PartitionsFn returns the list of partitions for the user defined source.""" - context.set_code(grpc.StatusCode.UNIMPLEMENTED) - context.set_details("Method not implemented!") - raise NotImplementedError("Method not implemented!") - - def IsReady(self, request, context): - """IsReady is the heartbeat endpoint for user defined source gRPC.""" - context.set_code(grpc.StatusCode.UNIMPLEMENTED) - context.set_details("Method not implemented!") - raise NotImplementedError("Method not implemented!") - - -def add_SourceServicer_to_server(servicer, server): - rpc_method_handlers = { - "ReadFn": grpc.unary_stream_rpc_method_handler( - servicer.ReadFn, - request_deserializer=source__pb2.ReadRequest.FromString, - response_serializer=source__pb2.ReadResponse.SerializeToString, - ), - "AckFn": grpc.unary_unary_rpc_method_handler( - servicer.AckFn, - request_deserializer=source__pb2.AckRequest.FromString, - response_serializer=source__pb2.AckResponse.SerializeToString, - ), - "PendingFn": grpc.unary_unary_rpc_method_handler( - servicer.PendingFn, - request_deserializer=google_dot_protobuf_dot_empty__pb2.Empty.FromString, - response_serializer=source__pb2.PendingResponse.SerializeToString, - ), - "PartitionsFn": grpc.unary_unary_rpc_method_handler( - servicer.PartitionsFn, - request_deserializer=google_dot_protobuf_dot_empty__pb2.Empty.FromString, - response_serializer=source__pb2.PartitionsResponse.SerializeToString, - ), - "IsReady": grpc.unary_unary_rpc_method_handler( - servicer.IsReady, - request_deserializer=google_dot_protobuf_dot_empty__pb2.Empty.FromString, - response_serializer=source__pb2.ReadyResponse.SerializeToString, - ), - } - generic_handler = grpc.method_handlers_generic_handler("source.v1.Source", rpc_method_handlers) - server.add_generic_rpc_handlers((generic_handler,)) - - -# This class is part of an EXPERIMENTAL API. 
-class Source(object): - """Missing associated documentation comment in .proto file.""" - - @staticmethod - def ReadFn( - request, - target, - options=(), - channel_credentials=None, - call_credentials=None, - insecure=False, - compression=None, - wait_for_ready=None, - timeout=None, - metadata=None, - ): - return grpc.experimental.unary_stream( - request, - target, - "/source.v1.Source/ReadFn", - source__pb2.ReadRequest.SerializeToString, - source__pb2.ReadResponse.FromString, - options, - channel_credentials, - insecure, - call_credentials, - compression, - wait_for_ready, - timeout, - metadata, - ) - - @staticmethod - def AckFn( - request, - target, - options=(), - channel_credentials=None, - call_credentials=None, - insecure=False, - compression=None, - wait_for_ready=None, - timeout=None, - metadata=None, - ): - return grpc.experimental.unary_unary( - request, - target, - "/source.v1.Source/AckFn", - source__pb2.AckRequest.SerializeToString, - source__pb2.AckResponse.FromString, - options, - channel_credentials, - insecure, - call_credentials, - compression, - wait_for_ready, - timeout, - metadata, - ) - - @staticmethod - def PendingFn( - request, - target, - options=(), - channel_credentials=None, - call_credentials=None, - insecure=False, - compression=None, - wait_for_ready=None, - timeout=None, - metadata=None, - ): - return grpc.experimental.unary_unary( - request, - target, - "/source.v1.Source/PendingFn", - google_dot_protobuf_dot_empty__pb2.Empty.SerializeToString, - source__pb2.PendingResponse.FromString, - options, - channel_credentials, - insecure, - call_credentials, - compression, - wait_for_ready, - timeout, - metadata, - ) - - @staticmethod - def PartitionsFn( - request, - target, - options=(), - channel_credentials=None, - call_credentials=None, - insecure=False, - compression=None, - wait_for_ready=None, - timeout=None, - metadata=None, - ): - return grpc.experimental.unary_unary( - request, - target, - "/source.v1.Source/PartitionsFn", - google_dot_protobuf_dot_empty__pb2.Empty.SerializeToString, - source__pb2.PartitionsResponse.FromString, - options, - channel_credentials, - insecure, - call_credentials, - compression, - wait_for_ready, - timeout, - metadata, - ) - - @staticmethod - def IsReady( - request, - target, - options=(), - channel_credentials=None, - call_credentials=None, - insecure=False, - compression=None, - wait_for_ready=None, - timeout=None, - metadata=None, - ): - return grpc.experimental.unary_unary( - request, - target, - "/source.v1.Source/IsReady", - google_dot_protobuf_dot_empty__pb2.Empty.SerializeToString, - source__pb2.ReadyResponse.FromString, - options, - channel_credentials, - insecure, - call_credentials, - compression, - wait_for_ready, - timeout, - metadata, - ) diff --git a/pynumaflow-old/sourcer/server.py b/pynumaflow-old/sourcer/server.py deleted file mode 100644 index b36b22ba..00000000 --- a/pynumaflow-old/sourcer/server.py +++ /dev/null @@ -1,231 +0,0 @@ -import logging -import multiprocessing -import os - -from collections.abc import Iterable -from concurrent.futures import ThreadPoolExecutor - -from google.protobuf import timestamp_pb2 as _timestamp_pb2 -import grpc -from google.protobuf import empty_pb2 as _empty_pb2 - -from pynumaflow import setup_logging -from pynumaflow._constants import ( - MAX_MESSAGE_SIZE, - SOURCE_SOCK_PATH, -) -from pynumaflow.sourcer import ReadRequest -from pynumaflow.sourcer._dtypes import ( - SourceReadCallable, - Offset, - AckRequest, - SourceAckCallable, -) -from pynumaflow.sourcer.proto import 
source_pb2
-from pynumaflow.sourcer.proto import source_pb2_grpc
-from pynumaflow.types import NumaflowServicerContext
-from pynumaflow.info.server import get_sdk_version, write as info_server_write
-from pynumaflow.info.types import ServerInfo, Protocol, Language, SERVER_INFO_FILE_PATH
-
-_LOGGER = setup_logging(__name__)
-if os.getenv("PYTHONDEBUG"):
- _LOGGER.setLevel(logging.DEBUG)
-
-_PROCESS_COUNT = multiprocessing.cpu_count()
-MAX_THREADS = int(os.getenv("MAX_THREADS", "4"))
-
-
-class Sourcer(source_pb2_grpc.SourceServicer):
- """
- Provides an interface to write a Sourcer
- which will be exposed over gRPC.
-
- Args:
- read_handler: Function callable following the type signature of SourceReadCallable
- ack_handler: Function handler for AckFn
- pending_handler: Function handler for PendingFn
- partitions_handler: Function handler for PartitionsFn
- sock_path: Path to the UNIX Domain Socket
- max_message_size: The max message size in bytes the server can receive and send
- max_threads: The max number of threads to be spawned;
- defaults to 4, overridable via the MAX_THREADS environment variable
-
- Example invocation:
- >>> from typing import Iterator
- >>> from pynumaflow.sourcer import Message, get_default_partitions, PartitionsResponse, \
- ... ReadRequest, Sourcer, AckRequest, PendingResponse
- >>> def read_handler(datum: ReadRequest) -> Iterator[Message]:
- ... payload = b"payload:test_mock_message"
- ... keys = ["test_key"]
- ... offset = mock_offset()
- ... event_time = mock_event_time()
- ... for i in range(10):
- ... yield Message(payload=payload, keys=keys, offset=offset, event_time=event_time)
- >>> def ack_handler(ack_request: AckRequest):
- ... return
- >>> def pending_handler() -> PendingResponse:
- ... return PendingResponse(count=10)
- >>> def partitions_handler() -> PartitionsResponse:
- ... return PartitionsResponse(partitions=get_default_partitions())
- >>> grpc_server = Sourcer(read_handler=read_handler,
- ... ack_handler=ack_handler,
- ... pending_handler=pending_handler,
- ... partitions_handler=partitions_handler)
- >>> grpc_server.start()
- """
-
- def __init__(
- self,
- read_handler: SourceReadCallable,
- ack_handler: SourceAckCallable,
- pending_handler,
- partitions_handler,
- sock_path=SOURCE_SOCK_PATH,
- max_message_size=MAX_MESSAGE_SIZE,
- max_threads=MAX_THREADS,
- ):
- self.__source_read_handler: SourceReadCallable = read_handler
- self.__source_ack_handler: SourceAckCallable = ack_handler
- self.__source_pending_handler = pending_handler
- self.__source_partitions_handler = partitions_handler
- self.sock_path = f"unix://{sock_path}"
- self._max_message_size = max_message_size
- self._max_threads = max_threads
- self.cleanup_coroutines = []
- # Collection for storing strong references to all running tasks.
- # Event loop only keeps a weak reference, which can cause it to
- # get lost during execution.
- self.background_tasks = set()
-
- self._server_options = [
- ("grpc.max_send_message_length", self._max_message_size),
- ("grpc.max_receive_message_length", self._max_message_size),
- ]
-
- def ReadFn(
- self,
- request: source_pb2.ReadRequest,
- context: NumaflowServicerContext,
- ) -> Iterable[source_pb2.ReadResponse]:
- """
- Applies a Read function to a datum stream in streaming mode.
- The pascal case function name comes from the proto source_pb2_grpc.py file.
- """ - - for res in self.__invoke_source_read_stream( - ReadRequest( - num_records=request.request.num_records, - timeout_in_ms=request.request.timeout_in_ms, - ) - ): - yield source_pb2.ReadResponse(result=res) - - def __invoke_source_read_stream(self, req: ReadRequest): - try: - for msg in self.__source_read_handler(req): - event_time_timestamp = _timestamp_pb2.Timestamp() - event_time_timestamp.FromDatetime(dt=msg.event_time) - yield source_pb2.ReadResponse.Result( - payload=msg.payload, - keys=msg.keys, - offset=msg.offset.as_dict, - event_time=event_time_timestamp, - ) - except Exception as err: - _LOGGER.critical("User-Defined Source ReadError ", exc_info=True) - raise err - - def AckFn( - self, request: source_pb2.AckRequest, context: NumaflowServicerContext - ) -> source_pb2.AckResponse: - """ - Applies an Ack function in User Defined Source - """ - # proto repeated field(offsets) is of type google._upb._message.RepeatedScalarContainer - # we need to explicitly convert it to list - offsets = [] - for offset in request.request.offsets: - offsets.append(Offset(offset.offset, offset.partition_id)) - try: - self.__invoke_ack(ack_req=offsets) - except Exception as e: - context.set_code(grpc.StatusCode.UNKNOWN) - context.set_details(str(e)) - raise e - - return source_pb2.AckResponse() - - def __invoke_ack(self, ack_req: list[Offset]): - """ - Invokes the Source Ack Function. - """ - try: - self.__source_ack_handler(AckRequest(offsets=ack_req)) - except Exception as err: - _LOGGER.critical("AckFn Error", exc_info=True) - raise err - return source_pb2.AckResponse.Result() - - def IsReady( - self, request: _empty_pb2.Empty, context: NumaflowServicerContext - ) -> source_pb2.ReadyResponse: - """ - IsReady is the heartbeat endpoint for gRPC. - The pascal case function name comes from the proto source_pb2_grpc.py file. - """ - return source_pb2.ReadyResponse(ready=True) - - def PendingFn( - self, request: _empty_pb2.Empty, context: NumaflowServicerContext - ) -> source_pb2.PendingResponse: - """ - PendingFn returns the number of pending records - at the user defined source. - """ - try: - count = self.__source_pending_handler() - except Exception as err: - _LOGGER.critical("PendingFn error", exc_info=True) - raise err - resp = source_pb2.PendingResponse.Result(count=count.count) - return source_pb2.PendingResponse(result=resp) - - def PartitionsFn( - self, request: _empty_pb2.Empty, context: NumaflowServicerContext - ) -> source_pb2.PartitionsResponse: - """ - Partitions returns the partitions associated with the source, will be used by - the platform to determine the partitions to which the watermark should be published. - If the source doesn't have partitions, get_default_partitions() can be used to - return the default partitions. In most cases, the get_default_partitions() - should be enough; the cases where we need to implement custom partitions_handler() - is in a case like Kafka, where a reader can read from multiple Kafka partitions. - """ - try: - partitions = self.__source_partitions_handler() - except Exception as err: - _LOGGER.critical("PartitionFn error", exc_info=True) - raise err - resp = source_pb2.PartitionsResponse.Result(partitions=partitions.partitions) - return source_pb2.PartitionsResponse(result=resp) - - def start(self) -> None: - """ - Starts the gRPC server on the given UNIX socket with given max threads. 
- """ - server = grpc.server( - ThreadPoolExecutor(max_workers=self._max_threads), options=self._server_options - ) - source_pb2_grpc.add_SourceServicer_to_server(self, server) - server.add_insecure_port(self.sock_path) - server.start() - serv_info = ServerInfo( - protocol=Protocol.UDS, - language=Language.PYTHON, - version=get_sdk_version(), - ) - info_server_write(server_info=serv_info, info_file=SERVER_INFO_FILE_PATH) - _LOGGER.info( - "GRPC Server listening on: %s with max threads: %s", self.sock_path, self._max_threads - ) - server.wait_for_termination() diff --git a/pynumaflow-old/sourcetransformer/__init__.py b/pynumaflow-old/sourcetransformer/__init__.py deleted file mode 100644 index 4708603d..00000000 --- a/pynumaflow-old/sourcetransformer/__init__.py +++ /dev/null @@ -1,12 +0,0 @@ -from pynumaflow.sourcetransformer._dtypes import Message, Messages, Datum, DROP -from pynumaflow.sourcetransformer.multiproc_server import MultiProcSourceTransformer -from pynumaflow.sourcetransformer.server import SourceTransformer - -__all__ = [ - "Message", - "Messages", - "Datum", - "DROP", - "SourceTransformer", - "MultiProcSourceTransformer", -] diff --git a/pynumaflow-old/types.py b/pynumaflow-old/types.py deleted file mode 100644 index e028ea54..00000000 --- a/pynumaflow-old/types.py +++ /dev/null @@ -1,7 +0,0 @@ -from typing import Union, NewType - -import grpc - -NumaflowServicerContext = NewType( - "NumaflowServicerContext", Union[grpc.aio.ServicerContext, grpc.ServicerContext] -) diff --git a/pynumaflow/sinker/_dtypes.py b/pynumaflow/sinker/_dtypes.py index 436dcf3a..3c4b820e 100644 --- a/pynumaflow/sinker/_dtypes.py +++ b/pynumaflow/sinker/_dtypes.py @@ -2,6 +2,7 @@ from dataclasses import dataclass from datetime import datetime from typing import TypeVar, Optional, Callable, Union +from collections.abc import AsyncIterable, Awaitable from collections.abc import Sequence, Iterator from warnings import warn @@ -186,4 +187,5 @@ def handler(self, datums: Iterator[Datum]) -> Responses: SinkHandlerCallable = Callable[[Iterator[Datum]], Responses] -SinkCallable = Union[SinkerClass, SinkHandlerCallable] +AsyncSinkCallable = Callable[[AsyncIterable[Datum]], Awaitable[Responses]] +SinkCallable = Union[SinkerClass, SinkHandlerCallable, AsyncSinkCallable] diff --git a/tests/map/test_async_mapper.py b/tests/map/test_async_mapper.py index eacbe4ab..fd7e35e2 100644 --- a/tests/map/test_async_mapper.py +++ b/tests/map/test_async_mapper.py @@ -7,15 +7,16 @@ from google.protobuf import empty_pb2 as _empty_pb2 from google.protobuf import timestamp_pb2 as _timestamp_pb2 from grpc.aio._server import Server +from pynumaflow._constants import ServerType from pynumaflow import setup_logging from pynumaflow.mapper import ( - AsyncMapper, Datum, Messages, Message, + MapServer, ) -from pynumaflow.mapper.proto import map_pb2_grpc, map_pb2 +from pynumaflow.proto.mapper import map_pb2, map_pb2_grpc from tests.testing_utils import ( mock_event_time, mock_watermark, @@ -62,11 +63,12 @@ def startup_callable(loop): def new_async_mapper(): - udfs = AsyncMapper(handler=async_map_handler) + server = MapServer(mapper_instance=async_map_handler, server_type=ServerType.Async) + udfs = server.get_servicer(mapper_instance=async_map_handler, server_type=ServerType.Async) return udfs -async def start_server(udfs: AsyncMapper): +async def start_server(udfs): server = grpc.aio.server() map_pb2_grpc.add_MapServicer_to_server(udfs, server) listen_addr = "[::]:50056" diff --git a/tests/map/test_multiproc_mapper.py 
b/tests/map/test_multiproc_mapper.py index 7613be53..b3608620 100644 --- a/tests/map/test_multiproc_mapper.py +++ b/tests/map/test_multiproc_mapper.py @@ -8,9 +8,10 @@ from google.protobuf import timestamp_pb2 as _timestamp_pb2 from grpc import StatusCode from grpc_testing import server_from_dictionary, strict_real_time +from pynumaflow._constants import ServerType -from pynumaflow.mapper.multiproc_server import MultiProcMapper -from pynumaflow.mapper.proto import map_pb2_grpc, map_pb2 +from pynumaflow.mapper import MapServer +from pynumaflow.proto.mapper import map_pb2 from tests.map.utils import map_handler, err_map_handler from tests.testing_utils import ( mock_event_time, @@ -25,52 +26,58 @@ def mockenv(**envvars): class TestMultiProcMethods(unittest.TestCase): def setUp(self) -> None: - my_servicer = MultiProcMapper(handler=map_handler) + my_server = MapServer(mapper_instance=map_handler) + my_servicer = my_server.get_servicer( + mapper_instance=map_handler, server_type=ServerType.Multiproc + ) services = {map_pb2.DESCRIPTOR.services_by_name["Map"]: my_servicer} self.test_server = server_from_dictionary(services, strict_real_time()) @mockenv(NUM_CPU_MULTIPROC="3") def test_multiproc_init(self) -> None: - server = MultiProcMapper(handler=map_handler) - self.assertEqual(server._process_count, 3) + my_server = MapServer(mapper_instance=map_handler) + self.assertEqual(my_server._process_count, 3) @patch("os.cpu_count", Mock(return_value=4)) def test_multiproc_process_count(self) -> None: - server = MultiProcMapper(handler=map_handler) - self.assertEqual(server._process_count, 4) + my_server = MapServer(mapper_instance=map_handler) + self.assertEqual(my_server._process_count, 4) @patch("os.cpu_count", Mock(return_value=4)) @mockenv(NUM_CPU_MULTIPROC="10") def test_max_process_count(self) -> None: - server = MultiProcMapper(handler=map_handler) + server = MapServer(mapper_instance=map_handler) self.assertEqual(server._process_count, 8) - # To test the reuse property for the grpc servers which allow multiple - # bindings to the same server - def test_reuse_port(self): - serv_options = [("grpc.so_reuseaddr", 1)] - - server = MultiProcMapper(handler=map_handler) - - with server._reserve_port(0) as port: - print(port) - bind_address = f"localhost:{port}" - server1 = grpc.server(thread_pool=None, options=serv_options) - map_pb2_grpc.add_MapServicer_to_server(server, server1) - server1.add_insecure_port(bind_address) - - # so_reuseport=0 -> the bind should raise an error - server2 = grpc.server(thread_pool=None, options=(("grpc.so_reuseport", 0),)) - map_pb2_grpc.add_MapServicer_to_server(server, server2) - self.assertRaises(RuntimeError, server2.add_insecure_port, bind_address) - - # so_reuseport=1 -> should allow server to bind to port again - server3 = grpc.server(thread_pool=None, options=(("grpc.so_reuseport", 1),)) - map_pb2_grpc.add_MapServicer_to_server(server, server3) - server3.add_insecure_port(bind_address) + # # To test the reuse property for the grpc servers which allow multiple + # # bindings to the same server + # def test_reuse_port(self): + # serv_options = [("grpc.so_reuseaddr", 1)] + # + # server = MapServer(mapper_instance=map_handler) + # + # with server._reserve_port(0) as port: + # print(port) + # bind_address = f"localhost:{port}" + # server1 = grpc.server(thread_pool=None, options=serv_options) + # map_pb2_grpc.add_MapServicer_to_server(server, server1) + # server1.add_insecure_port(bind_address) + # + # # so_reuseport=0 -> the bind should raise an error + # server2 = 
grpc.server(thread_pool=None, options=(("grpc.so_reuseport", 0),)) + # map_pb2_grpc.add_MapServicer_to_server(server, server2) + # self.assertRaises(RuntimeError, server2.add_insecure_port, bind_address) + # + # # so_reuseport=1 -> should allow server to bind to port again + # server3 = grpc.server(thread_pool=None, options=(("grpc.so_reuseport", 1),)) + # map_pb2_grpc.add_MapServicer_to_server(server, server3) + # server3.add_insecure_port(bind_address) def test_udf_map_err(self): - my_servicer = MultiProcMapper(handler=err_map_handler) + my_server = MapServer(mapper_instance=err_map_handler) + my_servicer = my_server.get_servicer( + mapper_instance=my_server.mapper_instance, server_type=ServerType.Multiproc + ) services = {map_pb2.DESCRIPTOR.services_by_name["Map"]: my_servicer} self.test_server = server_from_dictionary(services, strict_real_time()) @@ -148,7 +155,7 @@ def test_map_forward_message(self): def test_invalid_input(self): with self.assertRaises(TypeError): - MultiProcMapper() + MapServer() if __name__ == "__main__": diff --git a/tests/map/test_sync_mapper.py b/tests/map/test_sync_mapper.py index 8a4d7c4c..d0ebe1b1 100644 --- a/tests/map/test_sync_mapper.py +++ b/tests/map/test_sync_mapper.py @@ -5,9 +5,12 @@ from google.protobuf import timestamp_pb2 as _timestamp_pb2 from grpc import StatusCode from grpc_testing import server_from_dictionary, strict_real_time +from pynumaflow._constants import ServerType -from pynumaflow.mapper import Mapper -from pynumaflow.mapper.proto import map_pb2 +from pynumaflow.mapper.server import Mapper + +from pynumaflow.mapper import MapServer +from pynumaflow.proto.mapper import map_pb2 from tests.map.utils import map_handler, err_map_handler from tests.testing_utils import ( mock_event_time, @@ -18,16 +21,22 @@ class TestSyncMapper(unittest.TestCase): def setUp(self) -> None: - my_servicer = Mapper(handler=map_handler) + my_server = MapServer(mapper_instance=map_handler) + my_servicer = my_server.get_servicer( + mapper_instance=map_handler, server_type=ServerType.Sync + ) + services = {map_pb2.DESCRIPTOR.services_by_name["Map"]: my_servicer} self.test_server = server_from_dictionary(services, strict_real_time()) def test_init_with_args(self) -> None: - my_servicer = Mapper( - handler=map_handler, sock_path="/tmp/test.sock", max_message_size=1024 * 1024 * 5 + my_servicer = MapServer( + mapper_instance=map_handler, + sock_path="/tmp/test.sock", + max_message_size=1024 * 1024 * 5, ) self.assertEqual(my_servicer.sock_path, "unix:///tmp/test.sock") - self.assertEqual(my_servicer._max_message_size, 1024 * 1024 * 5) + self.assertEqual(my_servicer.max_message_size, 1024 * 1024 * 5) def test_udf_map_err(self): my_servicer = Mapper(handler=err_map_handler) diff --git a/tests/mapstream/test_async_map_stream.py b/tests/mapstream/test_async_map_stream.py index e2022713..621b4f3f 100644 --- a/tests/mapstream/test_async_map_stream.py +++ b/tests/mapstream/test_async_map_stream.py @@ -12,9 +12,9 @@ from pynumaflow.mapstreamer import ( Message, Datum, - AsyncMapStreamer, + MapStreamServer, ) -from pynumaflow.mapstreamer.proto import mapstream_pb2_grpc +from pynumaflow.proto.mapstreamer import mapstream_pb2_grpc from tests.mapstream.utils import start_request_map_stream LOGGER = setup_logging(__name__) @@ -47,12 +47,14 @@ def startup_callable(loop): def NewAsyncMapStreamer( map_stream_handler=async_map_stream_handler, ): - udfs = AsyncMapStreamer(handler=async_map_stream_handler) - + server = MapStreamServer(map_stream_instance=async_map_stream_handler) + udfs = 
server.get_servicer( + map_stream_instance=async_map_stream_handler, server_type=server.server_type + ) return udfs -async def start_server(udfs: AsyncMapStreamer): +async def start_server(udfs): server = grpc.aio.server() mapstream_pb2_grpc.add_MapStreamServicer_to_server(udfs, server) listen_addr = "[::]:50060" diff --git a/tests/mapstream/test_async_map_stream_err.py b/tests/mapstream/test_async_map_stream_err.py index 1211165c..6bf3a246 100644 --- a/tests/mapstream/test_async_map_stream_err.py +++ b/tests/mapstream/test_async_map_stream_err.py @@ -7,10 +7,11 @@ import grpc from grpc.aio._server import Server +from pynumaflow._constants import ServerType from pynumaflow import setup_logging -from pynumaflow.mapstreamer import Message, Datum, AsyncMapStreamer -from pynumaflow.mapstreamer.proto import mapstream_pb2_grpc +from pynumaflow.mapstreamer import Message, Datum, MapStreamServer +from pynumaflow.proto.mapstreamer import mapstream_pb2_grpc from tests.mapstream.utils import start_request_map_stream LOGGER = setup_logging(__name__) @@ -32,7 +33,7 @@ async def err_async_map_stream_handler(keys: list[str], datum: Datum) -> AsyncIt _s: Server = None -_channel = grpc.insecure_channel("localhost:50052") +_channel = grpc.insecure_channel("localhost:50041") _loop = None @@ -43,9 +44,12 @@ def startup_callable(loop): async def start_server(): server = grpc.aio.server() - udfs = AsyncMapStreamer(handler=err_async_map_stream_handler) + server_instance = MapStreamServer(map_stream_instance=err_async_map_stream_handler) + udfs = server_instance.get_servicer( + map_stream_instance=err_async_map_stream_handler, server_type=server.server_type + ) mapstream_pb2_grpc.add_MapStreamServicer_to_server(udfs, server) - listen_addr = "[::]:50052" + listen_addr = "[::]:50041" server.add_insecure_port(listen_addr) logging.info("Starting server on %s", listen_addr) global _s @@ -65,7 +69,7 @@ def setUpClass(cls) -> None: asyncio.run_coroutine_threadsafe(start_server(), loop=loop) while True: try: - with grpc.insecure_channel("localhost:50052") as channel: + with grpc.insecure_channel("localhost:50041") as channel: f = grpc.channel_ready_future(channel) f.result(timeout=10) if f.done(): @@ -100,7 +104,7 @@ def __stub(self): def test_invalid_input(self): with self.assertRaises(TypeError): - AsyncMapStreamer() + MapStreamServer(server_type=ServerType.Async) if __name__ == "__main__": diff --git a/tests/mapstream/utils.py b/tests/mapstream/utils.py index 9f0960f0..4e9e4824 100644 --- a/tests/mapstream/utils.py +++ b/tests/mapstream/utils.py @@ -1,5 +1,5 @@ from pynumaflow.mapstreamer import Datum -from pynumaflow.mapstreamer.proto import mapstream_pb2 +from pynumaflow.proto.mapstreamer import mapstream_pb2 from tests.testing_utils import get_time_args, mock_message diff --git a/tests/reduce/test_async_reduce.py b/tests/reduce/test_async_reduce.py index b75501ed..eae69791 100644 --- a/tests/reduce/test_async_reduce.py +++ b/tests/reduce/test_async_reduce.py @@ -10,15 +10,14 @@ from grpc.aio._server import Server from pynumaflow import setup_logging -from pynumaflow._constants import WIN_START_TIME, WIN_END_TIME +from pynumaflow._constants import WIN_START_TIME, WIN_END_TIME, ServerType from pynumaflow.reducer import ( Messages, Message, Datum, - AsyncReducer, - Metadata, + Metadata, ReduceServer, ) -from pynumaflow.reducer.proto import reduce_pb2, reduce_pb2_grpc +from pynumaflow.proto.reducer import reduce_pb2, reduce_pb2_grpc from tests.testing_utils import ( mock_message, mock_interval_window_start, @@ -94,12 
+93,15 @@ async def reduce_handler(keys: list[str], datums: Iterator[Datum], md: Metadata) def NewAsyncReducer( reduce_handler=async_reduce_handler, ): - udfs = AsyncReducer(handler=async_reduce_handler) + server_instance = ReduceServer(reducer_instance=async_reduce_handler, + server_type=ServerType.Async) + udfs = server_instance.get_servicer(reducer_instance=server_instance.reducer_instance, + server_type=server_instance.server_type) return udfs -async def start_server(udfs: AsyncReducer): +async def start_server(udfs): server = grpc.aio.server() reduce_pb2_grpc.add_ReduceServicer_to_server(udfs, server) listen_addr = "[::]:50057" diff --git a/tests/sideinput/test_side_input_server.py b/tests/sideinput/test_side_input_server.py index 53e360bb..4d8db881 100644 --- a/tests/sideinput/test_side_input_server.py +++ b/tests/sideinput/test_side_input_server.py @@ -5,7 +5,7 @@ from grpc import StatusCode from grpc_testing import server_from_dictionary, strict_real_time from pynumaflow.sideinput import SideInput -from pynumaflow.sideinput.proto import sideinput_pb2 +from pynumaflow.proto.sideinput import sideinput_pb2 from pynumaflow.sideinput import Response diff --git a/tests/sink/test_async_sink.py b/tests/sink/test_async_sink.py index 062c5781..005e07c6 100644 --- a/tests/sink/test_async_sink.py +++ b/tests/sink/test_async_sink.py @@ -7,14 +7,14 @@ import grpc from google.protobuf import empty_pb2 as _empty_pb2 from grpc.aio._server import Server +from pynumaflow._constants import ServerType from pynumaflow import setup_logging from pynumaflow.sinker import ( Datum, ) -from pynumaflow.sinker import Responses, Response, AsyncSinker -from pynumaflow.sinker.proto import sink_pb2 -from pynumaflow.sinker.proto import sink_pb2_grpc +from pynumaflow.sinker import Responses, Response, SinkServer +from pynumaflow.proto.sinker import sink_pb2_grpc, sink_pb2 from tests.sink.test_server import ( mock_message, mock_err_message, @@ -69,7 +69,8 @@ def startup_callable(loop): async def start_server(): server = grpc.aio.server() - uds = AsyncSinker(handler=udsink_handler) + server_instance = SinkServer(sinker_instance=udsink_handler, server_type=ServerType.Async) + uds = server_instance.get_servicer(sinker_instance=udsink_handler, server_type=ServerType.Async) sink_pb2_grpc.add_SinkServicer_to_server(uds, server) listen_addr = "[::]:50055" server.add_insecure_port(listen_addr) diff --git a/tests/sink/test_server.py b/tests/sink/test_server.py index af9bccac..28b8af3e 100644 --- a/tests/sink/test_server.py +++ b/tests/sink/test_server.py @@ -7,8 +7,8 @@ from grpc import StatusCode from grpc_testing import server_from_dictionary, strict_real_time -from pynumaflow.sinker import Responses, Datum, Response, Sinker -from pynumaflow.sinker.proto import sink_pb2 +from pynumaflow.sinker import Responses, Datum, Response, SinkServer +from pynumaflow.proto.sinker import sink_pb2 def udsink_handler(datums: Iterator[Datum]) -> Responses: @@ -47,7 +47,10 @@ def mock_watermark(): class TestServer(unittest.TestCase): def setUp(self) -> None: - my_servicer = Sinker(udsink_handler) + server = SinkServer(sinker_instance=udsink_handler) + my_servicer = server.get_servicer( + sinker_instance=server.sinker_instance, server_type=server.server_type + ) services = {sink_pb2.DESCRIPTOR.services_by_name["Sink"]: my_servicer} self.test_server = server_from_dictionary(services, strict_real_time()) @@ -67,7 +70,10 @@ def test_is_ready(self): self.assertEqual(code, StatusCode.OK) def test_udsink_err(self): - my_servicer = 
Sinker(err_udsink_handler) + server = SinkServer(sinker_instance=err_udsink_handler) + my_servicer = server.get_servicer( + sinker_instance=server.sinker_instance, server_type=server.server_type + ) services = {sink_pb2.DESCRIPTOR.services_by_name["Sink"]: my_servicer} self.test_server = server_from_dictionary(services, strict_real_time()) diff --git a/tests/source/test_async_source.py b/tests/source/test_async_source.py index 53a6d97a..c62291c3 100644 --- a/tests/source/test_async_source.py +++ b/tests/source/test_async_source.py @@ -6,21 +6,19 @@ import grpc from google.protobuf import empty_pb2 as _empty_pb2 from grpc.aio._server import Server +from pynumaflow._constants import ServerType from pynumaflow import setup_logging from pynumaflow.sourcer import ( - AsyncSourcer, + SourceServer, ) -from pynumaflow.sourcer.proto import source_pb2_grpc, source_pb2 +from pynumaflow.proto.sourcer import source_pb2_grpc, source_pb2 from tests.source.utils import ( - async_source_read_handler, - async_source_ack_handler, - async_source_pending_handler, - async_source_partition_handler, mock_offset, read_req_source_fn, ack_req_source_fn, mock_partitions, + AsyncSource, ) LOGGER = setup_logging(__name__) @@ -28,7 +26,6 @@ # if set to true, map handler will raise a `ValueError` exception. raise_error_from_map = False - server_port = "localhost:50058" _s: Server = None @@ -41,22 +38,17 @@ def startup_callable(loop): loop.run_forever() -def NewAsyncSourcer( - handler=async_source_read_handler, - ack_handler=async_source_ack_handler, - pending_handler=async_source_pending_handler, - partitions_handler=async_source_partition_handler, -): - udfs = AsyncSourcer( - read_handler=async_source_read_handler, - ack_handler=async_source_ack_handler, - pending_handler=async_source_pending_handler, - partitions_handler=async_source_partition_handler, +def NewAsyncSourcer(): + class_instance = AsyncSource() + server = SourceServer(sourcer_instance=class_instance, server_type=ServerType.Async) + + udfs = server.get_servicer( + sourcer_instance=server.sourcer_instance, server_type=server.server_type ) return udfs -async def start_server(udfs: AsyncSourcer): +async def start_server(udfs): server = grpc.aio.server() source_pb2_grpc.add_SourceServicer_to_server(udfs, server) listen_addr = "[::]:50058" diff --git a/tests/source/test_async_source_err.py b/tests/source/test_async_source_err.py index 6b9bd3f0..a0b689fd 100644 --- a/tests/source/test_async_source_err.py +++ b/tests/source/test_async_source_err.py @@ -6,18 +6,16 @@ import grpc from grpc.aio._server import Server +from pynumaflow._constants import ServerType from pynumaflow import setup_logging -from pynumaflow.sourcer import AsyncSourcer -from pynumaflow.sourcer.proto import source_pb2_grpc, source_pb2 +from pynumaflow.sourcer import SourceServer +from pynumaflow.proto.sourcer import source_pb2_grpc, source_pb2 from google.protobuf import empty_pb2 as _empty_pb2 from tests.source.utils import ( - err_async_source_read_handler, - err_async_source_ack_handler, - err_async_source_pending_handler, read_req_source_fn, ack_req_source_fn, - err_async_source_partition_handler, + AsyncSourceError, ) LOGGER = setup_logging(__name__) @@ -35,11 +33,10 @@ def startup_callable(loop): async def start_server(): server = grpc.aio.server() - udfs = AsyncSourcer( - read_handler=err_async_source_read_handler, - ack_handler=err_async_source_ack_handler, - pending_handler=err_async_source_pending_handler, - partitions_handler=err_async_source_partition_handler, + class_instance = 
AsyncSourceError() + server_instance = SourceServer(sourcer_instance=class_instance, server_type=ServerType.Async) + udfs = server.get_servicer( + sourcer_instance=server_instance.sourcer_instance, server_type=server_instance.server_type ) source_pb2_grpc.add_SourceServicer_to_server(udfs, server) listen_addr = "[::]:50062" diff --git a/tests/source/test_sync_source.py b/tests/source/test_sync_source.py index 8b230d5e..9cfc81fa 100644 --- a/tests/source/test_sync_source.py +++ b/tests/source/test_sync_source.py @@ -4,42 +4,36 @@ from grpc import StatusCode from grpc_testing import server_from_dictionary, strict_real_time -from pynumaflow.sourcer import Sourcer -from pynumaflow.sourcer.proto import source_pb2 +from pynumaflow.sourcer import SourceServer +from pynumaflow.proto.sourcer import source_pb2 from tests.source.utils import ( - sync_source_read_handler, - sync_source_ack_handler, - sync_source_pending_handler, - sync_source_partition_handler, read_req_source_fn, mock_offset, ack_req_source_fn, mock_partitions, + SyncSource, ) class TestSyncSourcer(unittest.TestCase): def setUp(self) -> None: - my_servicer = Sourcer( - read_handler=sync_source_read_handler, - ack_handler=sync_source_ack_handler, - pending_handler=sync_source_pending_handler, - partitions_handler=sync_source_partition_handler, + class_instance = SyncSource() + server = SourceServer(sourcer_instance=class_instance) + my_servicer = server.get_servicer( + sourcer_instance=server.sourcer_instance, server_type=server.server_type ) services = {source_pb2.DESCRIPTOR.services_by_name["Source"]: my_servicer} self.test_server = server_from_dictionary(services, strict_real_time()) def test_init_with_args(self) -> None: - my_servicer = Sourcer( - read_handler=sync_source_read_handler, - ack_handler=sync_source_ack_handler, - pending_handler=sync_source_pending_handler, - partitions_handler=sync_source_partition_handler, + class_instance = SyncSource() + server = SourceServer( + sourcer_instance=class_instance, sock_path="/tmp/test.sock", max_message_size=1024 * 1024 * 5, ) - self.assertEqual(my_servicer.sock_path, "unix:///tmp/test.sock") - self.assertEqual(my_servicer._max_message_size, 1024 * 1024 * 5) + self.assertEqual(server.sock_path, "unix:///tmp/test.sock") + self.assertEqual(server.max_message_size, 1024 * 1024 * 5) def test_is_ready(self): method = self.test_server.invoke_unary_unary( diff --git a/tests/source/test_sync_source_err.py b/tests/source/test_sync_source_err.py index 875c4494..b2464e2e 100644 --- a/tests/source/test_sync_source_err.py +++ b/tests/source/test_sync_source_err.py @@ -3,26 +3,23 @@ import grpc from google.protobuf import empty_pb2 as _empty_pb2 from grpc_testing import server_from_dictionary, strict_real_time +from pynumaflow._constants import ServerType -from pynumaflow.sourcer import Sourcer -from pynumaflow.sourcer.proto import source_pb2 +from pynumaflow.sourcer import SourceServer +from pynumaflow.proto.sourcer import source_pb2 from tests.source.utils import ( read_req_source_fn, ack_req_source_fn, - err_sync_source_read_handler, - err_sync_source_ack_handler, - err_sync_source_pending_handler, - err_sync_source_partition_handler, + SyncSourceError, ) class TestSyncSourcer(unittest.TestCase): def setUp(self) -> None: - my_servicer = Sourcer( - read_handler=err_sync_source_read_handler, - ack_handler=err_sync_source_ack_handler, - pending_handler=err_sync_source_pending_handler, - partitions_handler=err_sync_source_partition_handler, + class_instance = SyncSourceError() + server = 
SourceServer(sourcer_instance=class_instance, server_type=ServerType.Sync) + my_servicer = server.get_servicer( + sourcer_instance=server.sourcer_instance, server_type=server.server_type ) services = {source_pb2.DESCRIPTOR.services_by_name["Source"]: my_servicer} self.test_server = server_from_dictionary(services, strict_real_time()) @@ -105,7 +102,7 @@ def test_source_partition(self): def test_invalid_input(self): with self.assertRaises(TypeError): - Sourcer() + SourceServer() if __name__ == "__main__": diff --git a/tests/source/utils.py b/tests/source/utils.py index 7ddc8f85..1c7f693b 100644 --- a/tests/source/utils.py +++ b/tests/source/utils.py @@ -1,8 +1,14 @@ from collections.abc import AsyncIterable, Iterable from pynumaflow.sourcer import ReadRequest, Message -from pynumaflow.sourcer._dtypes import AckRequest, PendingResponse, Offset, PartitionsResponse -from pynumaflow.sourcer.proto import source_pb2 +from pynumaflow.sourcer._dtypes import ( + AckRequest, + PendingResponse, + Offset, + PartitionsResponse, + SourcerClass, +) +from pynumaflow.proto.sourcer import source_pb2 from tests.testing_utils import mock_event_time @@ -14,46 +20,42 @@ def mock_partitions() -> list[int]: return [1, 2, 3] -async def async_source_read_handler(datum: ReadRequest) -> AsyncIterable[Message]: - payload = b"payload:test_mock_message" - keys = ["test_key"] - offset = mock_offset() - event_time = mock_event_time() - for i in range(10): - yield Message(payload=payload, keys=keys, offset=offset, event_time=event_time) +class AsyncSource(SourcerClass): + async def read_handler(self, datum: ReadRequest) -> AsyncIterable[Message]: + payload = b"payload:test_mock_message" + keys = ["test_key"] + offset = mock_offset() + event_time = mock_event_time() + for i in range(10): + yield Message(payload=payload, keys=keys, offset=offset, event_time=event_time) + async def ack_handler(self, ack_request: AckRequest): + return -async def async_source_ack_handler(ack_request: AckRequest): - return + async def pending_handler(self) -> PendingResponse: + return PendingResponse(count=10) + async def partitions_handler(self) -> PartitionsResponse: + return PartitionsResponse(partitions=mock_partitions()) -async def async_source_pending_handler() -> PendingResponse: - return PendingResponse(count=10) +class SyncSource(SourcerClass): + def read_handler(self, datum: ReadRequest) -> Iterable[Message]: + payload = b"payload:test_mock_message" + keys = ["test_key"] + offset = mock_offset() + event_time = mock_event_time() + for i in range(10): + yield Message(payload=payload, keys=keys, offset=offset, event_time=event_time) -async def async_source_partition_handler() -> PartitionsResponse: - return PartitionsResponse(partitions=mock_partitions()) + def ack_handler(self, ack_request: AckRequest): + return + def pending_handler(self) -> PendingResponse: + return PendingResponse(count=10) -def sync_source_read_handler(datum: ReadRequest) -> Iterable[Message]: - payload = b"payload:test_mock_message" - keys = ["test_key"] - offset = mock_offset() - event_time = mock_event_time() - for i in range(10): - yield Message(payload=payload, keys=keys, offset=offset, event_time=event_time) - - -def sync_source_ack_handler(ack_request: AckRequest): - return - - -def sync_source_pending_handler() -> PendingResponse: - return PendingResponse(count=10) - - -def sync_source_partition_handler() -> PartitionsResponse: - return PartitionsResponse(partitions=mock_partitions()) + def partitions_handler(self) -> PartitionsResponse: + return 
PartitionsResponse(partitions=mock_partitions()) def read_req_source_fn() -> ReadRequest: @@ -70,40 +72,36 @@ def ack_req_source_fn() -> AckRequest: return request -# This handler mimics the scenario where map stream UDF throws a runtime error. -async def err_async_source_read_handler(datum: ReadRequest) -> AsyncIterable[Message]: - payload = b"payload:test_mock_message" - keys = ["test_key"] - offset = mock_offset() - event_time = mock_event_time() - for i in range(10): - yield Message(payload=payload, keys=keys, offset=offset, event_time=event_time) - raise RuntimeError("Got a runtime error from read handler.") - - -async def err_async_source_ack_handler(ack_request: AckRequest): - raise RuntimeError("Got a runtime error from ack handler.") - - -async def err_async_source_pending_handler() -> PendingResponse: - raise RuntimeError("Got a runtime error from pending handler.") - - -async def err_async_source_partition_handler() -> PartitionsResponse: - raise RuntimeError("Got a runtime error from partition handler.") +class AsyncSourceError(SourcerClass): + # This handler mimics the scenario where map stream UDF throws a runtime error. + async def read_handler(self, datum: ReadRequest) -> AsyncIterable[Message]: + payload = b"payload:test_mock_message" + keys = ["test_key"] + offset = mock_offset() + event_time = mock_event_time() + for i in range(10): + yield Message(payload=payload, keys=keys, offset=offset, event_time=event_time) + raise RuntimeError("Got a runtime error from read handler.") + async def ack_handler(self, ack_request: AckRequest): + raise RuntimeError("Got a runtime error from ack handler.") -def err_sync_source_read_handler(datum: ReadRequest) -> Iterable[Message]: - raise RuntimeError("Got a runtime error from read handler.") + async def pending_handler(self) -> PendingResponse: + raise RuntimeError("Got a runtime error from pending handler.") + async def partitions_handler(self) -> PartitionsResponse: + raise RuntimeError("Got a runtime error from partition handler.") -def err_sync_source_ack_handler(ack_request: AckRequest): - raise RuntimeError("Got a runtime error from ack handler.") +class SyncSourceError(SourcerClass): + def read_handler(self, datum: ReadRequest) -> Iterable[Message]: + raise RuntimeError("Got a runtime error from read handler.") -def err_sync_source_pending_handler() -> PendingResponse: - raise RuntimeError("Got a runtime error from pending handler.") + def ack_handler(self, ack_request: AckRequest): + raise RuntimeError("Got a runtime error from ack handler.") + def pending_handler(self) -> PendingResponse: + raise RuntimeError("Got a runtime error from pending handler.") -def err_sync_source_partition_handler() -> PartitionsResponse: - raise RuntimeError("Got a runtime error from partition handler.") + def partitions_handler(self) -> PartitionsResponse: + raise RuntimeError("Got a runtime error from partition handler.") diff --git a/tests/sourcetransform/test_multiproc.py b/tests/sourcetransform/test_multiproc.py index 25cca845..10196857 100644 --- a/tests/sourcetransform/test_multiproc.py +++ b/tests/sourcetransform/test_multiproc.py @@ -8,9 +8,10 @@ from google.protobuf import timestamp_pb2 as _timestamp_pb2 from grpc import StatusCode from grpc_testing import server_from_dictionary, strict_real_time +from pynumaflow._constants import ServerType -from pynumaflow.sourcetransformer.multiproc_server import MultiProcSourceTransformer -from pynumaflow.sourcetransformer.proto import transform_pb2_grpc, transform_pb2 +from 
pynumaflow.proto.sourcetransformer import transform_pb2 +from pynumaflow.sourcetransformer import SourceTransformServer from tests.sourcetransform.utils import transform_handler, err_transform_handler from tests.testing_utils import ( mock_event_time, @@ -27,52 +28,46 @@ def mockenv(**envvars): class TestMultiProcMethods(unittest.TestCase): def setUp(self) -> None: - my_servicer = MultiProcSourceTransformer(handler=transform_handler) + server = SourceTransformServer( + source_transform_instance=transform_handler, server_type=ServerType.Multiproc + ) + my_servicer = server.get_servicer( + source_transform_instance=server.source_transform_instance, + server_type=server.server_type, + ) services = {transform_pb2.DESCRIPTOR.services_by_name["SourceTransform"]: my_servicer} self.test_server = server_from_dictionary(services, strict_real_time()) @mockenv(NUM_CPU_MULTIPROC="3") def test_multiproc_init(self) -> None: - server = MultiProcSourceTransformer(handler=transform_handler) + server = SourceTransformServer( + source_transform_instance=transform_handler, server_type=ServerType.Multiproc + ) self.assertEqual(server._process_count, 3) @patch("os.cpu_count", Mock(return_value=4)) def test_multiproc_process_count(self) -> None: - server = MultiProcSourceTransformer(handler=transform_handler) + server = SourceTransformServer( + source_transform_instance=transform_handler, server_type=ServerType.Multiproc + ) self.assertEqual(server._process_count, 4) @patch("os.cpu_count", Mock(return_value=4)) @mockenv(NUM_CPU_MULTIPROC="10") def test_max_process_count(self) -> None: - server = MultiProcSourceTransformer(handler=transform_handler) + server = SourceTransformServer( + source_transform_instance=transform_handler, server_type=ServerType.Multiproc + ) self.assertEqual(server._process_count, 8) - # To test the reuse property for the grpc servers which allow multiple - # bindings to the same server - def test_reuse_port(self): - serv_options = [("grpc.so_reuseaddr", 1)] - - server = MultiProcSourceTransformer(handler=transform_handler) - - with server._reserve_port(0) as port: - print(port) - bind_address = f"localhost:{port}" - server1 = grpc.server(thread_pool=None, options=serv_options) - transform_pb2_grpc.add_SourceTransformServicer_to_server(server, server1) - server1.add_insecure_port(bind_address) - - # so_reuseport=0 -> the bind should raise an error - server2 = grpc.server(thread_pool=None, options=(("grpc.so_reuseport", 0),)) - transform_pb2_grpc.add_SourceTransformServicer_to_server(server, server2) - self.assertRaises(RuntimeError, server2.add_insecure_port, bind_address) - - # so_reuseport=1 -> should allow server to bind to port again - server3 = grpc.server(thread_pool=None, options=(("grpc.so_reuseport", 1),)) - transform_pb2_grpc.add_SourceTransformServicer_to_server(server, server3) - server3.add_insecure_port(bind_address) - def test_udf_mapt_err(self): - my_servicer = MultiProcSourceTransformer(handler=err_transform_handler) + server = SourceTransformServer( + source_transform_instance=err_transform_handler, server_type=ServerType.Multiproc + ) + my_servicer = server.get_servicer( + source_transform_instance=server.source_transform_instance, + server_type=server.server_type, + ) services = {transform_pb2.DESCRIPTOR.services_by_name["SourceTransform"]: my_servicer} self.test_server = server_from_dictionary(services, strict_real_time()) @@ -166,7 +161,7 @@ def test_mapt_assign_new_event_time(self, test_server=None): def test_invalid_input(self): with self.assertRaises(TypeError): - 
MultiProcSourceTransformer() + SourceTransformServer(server_type=ServerType.Multiproc) if __name__ == "__main__": diff --git a/tests/sourcetransform/test_sync_server.py b/tests/sourcetransform/test_sync_server.py index 54830c40..f340878d 100644 --- a/tests/sourcetransform/test_sync_server.py +++ b/tests/sourcetransform/test_sync_server.py @@ -5,9 +5,10 @@ from google.protobuf import timestamp_pb2 as _timestamp_pb2 from grpc import StatusCode from grpc_testing import server_from_dictionary, strict_real_time +from pynumaflow._constants import ServerType -from pynumaflow.sourcetransformer import SourceTransformer -from pynumaflow.sourcetransformer.proto import transform_pb2 +from pynumaflow.sourcetransformer import SourceTransformServer +from pynumaflow.proto.sourcetransformer import transform_pb2 from tests.sourcetransform.utils import transform_handler, err_transform_handler from tests.testing_utils import ( mock_event_time, @@ -19,19 +20,34 @@ class TestServer(unittest.TestCase): def setUp(self) -> None: - my_servicer = SourceTransformer(handler=transform_handler) + server = SourceTransformServer( + source_transform_instance=transform_handler, server_type=ServerType.Sync + ) + my_servicer = server.get_servicer( + source_transform_instance=server.source_transform_instance, + server_type=server.server_type, + ) services = {transform_pb2.DESCRIPTOR.services_by_name["SourceTransform"]: my_servicer} self.test_server = server_from_dictionary(services, strict_real_time()) def test_init_with_args(self) -> None: - my_servicer = SourceTransformer( - handler=transform_handler, sock_path="/tmp/test.sock", max_message_size=1024 * 1024 * 5 + server = SourceTransformServer( + source_transform_instance=transform_handler, + sock_path="/tmp/test.sock", + max_message_size=1024 * 1024 * 5, + server_type=ServerType.Sync, ) - self.assertEqual(my_servicer.sock_path, "unix:///tmp/test.sock") - self.assertEqual(my_servicer._max_message_size, 1024 * 1024 * 5) + self.assertEqual(server.sock_path, "unix:///tmp/test.sock") + self.assertEqual(server.max_message_size, 1024 * 1024 * 5) def test_udf_mapt_err(self): - my_servicer = SourceTransformer(handler=err_transform_handler) + server = SourceTransformServer( + source_transform_instance=err_transform_handler, server_type=ServerType.Sync + ) + my_servicer = server.get_servicer( + source_transform_instance=server.source_transform_instance, + server_type=server.server_type, + ) services = {transform_pb2.DESCRIPTOR.services_by_name["SourceTransform"]: my_servicer} self.test_server = server_from_dictionary(services, strict_real_time()) @@ -128,7 +144,7 @@ def test_mapt_assign_new_event_time(self, test_server=None): def test_invalid_input(self): with self.assertRaises(TypeError): - SourceTransformer() + SourceTransformServer() if __name__ == "__main__": From 7b96725d812cd77823b7b13e01c00353f7dd21b2 Mon Sep 17 00:00:00 2001 From: Sidhant Kohli Date: Wed, 10 Jan 2024 12:03:09 -0800 Subject: [PATCH 41/78] source Signed-off-by: Sidhant Kohli --- tests/map/test_async_mapper.py | 10 +++++----- tests/mapstream/test_async_map_stream.py | 8 ++++---- tests/mapstream/test_async_map_stream_err.py | 8 ++++---- tests/reduce/test_async_reduce.py | 21 +++++++++++--------- tests/sink/test_async_sink.py | 8 ++++---- tests/source/test_async_source.py | 4 ++-- tests/source/test_async_source_err.py | 7 ++++--- 7 files changed, 35 insertions(+), 31 deletions(-) diff --git a/tests/map/test_async_mapper.py b/tests/map/test_async_mapper.py index fd7e35e2..d50da394 100644 --- 
a/tests/map/test_async_mapper.py +++ b/tests/map/test_async_mapper.py @@ -53,7 +53,7 @@ def request_generator(count, request, resetkey: bool = False): _s: Server = None -_channel = grpc.insecure_channel("localhost:50056") +_channel = grpc.insecure_channel("unix:///tmp/async_map.sock") _loop = None @@ -71,7 +71,7 @@ def new_async_mapper(): async def start_server(udfs): server = grpc.aio.server() map_pb2_grpc.add_MapServicer_to_server(udfs, server) - listen_addr = "[::]:50056" + listen_addr = "unix:///tmp/async_map.sock" server.add_insecure_port(listen_addr) logging.info("Starting server on %s", listen_addr) global _s @@ -92,7 +92,7 @@ def setUpClass(cls) -> None: asyncio.run_coroutine_threadsafe(start_server(udfs), loop=loop) while True: try: - with grpc.insecure_channel("localhost:50056") as channel: + with grpc.insecure_channel("unix:///tmp/async_map.sock") as channel: f = grpc.channel_ready_future(channel) f.result(timeout=10) if f.done(): @@ -110,7 +110,7 @@ def tearDownClass(cls) -> None: LOGGER.error(e) def test_run_server(self) -> None: - with grpc.insecure_channel("localhost:50056") as channel: + with grpc.insecure_channel("unix:///tmp/async_map.sock") as channel: stub = map_pb2_grpc.MapStub(channel) event_time_timestamp = _timestamp_pb2.Timestamp() event_time_timestamp.FromDatetime(dt=mock_event_time()) @@ -200,7 +200,7 @@ def test_map_grpc_error(self) -> None: self.assertIsNotNone(grpcException) def test_is_ready(self) -> None: - with grpc.insecure_channel("localhost:50056") as channel: + with grpc.insecure_channel("unix:///tmp/async_map.sock") as channel: stub = map_pb2_grpc.MapStub(channel) request = _empty_pb2.Empty() diff --git a/tests/mapstream/test_async_map_stream.py b/tests/mapstream/test_async_map_stream.py index 621b4f3f..dec4eb77 100644 --- a/tests/mapstream/test_async_map_stream.py +++ b/tests/mapstream/test_async_map_stream.py @@ -35,7 +35,7 @@ async def async_map_stream_handler(keys: list[str], datum: Datum) -> AsyncIterab _s: Server = None -_channel = grpc.insecure_channel("localhost:50060") +_channel = grpc.insecure_channel("unix:///tmp/async_map_stream.sock") _loop = None @@ -57,7 +57,7 @@ def NewAsyncMapStreamer( async def start_server(udfs): server = grpc.aio.server() mapstream_pb2_grpc.add_MapStreamServicer_to_server(udfs, server) - listen_addr = "[::]:50060" + listen_addr = "unix:///tmp/async_map_stream.sock" server.add_insecure_port(listen_addr) logging.info("Starting server on %s", listen_addr) global _s @@ -78,7 +78,7 @@ def setUpClass(cls) -> None: asyncio.run_coroutine_threadsafe(start_server(udfs), loop=loop) while True: try: - with grpc.insecure_channel("localhost:50060") as channel: + with grpc.insecure_channel("unix:///tmp/async_map_stream.sock") as channel: f = grpc.channel_ready_future(channel) f.result(timeout=10) if f.done(): @@ -120,7 +120,7 @@ def test_map_stream(self) -> None: self.assertEqual(10, counter) def test_is_ready(self) -> None: - with grpc.insecure_channel("localhost:50060") as channel: + with grpc.insecure_channel("unix:///tmp/async_map_stream.sock") as channel: stub = mapstream_pb2_grpc.MapStreamStub(channel) request = _empty_pb2.Empty() diff --git a/tests/mapstream/test_async_map_stream_err.py b/tests/mapstream/test_async_map_stream_err.py index 6bf3a246..10927163 100644 --- a/tests/mapstream/test_async_map_stream_err.py +++ b/tests/mapstream/test_async_map_stream_err.py @@ -33,7 +33,7 @@ async def err_async_map_stream_handler(keys: list[str], datum: Datum) -> AsyncIt _s: Server = None -_channel = 
grpc.insecure_channel("localhost:50041") +_channel = grpc.insecure_channel("unix:///tmp/async_map_stream_err.sock") _loop = None @@ -46,10 +46,10 @@ async def start_server(): server = grpc.aio.server() server_instance = MapStreamServer(map_stream_instance=err_async_map_stream_handler) udfs = server_instance.get_servicer( - map_stream_instance=err_async_map_stream_handler, server_type=server.server_type + map_stream_instance=err_async_map_stream_handler, server_type=server_instance.server_type ) mapstream_pb2_grpc.add_MapStreamServicer_to_server(udfs, server) - listen_addr = "[::]:50041" + listen_addr = "unix:///tmp/async_map_stream_err.sock" server.add_insecure_port(listen_addr) logging.info("Starting server on %s", listen_addr) global _s @@ -69,7 +69,7 @@ def setUpClass(cls) -> None: asyncio.run_coroutine_threadsafe(start_server(), loop=loop) while True: try: - with grpc.insecure_channel("localhost:50041") as channel: + with grpc.insecure_channel("unix:///tmp/async_map_stream_err.sock") as channel: f = grpc.channel_ready_future(channel) f.result(timeout=10) if f.done(): diff --git a/tests/reduce/test_async_reduce.py b/tests/reduce/test_async_reduce.py index eae69791..329e3561 100644 --- a/tests/reduce/test_async_reduce.py +++ b/tests/reduce/test_async_reduce.py @@ -15,7 +15,8 @@ Messages, Message, Datum, - Metadata, ReduceServer, + Metadata, + ReduceServer, ) from pynumaflow.proto.reducer import reduce_pb2, reduce_pb2_grpc from tests.testing_utils import ( @@ -69,7 +70,7 @@ def start_request() -> (Datum, tuple): _s: Server = None -_channel = grpc.insecure_channel("localhost:50057") +_channel = grpc.insecure_channel("unix:///tmp/reduce.sock") _loop = None @@ -93,10 +94,12 @@ async def reduce_handler(keys: list[str], datums: Iterator[Datum], md: Metadata) def NewAsyncReducer( reduce_handler=async_reduce_handler, ): - server_instance = ReduceServer(reducer_instance=async_reduce_handler, - server_type=ServerType.Async) - udfs = server_instance.get_servicer(reducer_instance=server_instance.reducer_instance, - server_type=server_instance.server_type) + server_instance = ReduceServer( + reducer_instance=async_reduce_handler, server_type=ServerType.Async + ) + udfs = server_instance.get_servicer( + reducer_instance=server_instance.reducer_instance, server_type=server_instance.server_type + ) return udfs @@ -104,7 +107,7 @@ def NewAsyncReducer( async def start_server(udfs): server = grpc.aio.server() reduce_pb2_grpc.add_ReduceServicer_to_server(udfs, server) - listen_addr = "[::]:50057" + listen_addr = "unix:///tmp/reduce.sock" server.add_insecure_port(listen_addr) logging.info("Starting server on %s", listen_addr) global _s @@ -125,7 +128,7 @@ def setUpClass(cls) -> None: asyncio.run_coroutine_threadsafe(start_server(udfs), loop=loop) while True: try: - with grpc.insecure_channel("localhost:50057") as channel: + with grpc.insecure_channel("unix:///tmp/reduce.sock") as channel: f = grpc.channel_ready_future(channel) f.result(timeout=10) if f.done(): @@ -219,7 +222,7 @@ def test_reduce_with_multiple_keys(self) -> None: self.assertEqual(100, count) def test_is_ready(self) -> None: - with grpc.insecure_channel("localhost:50057") as channel: + with grpc.insecure_channel("unix:///tmp/reduce.sock") as channel: stub = reduce_pb2_grpc.ReduceStub(channel) request = _empty_pb2.Empty() diff --git a/tests/sink/test_async_sink.py b/tests/sink/test_async_sink.py index 005e07c6..db2ea793 100644 --- a/tests/sink/test_async_sink.py +++ b/tests/sink/test_async_sink.py @@ -58,7 +58,7 @@ def 
start_sink_streaming_request(err=False) -> (Datum, tuple): _s: Server = None -_channel = grpc.insecure_channel("localhost:50055") +_channel = grpc.insecure_channel("unix:///tmp/async_sink.sock") _loop = None @@ -72,7 +72,7 @@ async def start_server(): server_instance = SinkServer(sinker_instance=udsink_handler, server_type=ServerType.Async) uds = server_instance.get_servicer(sinker_instance=udsink_handler, server_type=ServerType.Async) sink_pb2_grpc.add_SinkServicer_to_server(uds, server) - listen_addr = "[::]:50055" + listen_addr = "unix:///tmp/async_sink.sock" server.add_insecure_port(listen_addr) logging.info("Starting server on %s", listen_addr) global _s @@ -92,7 +92,7 @@ def setUpClass(cls) -> None: asyncio.run_coroutine_threadsafe(start_server(), loop=loop) while True: try: - with grpc.insecure_channel("localhost:50055") as channel: + with grpc.insecure_channel("unix:///tmp/async_sink.sock") as channel: f = grpc.channel_ready_future(channel) f.result(timeout=10) if f.done(): @@ -111,7 +111,7 @@ def tearDownClass(cls) -> None: # def test_run_server(self) -> None: - with grpc.insecure_channel("localhost:50055") as channel: + with grpc.insecure_channel("unix:///tmp/async_sink.sock") as channel: stub = sink_pb2_grpc.SinkStub(channel) request = _empty_pb2.Empty() diff --git a/tests/source/test_async_source.py b/tests/source/test_async_source.py index c62291c3..b7c7d179 100644 --- a/tests/source/test_async_source.py +++ b/tests/source/test_async_source.py @@ -26,7 +26,7 @@ # if set to true, map handler will raise a `ValueError` exception. raise_error_from_map = False -server_port = "localhost:50058" +server_port = "unix:///tmp/async_source.sock" _s: Server = None _channel = grpc.insecure_channel(server_port) @@ -51,7 +51,7 @@ def NewAsyncSourcer(): async def start_server(udfs): server = grpc.aio.server() source_pb2_grpc.add_SourceServicer_to_server(udfs, server) - listen_addr = "[::]:50058" + listen_addr = "unix:///tmp/async_source.sock" server.add_insecure_port(listen_addr) logging.info("Starting server on %s", listen_addr) global _s diff --git a/tests/source/test_async_source_err.py b/tests/source/test_async_source_err.py index a0b689fd..d61d55a5 100644 --- a/tests/source/test_async_source_err.py +++ b/tests/source/test_async_source_err.py @@ -21,7 +21,7 @@ LOGGER = setup_logging(__name__) _s: Server = None -server_port = "localhost:50062" +server_port = "unix:///tmp/async_source_err.sock" _channel = grpc.insecure_channel(server_port) _loop = None @@ -39,7 +39,7 @@ async def start_server(): sourcer_instance=server_instance.sourcer_instance, server_type=server_instance.server_type ) source_pb2_grpc.add_SourceServicer_to_server(udfs, server) - listen_addr = "[::]:50062" + listen_addr = "unix:///tmp/async_source_err.sock" server.add_insecure_port(listen_addr) logging.info("Starting server on %s", listen_addr) global _s @@ -59,7 +59,7 @@ def setUpClass(cls) -> None: asyncio.run_coroutine_threadsafe(start_server(), loop=loop) while True: try: - with grpc.insecure_channel("localhost:50062") as channel: + with grpc.insecure_channel("unix:///tmp/async_source_err.sock") as channel: f = grpc.channel_ready_future(channel) f.result(timeout=10) if f.done(): @@ -71,6 +71,7 @@ def setUpClass(cls) -> None: @classmethod def tearDownClass(cls) -> None: try: + _s.stop() _loop.stop() LOGGER.info("stopped the event loop") except Exception as e: From 0a3c00427f96898c3fd5d6b86fa40be51cc85205 Mon Sep 17 00:00:00 2001 From: Sidhant Kohli Date: Wed, 10 Jan 2024 12:28:11 -0800 Subject: [PATCH 42/78] tests 
Signed-off-by: Sidhant Kohli --- tests/source/test_async_source_err.py | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/tests/source/test_async_source_err.py b/tests/source/test_async_source_err.py index d61d55a5..bf5ec795 100644 --- a/tests/source/test_async_source_err.py +++ b/tests/source/test_async_source_err.py @@ -21,7 +21,7 @@ LOGGER = setup_logging(__name__) _s: Server = None -server_port = "unix:///tmp/async_source_err.sock" +server_port = "localhost:50062" _channel = grpc.insecure_channel(server_port) _loop = None @@ -35,11 +35,17 @@ async def start_server(): server = grpc.aio.server() class_instance = AsyncSourceError() server_instance = SourceServer(sourcer_instance=class_instance, server_type=ServerType.Async) - udfs = server.get_servicer( - sourcer_instance=server_instance.sourcer_instance, server_type=server_instance.server_type + udfs = server_instance.get_servicer( + sourcer_instance=server_instance.sourcer_instance, server_type=ServerType.Async ) + # udfs = AsyncSourcer( + # read_handler=err_async_source_read_handler, + # ack_handler=err_async_source_ack_handler, + # pending_handler=err_async_source_pending_handler, + # partitions_handler=err_async_source_partition_handler, + # ) source_pb2_grpc.add_SourceServicer_to_server(udfs, server) - listen_addr = "unix:///tmp/async_source_err.sock" + listen_addr = "[::]:50062" server.add_insecure_port(listen_addr) logging.info("Starting server on %s", listen_addr) global _s @@ -59,7 +65,7 @@ def setUpClass(cls) -> None: asyncio.run_coroutine_threadsafe(start_server(), loop=loop) while True: try: - with grpc.insecure_channel("unix:///tmp/async_source_err.sock") as channel: + with grpc.insecure_channel("localhost:50062") as channel: f = grpc.channel_ready_future(channel) f.result(timeout=10) if f.done(): @@ -71,7 +77,6 @@ def setUpClass(cls) -> None: @classmethod def tearDownClass(cls) -> None: try: - _s.stop() _loop.stop() LOGGER.info("stopped the event loop") except Exception as e: From 50fd5c35ab1267b730705f2416575038781bb648 Mon Sep 17 00:00:00 2001 From: Sidhant Kohli Date: Wed, 10 Jan 2024 12:32:47 -0800 Subject: [PATCH 43/78] tests Signed-off-by: Sidhant Kohli --- tests/source/test_async_source_err.py | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/tests/source/test_async_source_err.py b/tests/source/test_async_source_err.py index bf5ec795..0a1f9a61 100644 --- a/tests/source/test_async_source_err.py +++ b/tests/source/test_async_source_err.py @@ -21,7 +21,7 @@ LOGGER = setup_logging(__name__) _s: Server = None -server_port = "localhost:50062" +server_port = "unix:///tmp/async_err_source.sock" _channel = grpc.insecure_channel(server_port) _loop = None @@ -38,14 +38,8 @@ async def start_server(): udfs = server_instance.get_servicer( sourcer_instance=server_instance.sourcer_instance, server_type=ServerType.Async ) - # udfs = AsyncSourcer( - # read_handler=err_async_source_read_handler, - # ack_handler=err_async_source_ack_handler, - # pending_handler=err_async_source_pending_handler, - # partitions_handler=err_async_source_partition_handler, - # ) source_pb2_grpc.add_SourceServicer_to_server(udfs, server) - listen_addr = "[::]:50062" + listen_addr = "unix:///tmp/async_err_source.sock" server.add_insecure_port(listen_addr) logging.info("Starting server on %s", listen_addr) global _s @@ -65,7 +59,7 @@ def setUpClass(cls) -> None: asyncio.run_coroutine_threadsafe(start_server(), loop=loop) while True: try: - with grpc.insecure_channel("localhost:50062") as channel: + 
with grpc.insecure_channel("unix:///tmp/async_err_source.sock") as channel: f = grpc.channel_ready_future(channel) f.result(timeout=10) if f.done(): From c76de42ca3310c7e6694c869784cf3f7ce164909 Mon Sep 17 00:00:00 2001 From: Sidhant Kohli Date: Wed, 10 Jan 2024 13:34:40 -0800 Subject: [PATCH 44/78] examples Signed-off-by: Sidhant Kohli --- examples/map/even_odd/example.py | 4 +- examples/map/even_odd/pyproject.toml | 2 +- examples/map/flatmap/example.py | 28 +++++----- examples/map/flatmap/pyproject.toml | 2 +- examples/map/forward_message/example.py | 4 +- examples/map/forward_message/pyproject.toml | 2 +- examples/map/multiproc_map/README.md | 5 +- examples/map/multiproc_map/example.py | 29 +++++----- examples/map/multiproc_map/pyproject.toml | 2 +- examples/mapstream/flatmap_stream/example.py | 11 ++-- .../mapstream/flatmap_stream/pipeline.yaml | 53 +++++++++++++++++++ .../mapstream/flatmap_stream/pyproject.toml | 2 +- examples/reduce/counter/Makefile | 2 +- examples/reduce/counter/example.py | 30 +++++++++-- examples/reduce/counter/pipeline.yaml | 51 ++++++++++++++++++ examples/reduce/counter/pyproject.toml | 3 +- .../sideinput/simple-sideinput/udf/example.py | 4 +- examples/sink/async_log/Makefile | 2 +- examples/sink/async_log/example.py | 22 ++++++-- .../sink/async_log/pipeline-numaflow.yaml | 10 +++- examples/sink/async_log/pyproject.toml | 2 +- examples/sink/log/Makefile | 2 +- examples/sink/log/example.py | 20 ++++++- examples/sink/log/pipeline-numaflow.yaml | 10 +++- examples/sink/log/pyproject.toml | 2 +- examples/source/async-source/Makefile | 2 +- examples/source/async-source/example.py | 13 ++--- .../async-source/pipeline-numaflow.yaml | 2 +- examples/source/async-source/pyproject.toml | 3 +- examples/source/simple-source/Makefile | 2 +- examples/source/simple-source/example.py | 12 ++--- .../simple-source/pipeline-numaflow.yaml | 2 +- examples/source/simple-source/pyproject.toml | 3 +- .../event_time_filter/Makefile | 2 +- .../event_time_filter/example.py | 4 +- .../event_time_filter/pyproject.toml | 2 +- 36 files changed, 262 insertions(+), 89 deletions(-) create mode 100644 examples/mapstream/flatmap_stream/pipeline.yaml create mode 100644 examples/reduce/counter/pipeline.yaml diff --git a/examples/map/even_odd/example.py b/examples/map/even_odd/example.py index 68c63ea3..b35fe371 100644 --- a/examples/map/even_odd/example.py +++ b/examples/map/even_odd/example.py @@ -1,4 +1,4 @@ -from pynumaflow.mapper import Messages, Message, Datum, Mapper +from pynumaflow.mapper import Messages, Message, Datum, MapServer def my_handler(keys: list[str], datum: Datum) -> Messages: @@ -22,5 +22,5 @@ def my_handler(keys: list[str], datum: Datum) -> Messages: if __name__ == "__main__": - grpc_server = Mapper(handler=my_handler) + grpc_server = MapServer(mapper_instance=my_handler) grpc_server.start() diff --git a/examples/map/even_odd/pyproject.toml b/examples/map/even_odd/pyproject.toml index d5df62f1..94e721a5 100644 --- a/examples/map/even_odd/pyproject.toml +++ b/examples/map/even_odd/pyproject.toml @@ -6,7 +6,7 @@ authors = ["Numaflow developers"] [tool.poetry.dependencies] python = "~3.10" -pynumaflow = "~0.6.0" +pynumaflow = "~0.6.1" [tool.poetry.dev-dependencies] diff --git a/examples/map/flatmap/example.py b/examples/map/flatmap/example.py index 48642741..3245b92d 100644 --- a/examples/map/flatmap/example.py +++ b/examples/map/flatmap/example.py @@ -1,20 +1,22 @@ -from pynumaflow.mapper import Messages, Message, Datum, Mapper +from pynumaflow.mapper import Messages, Message, 
Datum, MapServer, MapperClass -def my_handler(keys: list[str], datum: Datum) -> Messages: - val = datum.value - _ = datum.event_time - _ = datum.watermark - strs = val.decode("utf-8").split(",") - messages = Messages() - if len(strs) == 0: - messages.append(Message.to_drop()) +class Flatmap(MapperClass): + def handler(self, keys: list[str], datum: Datum) -> Messages: + val = datum.value + _ = datum.event_time + _ = datum.watermark + strs = val.decode("utf-8").split(",") + messages = Messages() + if len(strs) == 0: + messages.append(Message.to_drop()) + return messages + for s in strs: + messages.append(Message(str.encode(s))) return messages - for s in strs: - messages.append(Message(str.encode(s))) - return messages if __name__ == "__main__": - grpc_server = Mapper(handler=my_handler) + flatmap_instance = Flatmap() + grpc_server = MapServer(mapper_instance=flatmap_instance) grpc_server.start() diff --git a/examples/map/flatmap/pyproject.toml b/examples/map/flatmap/pyproject.toml index 3ecf4c88..41fcecb5 100644 --- a/examples/map/flatmap/pyproject.toml +++ b/examples/map/flatmap/pyproject.toml @@ -6,7 +6,7 @@ authors = ["Numaflow developers"] [tool.poetry.dependencies] python = "~3.10" -pynumaflow = "~0.6.0" +pynumaflow = "~0.6.1" [tool.poetry.dev-dependencies] diff --git a/examples/map/forward_message/example.py b/examples/map/forward_message/example.py index a64eb897..d4425fc0 100644 --- a/examples/map/forward_message/example.py +++ b/examples/map/forward_message/example.py @@ -1,4 +1,4 @@ -from pynumaflow.mapper import Messages, Message, Datum, Mapper +from pynumaflow.mapper import Messages, Message, Datum, MapServer def my_handler(keys: list[str], datum: Datum) -> Messages: @@ -11,5 +11,5 @@ def my_handler(keys: list[str], datum: Datum) -> Messages: if __name__ == "__main__": - grpc_server = Mapper(handler=my_handler) + grpc_server = MapServer(mapper_instance=my_handler) grpc_server.start() diff --git a/examples/map/forward_message/pyproject.toml b/examples/map/forward_message/pyproject.toml index 361ba9e5..db000ba6 100644 --- a/examples/map/forward_message/pyproject.toml +++ b/examples/map/forward_message/pyproject.toml @@ -6,7 +6,7 @@ authors = ["Numaflow developers"] [tool.poetry.dependencies] python = "~3.10" -pynumaflow = "~0.6.0" +pynumaflow = "~0.6.1" [tool.poetry.dev-dependencies] diff --git a/examples/map/multiproc_map/README.md b/examples/map/multiproc_map/README.md index 3b42bbc4..0bff3481 100644 --- a/examples/map/multiproc_map/README.md +++ b/examples/map/multiproc_map/README.md @@ -15,10 +15,11 @@ processes to bind to the same port. 
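The map example diffs above all move from the old `Mapper(handler=...)` entrypoint to `MapServer(mapper_instance=...)`, which accepts either a plain function or a `MapperClass` subclass. A minimal sketch of the two equivalent forms, assuming the refactored 0.6.x API shown in this series (`EchoMapper` and `echo_handler` are hypothetical names):

```python
# Minimal sketch, assuming the refactored 0.6.x API shown in the diffs above;
# EchoMapper and echo_handler are hypothetical names.
from pynumaflow.mapper import Messages, Message, Datum, MapServer, MapperClass


def echo_handler(keys: list[str], datum: Datum) -> Messages:
    # Function style: forward the payload with the same keys.
    return Messages(Message(datum.value, keys=keys))


class EchoMapper(MapperClass):
    # Class style: MapperClass.__call__ delegates to handler(), so the
    # servicer can invoke either form the same way.
    def handler(self, keys: list[str], datum: Datum) -> Messages:
        return Messages(Message(datum.value, keys=keys))


if __name__ == "__main__":
    # Either form works; mapper_instance=echo_handler is equally valid.
    grpc_server = MapServer(mapper_instance=EchoMapper())
    grpc_server.start()
```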
To enable multiprocessing mode -1) Start the multiproc server in the UDF using the following command +1) Start the multiproc server in the UDF using the following command, selecting server_type = ServerType.Multiproc ```python if __name__ == "__main__": - grpc_server = MultiProcServer(map_handler=my_handler) + grpc_server = MapServer(mapper_instance=handler, + server_type=ServerType.Multiproc) grpc_server.start() ``` 2) Set the ENV var value `NUM_CPU_MULTIPROC="n"` for the UDF container, diff --git a/examples/map/multiproc_map/example.py b/examples/map/multiproc_map/example.py index 195aad89..4dfb44c5 100644 --- a/examples/map/multiproc_map/example.py +++ b/examples/map/multiproc_map/example.py @@ -1,6 +1,8 @@ import math -from pynumaflow.mapper import Messages, Message, Datum, MultiProcMapper +from pynumaflow._constants import ServerType + +from pynumaflow.mapper import Messages, Message, Datum, MapServer, MapperClass def is_prime(n): @@ -11,23 +13,26 @@ def is_prime(n): return True -def my_handler(keys: list[str], datum: Datum) -> Messages: - val = datum.value - _ = datum.event_time - _ = datum.watermark - messages = Messages() - for i in range(2, 100000): - is_prime(i) - messages.append(Message(val, keys=keys)) - return messages +class PrimeMap(MapperClass): + def handler(self, keys: list[str], datum: Datum) -> Messages: + val = datum.value + _ = datum.event_time + _ = datum.watermark + messages = Messages() + for i in range(2, 100000): + is_prime(i) + messages.append(Message(val, keys=keys)) + return messages if __name__ == "__main__": """ Example of starting a multiprocessing map vertex. To enable set the env variable - MAP_MULTIPROC="true" + NUM_CPU_MULTIPROC="N" + Set the server_type = ServerType.Multiproc in the pipeline config for the numa container.
""" - grpc_server = MultiProcMapper(handler=my_handler) + prime_class = PrimeMap() + grpc_server = MapServer(mapper_instance=prime_class, server_type=ServerType.Multiproc) grpc_server.start() diff --git a/examples/map/multiproc_map/pyproject.toml b/examples/map/multiproc_map/pyproject.toml index 361ba9e5..db000ba6 100644 --- a/examples/map/multiproc_map/pyproject.toml +++ b/examples/map/multiproc_map/pyproject.toml @@ -6,7 +6,7 @@ authors = ["Numaflow developers"] [tool.poetry.dependencies] python = "~3.10" -pynumaflow = "~0.6.0" +pynumaflow = "~0.6.1" [tool.poetry.dev-dependencies] diff --git a/examples/mapstream/flatmap_stream/example.py b/examples/mapstream/flatmap_stream/example.py index 58f78f97..d966ca58 100644 --- a/examples/mapstream/flatmap_stream/example.py +++ b/examples/mapstream/flatmap_stream/example.py @@ -1,7 +1,8 @@ -import aiorun from collections.abc import AsyncIterable -from pynumaflow.mapstreamer import Message, Datum, AsyncMapStreamer +from pynumaflow._constants import ServerType + +from pynumaflow.mapstreamer import Message, Datum, MapStreamServer async def map_stream_handler(_: list[str], datum: Datum) -> AsyncIterable[Message]: @@ -22,5 +23,7 @@ async def map_stream_handler(_: list[str], datum: Datum) -> AsyncIterable[Messag if __name__ == "__main__": - grpc_server = AsyncMapStreamer(handler=map_stream_handler) - aiorun.run(grpc_server.start()) + grpc_server = MapStreamServer( + map_stream_instance=map_stream_handler, server_type=ServerType.Async + ) + grpc_server.start() diff --git a/examples/mapstream/flatmap_stream/pipeline.yaml b/examples/mapstream/flatmap_stream/pipeline.yaml new file mode 100644 index 00000000..30e399e0 --- /dev/null +++ b/examples/mapstream/flatmap_stream/pipeline.yaml @@ -0,0 +1,53 @@ +apiVersion: numaflow.numaproj.io/v1alpha1 +kind: Pipeline +metadata: + name: simple-pipeline +spec: + limits: + readBatchSize: 2 + vertices: + - name: in + source: + # A self data generating source + generator: + rpu: 10 + duration: 1s + - name: flatmap + metadata: + annotations: + numaflow.numaproj.io/map-stream: "true" + limits: + readBatchSize: 1 + udf: + container: + image: "quay.io/kohlisid/numaflow-python/mapstream:v3" + imagePullPolicy: Always + env: + - name: PYTHONDEBUG + value: "true" + - name: SERVER_TYPE + value: "async" + - name : INVOKE + value: "handler" +# - name: NUM_CPU_MULTIPROC +# value: "2" # DO NOT forgekt the double quotes!!! + containerTemplate: + resources: + limits: + cpu: "1" + memory: 2Gi + requests: + cpu: "500m" + memory: 1Gi + env: + - name: NUMAFLOW_DEBUG + value: "true" # DO NOT forget the double quotes!!! + - name: out + sink: + # A simple log printing sink + log: {} + edges: + - from: in + to: flatmap + - from: flatmap + to: out diff --git a/examples/mapstream/flatmap_stream/pyproject.toml b/examples/mapstream/flatmap_stream/pyproject.toml index 7df9056e..da79a6b5 100644 --- a/examples/mapstream/flatmap_stream/pyproject.toml +++ b/examples/mapstream/flatmap_stream/pyproject.toml @@ -6,7 +6,7 @@ authors = ["Numaflow developers"] [tool.poetry.dependencies] python = "~3.10" -pynumaflow = "~0.6.0" +pynumaflow = "~0.6.1" [tool.poetry.dev-dependencies] diff --git a/examples/reduce/counter/Makefile b/examples/reduce/counter/Makefile index 16146522..42367ab0 100644 --- a/examples/reduce/counter/Makefile +++ b/examples/reduce/counter/Makefile @@ -1,6 +1,6 @@ .PHONY: image image: - docker build -t "quay.io/numaio/numaflow-python/reduce-counter:v0.5.0" . 
+ docker buildx build --no-cache -t "quay.io/kohlisid/numaflow-python/reduce:v3" --platform linux/amd64,linux/arm64 . --push # Github CI runner uses platform linux/amd64. If your local environment don't, the image built by command above might not work # under the CI E2E test environment. # To build an image that supports multiple platforms(linux/amd64,linux/arm64) and push to quay.io, use the following command diff --git a/examples/reduce/counter/example.py b/examples/reduce/counter/example.py index 043b4dd9..7bcd956c 100644 --- a/examples/reduce/counter/example.py +++ b/examples/reduce/counter/example.py @@ -1,7 +1,24 @@ -import aiorun +import os from collections.abc import AsyncIterable -from pynumaflow.reducer import Messages, Message, Datum, Metadata, AsyncReducer +from pynumaflow._constants import ServerType + +from pynumaflow.reducer import Messages, Message, Datum, Metadata, ReduceServer, ReducerClass + + +class ExampleClass(ReducerClass): + async def handler( + self, keys: list[str], datums: AsyncIterable[Datum], md: Metadata + ) -> Messages: + interval_window = md.interval_window + counter = 0 + async for _ in datums: + counter += 1 + msg = ( + f"counter:{counter} interval_window_start:{interval_window.start} " + f"interval_window_end:{interval_window.end}" + ) + return Messages(Message(str.encode(msg), keys=keys)) async def reduce_handler(keys: list[str], datums: AsyncIterable[Datum], md: Metadata) -> Messages: @@ -17,5 +34,10 @@ async def reduce_handler(keys: list[str], datums: AsyncIterable[Datum], md: Meta if __name__ == "__main__": - grpc_server = AsyncReducer(handler=reduce_handler) - aiorun.run(grpc_server.start()) + invoke = os.getenv("INVOKE", "handler") + if invoke == "class": + handler = ExampleClass() + else: + handler = reduce_handler + grpc_server = ReduceServer(reducer_instance=handler, server_type=ServerType.Async) + grpc_server.start() diff --git a/examples/reduce/counter/pipeline.yaml b/examples/reduce/counter/pipeline.yaml new file mode 100644 index 00000000..b8b72bf9 --- /dev/null +++ b/examples/reduce/counter/pipeline.yaml @@ -0,0 +1,51 @@ +apiVersion: numaflow.numaproj.io/v1alpha1 +kind: Pipeline +metadata: + name: even-odd-sum +spec: + vertices: + - name: in + source: + http: {} + - name: atoi + scale: + min: 3 + udf: + container: + # Tell the input number is even or odd, see https://github.com/numaproj/numaflow-go/tree/main/pkg/mapper/examples/even_odd + image: quay.io/numaio/numaflow-go/map-even-odd:v0.5.0 + - name: compute-sum + udf: + container: + # compute the sum + image: quay.io/kohlisid/numaflow-python/reduce:v3 + imagePullPolicy: Always + env: + - name: PYTHONDEBUG + value: "true" + - name: SERVER_TYPE + value: "async" + - name: INVOKE + value: "class" + groupBy: + window: + fixed: + length: 60s + keyed: true + storage: + persistentVolumeClaim: + volumeSize: 10Gi + accessMode: ReadWriteOnce + partitions: 2 + - name: sink + scale: + min: 1 + sink: + log: {} + edges: + - from: in + to: atoi + - from: atoi + to: compute-sum + - from: compute-sum + to: sink diff --git a/examples/reduce/counter/pyproject.toml b/examples/reduce/counter/pyproject.toml index 7c956677..b4ab3665 100644 --- a/examples/reduce/counter/pyproject.toml +++ b/examples/reduce/counter/pyproject.toml @@ -6,8 +6,7 @@ authors = ["Numaflow developers"] [tool.poetry.dependencies] python = "~3.10" -pynumaflow = "~0.6.0" -aiorun = "^2022.11.1" +pynumaflow = "~0.6.1" [tool.poetry.dev-dependencies] diff --git a/examples/sideinput/simple-sideinput/udf/example.py 
b/examples/sideinput/simple-sideinput/udf/example.py index a155c2d6..ba1f7fc9 100644 --- a/examples/sideinput/simple-sideinput/udf/example.py +++ b/examples/sideinput/simple-sideinput/udf/example.py @@ -1,6 +1,6 @@ from threading import Thread import pynumaflow.sideinput as sideinputsdk -from pynumaflow.mapper import Messages, Mapper, Message, Datum +from pynumaflow.mapper import Messages, MapServer, Message, Datum from watchfiles import watch @@ -24,7 +24,7 @@ def watcher(): This function is used to start the GRPC server and the watcher thread. """ daemon = Thread(target=watcher, daemon=True, name="Monitor") - grpc_server = Mapper(handler=my_handler) + grpc_server = MapServer(mapper_instance=my_handler) thread_server = Thread(target=grpc_server.start, daemon=True, name="GRPC Server") daemon.start() thread_server.start() diff --git a/examples/sink/async_log/Makefile b/examples/sink/async_log/Makefile index 5f300878..cf4b0eeb 100644 --- a/examples/sink/async_log/Makefile +++ b/examples/sink/async_log/Makefile @@ -1,6 +1,6 @@ .PHONY: image image: - docker build -t "quay.io/numaio/numaflow-python/async-sink-log:v0.5.0" . + docker buildx build --no-cache -t "quay.io/kohlisid/numaflow-python/sink:v5" --platform linux/amd64,linux/arm64 . --push # Github CI runner uses platform linux/amd64. If your local environment don't, the image built by command above might not work # under the CI E2E test environment. # To build an image that supports multiple platforms(linux/amd64,linux/arm64) and push to quay.io, use the following command diff --git a/examples/sink/async_log/example.py b/examples/sink/async_log/example.py index d8b9ac1f..1ae4dde8 100644 --- a/examples/sink/async_log/example.py +++ b/examples/sink/async_log/example.py @@ -1,8 +1,17 @@ +import os from collections.abc import AsyncIterable -import aiorun -from pynumaflow.sinker import Datum, Responses, Response, AsyncSinker +from pynumaflow.sinker import Datum, Responses, Response, SinkServer, SinkerClass, ServerType + + +class UserDefinedSink(SinkerClass): + async def handler(self, datums: AsyncIterable[Datum]) -> Responses: + responses = Responses() + async for msg in datums: + print("User Defined Sink", msg.value.decode("utf-8")) + responses.append(Response.as_success(msg.id)) + return responses async def udsink_handler(datums: AsyncIterable[Datum]) -> Responses: @@ -14,5 +23,10 @@ async def udsink_handler(datums: AsyncIterable[Datum]) -> Responses: if __name__ == "__main__": - grpc_server = AsyncSinker(handler=udsink_handler) - aiorun.run(grpc_server.start()) + invoke = os.getenv("INVOKE", "handler") + if invoke == "class": + sink_handler = UserDefinedSink() + else: + sink_handler = udsink_handler + grpc_server = SinkServer(sinker_instance=sink_handler, server_type=ServerType.Async) + grpc_server.start() diff --git a/examples/sink/async_log/pipeline-numaflow.yaml b/examples/sink/async_log/pipeline-numaflow.yaml index 690d1d7d..bf1e9d8a 100644 --- a/examples/sink/async_log/pipeline-numaflow.yaml +++ b/examples/sink/async_log/pipeline-numaflow.yaml @@ -21,7 +21,15 @@ spec: args: - python - example.py - image: quay.io/numaio/numaflow-python/async-sink-log:latest + image: quay.io/kohlisid/numaflow-python/sink:v5 + imagePullPolicy: Always + env: + - name: PYTHONDEBUG + value: "true" + - name: SERVER_TYPE + value: "async" + - name: INVOKE + value: "handler" - name: log-output sink: log: {} diff --git a/examples/sink/async_log/pyproject.toml b/examples/sink/async_log/pyproject.toml index 629d9c26..60a417b5 100644 --- 
a/examples/sink/async_log/pyproject.toml +++ b/examples/sink/async_log/pyproject.toml @@ -6,7 +6,7 @@ authors = ["Numaflow developers"] [tool.poetry.dependencies] python = "~3.10" -pynumaflow = "~0.6.0" +pynumaflow = "~0.6.1" [tool.poetry.dev-dependencies] diff --git a/examples/sink/log/Makefile b/examples/sink/log/Makefile index f0997a86..b2441878 100644 --- a/examples/sink/log/Makefile +++ b/examples/sink/log/Makefile @@ -1,6 +1,6 @@ .PHONY: image image: - docker build -t "quay.io/numaio/numaflow-python/sink-log:v0.5.0" . + docker buildx build --no-cache -t "quay.io/kohlisid/numaflow-python/sink:v3" --platform linux/amd64,linux/arm64 . --push # Github CI runner uses platform linux/amd64. If your local environment don't, the image built by command above might not work # under the CI E2E test environment. # To build an image that supports multiple platforms(linux/amd64,linux/arm64) and push to quay.io, use the following command diff --git a/examples/sink/log/example.py b/examples/sink/log/example.py index 653d8489..6c2f48c6 100644 --- a/examples/sink/log/example.py +++ b/examples/sink/log/example.py @@ -1,6 +1,17 @@ +import os from collections.abc import Iterator -from pynumaflow.sinker import Datum, Responses, Response, Sinker +from pynumaflow.sinker import Datum, Responses, Response, SinkServer +from pynumaflow.sinker import SinkerClass + + +class UserDefinedSink(SinkerClass): + def handler(self, datums: Iterator[Datum]) -> Responses: + responses = Responses() + for msg in datums: + print("User Defined Sink", msg.value.decode("utf-8")) + responses.append(Response.as_success(msg.id)) + return responses def udsink_handler(datums: Iterator[Datum]) -> Responses: @@ -12,5 +23,10 @@ def udsink_handler(datums: Iterator[Datum]) -> Responses: if __name__ == "__main__": - grpc_server = Sinker(handler=udsink_handler) + invoke = os.getenv("INVOKE", "handler") + if invoke == "class": + sink_handler = UserDefinedSink() + else: + sink_handler = udsink_handler + grpc_server = SinkServer(sinker_instance=sink_handler) grpc_server.start() diff --git a/examples/sink/log/pipeline-numaflow.yaml b/examples/sink/log/pipeline-numaflow.yaml index c2bc60ad..d7014276 100644 --- a/examples/sink/log/pipeline-numaflow.yaml +++ b/examples/sink/log/pipeline-numaflow.yaml @@ -21,7 +21,15 @@ spec: args: - python - example.py - image: quay.io/numaio/numaflow-python/sink-log:latest + image: quay.io/kohlisid/numaflow-python/sink:v3 + imagePullPolicy: Always + env: + - name: PYTHONDEBUG + value: "true" + - name: SERVER_TYPE + value: "async" + - name: INVOKE + value: "handler" - name: log-output sink: log: {} diff --git a/examples/sink/log/pyproject.toml b/examples/sink/log/pyproject.toml index 629d9c26..60a417b5 100644 --- a/examples/sink/log/pyproject.toml +++ b/examples/sink/log/pyproject.toml @@ -6,7 +6,7 @@ authors = ["Numaflow developers"] [tool.poetry.dependencies] python = "~3.10" -pynumaflow = "~0.6.0" +pynumaflow = "~0.6.1" [tool.poetry.dev-dependencies] diff --git a/examples/source/async-source/Makefile b/examples/source/async-source/Makefile index 782ee86d..7a616f4b 100644 --- a/examples/source/async-source/Makefile +++ b/examples/source/async-source/Makefile @@ -1,6 +1,6 @@ .PHONY: image image: - docker build -t "quay.io/numaio/numaflow-python/async-source:v0.5.5" . + docker buildx build --no-cache -t "quay.io/kohlisid/numaflow-python/source:v5" --platform linux/amd64,linux/arm64 . --push # Github CI runner uses platform linux/amd64. 
If your local environment don't, the image built by command above might not work # under the CI E2E test environment. # To build an image that supports multiple platforms(linux/amd64,linux/arm64) and push to quay.io, use the following command diff --git a/examples/source/async-source/example.py b/examples/source/async-source/example.py index cf413808..f5775730 100644 --- a/examples/source/async-source/example.py +++ b/examples/source/async-source/example.py @@ -2,6 +2,7 @@ from collections.abc import AsyncIterable import aiorun +from pynumaflow._constants import ServerType from pynumaflow.sourcer import ( ReadRequest, @@ -9,13 +10,14 @@ AckRequest, PendingResponse, Offset, - AsyncSourcer, PartitionsResponse, get_default_partitions, + SourceServer, + SourcerClass, ) -class AsyncSource: +class AsyncSource(SourcerClass): """ AsyncSource is a class for User Defined Source implementation. """ @@ -69,10 +71,5 @@ async def partitions_handler(self) -> PartitionsResponse: if __name__ == "__main__": ud_source = AsyncSource() - grpc_server = AsyncSourcer( - read_handler=ud_source.read_handler, - ack_handler=ud_source.ack_handler, - pending_handler=ud_source.pending_handler, - partitions_handler=ud_source.partitions_handler, - ) + grpc_server = SourceServer(sourcer_instance=ud_source, server_type=ServerType.Async) aiorun.run(grpc_server.start()) diff --git a/examples/source/async-source/pipeline-numaflow.yaml b/examples/source/async-source/pipeline-numaflow.yaml index 1626001e..d7cfbb8f 100644 --- a/examples/source/async-source/pipeline-numaflow.yaml +++ b/examples/source/async-source/pipeline-numaflow.yaml @@ -9,7 +9,7 @@ spec: udsource: container: # A simple user-defined async source - image: quay.io/numaio/numaflow-python/async-source:v0.5.5 + image: quay.io/kohlisid/numaflow-python/source:v5 imagePullPolicy: Always limits: readBatchSize: 2 diff --git a/examples/source/async-source/pyproject.toml b/examples/source/async-source/pyproject.toml index bf23dd70..15ee2ec1 100644 --- a/examples/source/async-source/pyproject.toml +++ b/examples/source/async-source/pyproject.toml @@ -6,8 +6,7 @@ authors = ["Numaflow developers"] [tool.poetry.dependencies] python = "~3.10" -pynumaflow = "~0.6.0" -aiorun = "^2023.7" +pynumaflow = "~0.6.1" [tool.poetry.dev-dependencies] diff --git a/examples/source/simple-source/Makefile b/examples/source/simple-source/Makefile index 1c350bc5..613ec3f2 100644 --- a/examples/source/simple-source/Makefile +++ b/examples/source/simple-source/Makefile @@ -1,6 +1,6 @@ .PHONY: image image: - docker build -t "quay.io/numaio/numaflow-python/simple-source:v0.5.5" . + docker buildx build --no-cache -t "quay.io/kohlisid/numaflow-python/source:v4" --platform linux/amd64,linux/arm64 . --push # Github CI runner uses platform linux/amd64. If your local environment don't, the image built by command above might not work # under the CI E2E test environment. 
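The async-source diff above collapses the old four-handler constructor (`read_handler=`, `ack_handler=`, `pending_handler=`, `partitions_handler=`) into a single `SourcerClass` instance handed to `SourceServer`. A wiring-only sketch of the new shape, assuming the API from this patch; `MyAsyncSource` is a hypothetical name and its handler bodies are elided (they would be identical to the `AsyncSource` methods shown above):

```python
# Wiring-only sketch, assuming the 0.6.x sourcer API from this patch;
# MyAsyncSource is a hypothetical name. Fill in its four handlers before
# running, as in examples/source/async-source/example.py above.
import aiorun

from pynumaflow._constants import ServerType
from pynumaflow.sourcer import SourcerClass, SourceServer


class MyAsyncSource(SourcerClass):
    # Implement read_handler, ack_handler, pending_handler and
    # partitions_handler exactly as in the AsyncSource example above.
    ...


if __name__ == "__main__":
    grpc_server = SourceServer(sourcer_instance=MyAsyncSource(), server_type=ServerType.Async)
    aiorun.run(grpc_server.start())
```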
# To build an image that supports multiple platforms(linux/amd64,linux/arm64) and push to quay.io, use the following command diff --git a/examples/source/simple-source/example.py b/examples/source/simple-source/example.py index 49047b27..46f25daa 100644 --- a/examples/source/simple-source/example.py +++ b/examples/source/simple-source/example.py @@ -4,16 +4,17 @@ from pynumaflow.sourcer import ( ReadRequest, Message, - Sourcer, AckRequest, PendingResponse, Offset, PartitionsResponse, get_default_partitions, + SourcerClass, + SourceServer, ) -class SimpleSource: +class SimpleSource(SourcerClass): """ SimpleSource is a class for User Defined Source implementation. """ @@ -67,10 +68,5 @@ def partitions_handler(self) -> PartitionsResponse: if __name__ == "__main__": ud_source = SimpleSource() - grpc_server = Sourcer( - read_handler=ud_source.read_handler, - ack_handler=ud_source.ack_handler, - pending_handler=ud_source.pending_handler, - partitions_handler=ud_source.partitions_handler, - ) + grpc_server = SourceServer(sourcer_instance=ud_source) grpc_server.start() diff --git a/examples/source/simple-source/pipeline-numaflow.yaml b/examples/source/simple-source/pipeline-numaflow.yaml index 920ded41..964e6314 100644 --- a/examples/source/simple-source/pipeline-numaflow.yaml +++ b/examples/source/simple-source/pipeline-numaflow.yaml @@ -9,7 +9,7 @@ spec: udsource: container: # A simple user-defined source for e2e testing - image: quay.io/numaio/numaflow-python/simple-source:v0.5.4 + image: quay.io/kohlisid/numaflow-python/source:v4 imagePullPolicy: Always limits: readBatchSize: 2 diff --git a/examples/source/simple-source/pyproject.toml b/examples/source/simple-source/pyproject.toml index 82428bc2..e57b1751 100644 --- a/examples/source/simple-source/pyproject.toml +++ b/examples/source/simple-source/pyproject.toml @@ -6,8 +6,7 @@ authors = ["Numaflow developers"] [tool.poetry.dependencies] python = "~3.10" -pynumaflow = "~0.6.0" - +pynumaflow = "~0.6.1" [tool.poetry.dev-dependencies] diff --git a/examples/sourcetransform/event_time_filter/Makefile b/examples/sourcetransform/event_time_filter/Makefile index c47eac03..ffdeac30 100644 --- a/examples/sourcetransform/event_time_filter/Makefile +++ b/examples/sourcetransform/event_time_filter/Makefile @@ -1,6 +1,6 @@ .PHONY: image image: - docker build -t "quay.io/numaio/numaflow-python/mapt-event-time-filter:v0.5.0" . + docker buildx build --no-cache -t "quay.io/kohlisid/numaflow-python/transform:v1" --platform linux/amd64,linux/arm64 . --push # Github CI runner uses platform linux/amd64. If your local environment don't, the image built by command above might not work # under the CI E2E test environment. 
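The simple (blocking) source above gets the same treatment, minus the event loop: a `SourcerClass` instance with synchronous handlers served by `SourceServer`'s default sync server type. The wiring, under the same assumptions as the async sketch above (`MySimpleSource` is hypothetical):

```python
# Sync counterpart of the sketch above, same assumptions;
# MySimpleSource is a hypothetical name with blocking handlers.
from pynumaflow.sourcer import SourcerClass, SourceServer


class MySimpleSource(SourcerClass):
    # Synchronous read/ack/pending/partitions handlers, as in
    # examples/source/simple-source/example.py above.
    ...


if __name__ == "__main__":
    # No server_type argument: ServerType.Sync is the default, and start()
    # blocks instead of being driven by aiorun.
    grpc_server = SourceServer(sourcer_instance=MySimpleSource())
    grpc_server.start()
```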
# To build an image that supports multiple platforms(linux/amd64,linux/arm64) and push to quay.io, use the following command diff --git a/examples/sourcetransform/event_time_filter/example.py b/examples/sourcetransform/event_time_filter/example.py index e33604dc..43f49ad2 100644 --- a/examples/sourcetransform/event_time_filter/example.py +++ b/examples/sourcetransform/event_time_filter/example.py @@ -1,7 +1,7 @@ import datetime import logging -from pynumaflow.sourcetransformer import Messages, Message, Datum, SourceTransformer +from pynumaflow.sourcetransformer import Messages, Message, Datum, SourceTransformServer """ This is a simple User Defined Function example which receives a message, applies the following @@ -43,5 +43,5 @@ def my_handler(keys: list[str], datum: Datum) -> Messages: if __name__ == "__main__": - grpc_server = SourceTransformer(handler=my_handler) + grpc_server = SourceTransformServer(source_transform_instance=my_handler) grpc_server.start() diff --git a/examples/sourcetransform/event_time_filter/pyproject.toml b/examples/sourcetransform/event_time_filter/pyproject.toml index a6d19b57..be102248 100644 --- a/examples/sourcetransform/event_time_filter/pyproject.toml +++ b/examples/sourcetransform/event_time_filter/pyproject.toml @@ -8,7 +8,7 @@ packages = [{include = "mapt_event_time_filter"}] [tool.poetry.dependencies] python = ">=3.9, <3.12" -pynumaflow = "~0.6.0" +pynumaflow = "~0.6.1" [build-system] requires = ["poetry-core"] From 2aa9b30f82aa39ff6b01529ee80184d870b5f7f9 Mon Sep 17 00:00:00 2001 From: Sidhant Kohli Date: Wed, 10 Jan 2024 13:47:26 -0800 Subject: [PATCH 45/78] examples Signed-off-by: Sidhant Kohli --- examples/mapstream/flatmap_stream/Makefile | 2 +- examples/mapstream/flatmap_stream/pipeline.yaml | 6 +----- examples/reduce/counter/Makefile | 2 +- examples/reduce/counter/pipeline.yaml | 4 +--- examples/sink/async_log/Makefile | 2 +- examples/sink/async_log/pipeline-numaflow.yaml | 4 +--- examples/sink/log/Makefile | 2 +- examples/sink/log/pipeline-numaflow.yaml | 2 +- examples/source/async-source/Makefile | 2 +- examples/source/async-source/pipeline-numaflow.yaml | 2 +- examples/source/simple-source/Makefile | 2 +- examples/source/simple-source/pipeline-numaflow.yaml | 2 +- examples/sourcetransform/event_time_filter/Makefile | 2 +- 13 files changed, 13 insertions(+), 21 deletions(-) diff --git a/examples/mapstream/flatmap_stream/Makefile b/examples/mapstream/flatmap_stream/Makefile index 824bec93..b84a451b 100644 --- a/examples/mapstream/flatmap_stream/Makefile +++ b/examples/mapstream/flatmap_stream/Makefile @@ -1,6 +1,6 @@ .PHONY: image image: - docker build -t "quay.io/numaio/numaflow-python/map-flatmap-stream:v0.5.0" . + docker build -t "quay.io/numaio/numaflow-python/map-flatmap-stream:v0.6.1" . # Github CI runner uses platform linux/amd64. If your local environment don't, the image built by command above might not work # under the CI E2E test environment. 
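The event-time-filter change in the patch above makes the matching swap for transformers: `SourceTransformer(handler=...)` becomes `SourceTransformServer(source_transform_instance=...)`. A minimal sketch of the new entrypoint, assuming the 0.6.x API from this series; `re_stamp` is a hypothetical handler that only re-stamps the event time, standing in for the filter logic of the real example:

```python
# Minimal sketch, assuming the 0.6.x sourcetransformer API from this patch;
# re_stamp is a hypothetical handler (the real example filters on event time).
from datetime import datetime, timezone

from pynumaflow.sourcetransformer import Messages, Message, Datum, SourceTransformServer


def re_stamp(keys: list[str], datum: Datum) -> Messages:
    # A source transformer must assign an event time to every outgoing message.
    return Messages(Message(datum.value, event_time=datetime.now(timezone.utc), keys=keys))


if __name__ == "__main__":
    grpc_server = SourceTransformServer(source_transform_instance=re_stamp)
    grpc_server.start()
```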
# To build an image that supports multiple platforms(linux/amd64,linux/arm64) and push to quay.io, use the following command diff --git a/examples/mapstream/flatmap_stream/pipeline.yaml b/examples/mapstream/flatmap_stream/pipeline.yaml index 30e399e0..b711f522 100644 --- a/examples/mapstream/flatmap_stream/pipeline.yaml +++ b/examples/mapstream/flatmap_stream/pipeline.yaml @@ -20,17 +20,13 @@ spec: readBatchSize: 1 udf: container: - image: "quay.io/kohlisid/numaflow-python/mapstream:v3" + image: "quay.io/numaio/numaflow-python/map-flatmap-stream:v0.6.1" imagePullPolicy: Always env: - name: PYTHONDEBUG value: "true" - - name: SERVER_TYPE - value: "async" - name: INVOKE value: "handler" -# - name: NUM_CPU_MULTIPROC -# value: "2" # DO NOT forget the double quotes!!! containerTemplate: resources: limits: diff --git a/examples/reduce/counter/Makefile b/examples/reduce/counter/Makefile index 42367ab0..893f8a00 100644 --- a/examples/reduce/counter/Makefile +++ b/examples/reduce/counter/Makefile @@ -1,6 +1,6 @@ .PHONY: image image: - docker buildx build --no-cache -t "quay.io/kohlisid/numaflow-python/reduce:v3" --platform linux/amd64,linux/arm64 . --push + docker build -t "quay.io/numaio/numaflow-python/reduce-counter:v0.6.1" . # Github CI runner uses platform linux/amd64. If your local environment don't, the image built by command above might not work
# To build an image that supports multiple platforms(linux/amd64,linux/arm64) and push to quay.io, use the following command diff --git a/examples/sink/async_log/pipeline-numaflow.yaml b/examples/sink/async_log/pipeline-numaflow.yaml index bf1e9d8a..cdc2f1d2 100644 --- a/examples/sink/async_log/pipeline-numaflow.yaml +++ b/examples/sink/async_log/pipeline-numaflow.yaml @@ -21,13 +21,11 @@ spec: args: - python - example.py - image: quay.io/kohlisid/numaflow-python/sink:v5 + image: quay.io/numaio/numaflow-python/async-sink-log:v0.6.1 imagePullPolicy: Always env: - name: PYTHONDEBUG value: "true" - - name: SERVER_TYPE - value: "async" - name: INVOKE value: "handler" - name: log-output diff --git a/examples/sink/log/Makefile b/examples/sink/log/Makefile index b2441878..f2d7b2a2 100644 --- a/examples/sink/log/Makefile +++ b/examples/sink/log/Makefile @@ -1,6 +1,6 @@ .PHONY: image image: - docker buildx build --no-cache -t "quay.io/kohlisid/numaflow-python/sink:v3" --platform linux/amd64,linux/arm64 . --push + docker build -t "quay.io/numaio/numaflow-python/sink-log:v0.6.1" . # Github CI runner uses platform linux/amd64. If your local environment don't, the image built by command above might not work # under the CI E2E test environment. # To build an image that supports multiple platforms(linux/amd64,linux/arm64) and push to quay.io, use the following command diff --git a/examples/sink/log/pipeline-numaflow.yaml b/examples/sink/log/pipeline-numaflow.yaml index d7014276..9439526e 100644 --- a/examples/sink/log/pipeline-numaflow.yaml +++ b/examples/sink/log/pipeline-numaflow.yaml @@ -21,7 +21,7 @@ spec: args: - python - example.py - image: quay.io/kohlisid/numaflow-python/sink:v3 + image: "quay.io/numaio/numaflow-python/sink-log:v0.6.1" imagePullPolicy: Always env: - name: PYTHONDEBUG diff --git a/examples/source/async-source/Makefile b/examples/source/async-source/Makefile index 7a616f4b..bb43ac51 100644 --- a/examples/source/async-source/Makefile +++ b/examples/source/async-source/Makefile @@ -1,6 +1,6 @@ .PHONY: image image: - docker buildx build --no-cache -t "quay.io/kohlisid/numaflow-python/source:v5" --platform linux/amd64,linux/arm64 . --push + docker build -t "quay.io/numaio/numaflow-python/async-source:v0.6.1" . # Github CI runner uses platform linux/amd64. If your local environment don't, the image built by command above might not work # under the CI E2E test environment. # To build an image that supports multiple platforms(linux/amd64,linux/arm64) and push to quay.io, use the following command diff --git a/examples/source/async-source/pipeline-numaflow.yaml b/examples/source/async-source/pipeline-numaflow.yaml index d7cfbb8f..1a9197cc 100644 --- a/examples/source/async-source/pipeline-numaflow.yaml +++ b/examples/source/async-source/pipeline-numaflow.yaml @@ -9,7 +9,7 @@ spec: udsource: container: # A simple user-defined async source - image: quay.io/kohlisid/numaflow-python/source:v5 + image: "quay.io/numaio/numaflow-python/async-source:v0.6.1" imagePullPolicy: Always limits: readBatchSize: 2 diff --git a/examples/source/simple-source/Makefile b/examples/source/simple-source/Makefile index 613ec3f2..eb4363d2 100644 --- a/examples/source/simple-source/Makefile +++ b/examples/source/simple-source/Makefile @@ -1,6 +1,6 @@ .PHONY: image image: - docker buildx build --no-cache -t "quay.io/kohlisid/numaflow-python/source:v4" --platform linux/amd64,linux/arm64 . --push + docker build -t "quay.io/numaio/numaflow-python/simple-source:v0.6.1" . # Github CI runner uses platform linux/amd64. 
If your local environment don't, the image built by command above might not work # under the CI E2E test environment. # To build an image that supports multiple platforms(linux/amd64,linux/arm64) and push to quay.io, use the following command diff --git a/examples/source/simple-source/pipeline-numaflow.yaml b/examples/source/simple-source/pipeline-numaflow.yaml index 964e6314..66ea6c22 100644 --- a/examples/source/simple-source/pipeline-numaflow.yaml +++ b/examples/source/simple-source/pipeline-numaflow.yaml @@ -9,7 +9,7 @@ spec: udsource: container: # A simple user-defined source for e2e testing - image: quay.io/kohlisid/numaflow-python/source:v4 + image: quay.io/numaio/numaflow-python/simple-source:v0.6.1 imagePullPolicy: Always limits: readBatchSize: 2 diff --git a/examples/sourcetransform/event_time_filter/Makefile b/examples/sourcetransform/event_time_filter/Makefile index ffdeac30..c0df5ba6 100644 --- a/examples/sourcetransform/event_time_filter/Makefile +++ b/examples/sourcetransform/event_time_filter/Makefile @@ -1,6 +1,6 @@ .PHONY: image image: - docker buildx build --no-cache -t "quay.io/kohlisid/numaflow-python/transform:v1" --platform linux/amd64,linux/arm64 . --push + docker build -t "quay.io/numaio/numaflow-python/mapt-event-time-filter:v0.6.1" . # Github CI runner uses platform linux/amd64. If your local environment don't, the image built by command above might not work # under the CI E2E test environment. # To build an image that supports multiple platforms(linux/amd64,linux/arm64) and push to quay.io, use the following command From 49c8b8d0ca5c1bcc701196fa3777b59ee490f055 Mon Sep 17 00:00:00 2001 From: Sidhant Kohli Date: Wed, 10 Jan 2024 15:14:43 -0800 Subject: [PATCH 46/78] cleanup Signed-off-by: Sidhant Kohli --- .codecov.yml | 8 +- Makefile | 16 +- pynumaflow/mapper/_dtypes.py | 7 +- pynumaflow/mapper/async_server.py | 42 +-- pynumaflow/mapper/map.py | 40 ++- pynumaflow/mapper/server.py | 34 +- pynumaflow/mapstreamer/async_server.py | 35 +- pynumaflow/mapstreamer/mapstream.py | 35 +- pynumaflow/proto/mapper/map_pb2.py | 30 +- pynumaflow/proto/mapper/map_pb2_grpc.py | 129 ++++---- pynumaflow/proto/mapstreamer/mapstream_pb2.py | 30 +- .../proto/mapstreamer/mapstream_pb2_grpc.py | 129 +++----- pynumaflow/proto/reducer/reduce_pb2.py | 30 +- pynumaflow/proto/reducer/reduce_pb2_grpc.py | 129 ++++---- pynumaflow/proto/sideinput/sideinput_pb2.py | 22 +- .../proto/sideinput/sideinput_pb2_grpc.py | 129 +++----- pynumaflow/proto/sinker/sink_pb2.py | 32 +- pynumaflow/proto/sinker/sink_pb2_grpc.py | 129 ++++---- pynumaflow/proto/sourcer/source_pb2.py | 70 ++-- pynumaflow/proto/sourcer/source_pb2_grpc.py | 306 +++++++----------- .../proto/sourcetransformer/transform_pb2.py | 30 +- .../sourcetransformer/transform_pb2_grpc.py | 124 +++---- pynumaflow/reducer/_dtypes.py | 5 +- pynumaflow/reducer/async_server.py | 45 +-- pynumaflow/reducer/reduce.py | 35 +- pynumaflow/shared/server.py | 80 +---- pynumaflow/sinker/_dtypes.py | 6 +- pynumaflow/sinker/async_sink.py | 23 +- pynumaflow/sinker/server.py | 23 +- pynumaflow/sinker/sink.py | 43 ++- pynumaflow/sourcer/async_server.py | 49 +-- pynumaflow/sourcer/server.py | 40 +-- pynumaflow/sourcer/source.py | 29 +- pynumaflow/sourcetransformer/_dtypes.py | 9 +- pynumaflow/sourcetransformer/server.py | 35 +- .../sourcetransformer/sourcetransform.py | 32 +- tests/map/test_multiproc_mapper.py | 32 +- tests/map/test_sync_mapper.py | 16 +- tests/mapstream/test_async_map_stream_err.py | 4 + tests/reduce/test_async_reduce.py | 6 + 
tests/sink/test_async_sink.py | 6 + tests/sink/test_server.py | 6 + tests/source/test_sync_source_err.py | 2 + tests/sourcetransform/test_sync_server.py | 4 + 44 files changed, 839 insertions(+), 1227 deletions(-) diff --git a/.codecov.yml b/.codecov.yml index fb3d02de..22838bc6 100644 --- a/.codecov.yml +++ b/.codecov.yml @@ -11,10 +11,4 @@ coverage: ignore: - "examples/" - - "pynumaflow/mapper/proto/*" - - "pynumaflow/sinker/proto/*" - - "pynumaflow/mapstreamer/proto/*" - - "pynumaflow/reducer/proto/*" - - "pynumaflow/sourcetransformer/proto/*" - - "pynumaflow/sideinput/proto/*" - - "pynumaflow/sourcer/proto/*" + - "pynumaflow/proto/*" diff --git a/Makefile b/Makefile index 7c42a33b..af48b9ae 100644 --- a/Makefile +++ b/Makefile @@ -27,13 +27,13 @@ setup: proto: - python3 -m grpc_tools.protoc -I=pynumaflow/sinker/proto --python_out=pynumaflow/sinker/proto --grpc_python_out=pynumaflow/sinker/proto pynumaflow/sinker/proto/*.proto - python3 -m grpc_tools.protoc -I=pynumaflow/mapper/proto --python_out=pynumaflow/mapper/proto --grpc_python_out=pynumaflow/mapper/proto pynumaflow/mapper/proto/*.proto - python3 -m grpc_tools.protoc -I=pynumaflow/mapstreamer/proto --python_out=pynumaflow/mapstreamer/proto --grpc_python_out=pynumaflow/mapstreamer/proto pynumaflow/mapstreamer/proto/*.proto - python3 -m grpc_tools.protoc -I=pynumaflow/reducer/proto --python_out=pynumaflow/reducer/proto --grpc_python_out=pynumaflow/reducer/proto pynumaflow/reducer/proto/*.proto - python3 -m grpc_tools.protoc -I=pynumaflow/sourcetransformer/proto --python_out=pynumaflow/sourcetransformer/proto --grpc_python_out=pynumaflow/sourcetransformer/proto pynumaflow/sourcetransformer/proto/*.proto - python3 -m grpc_tools.protoc -I=pynumaflow/sideinput/proto --python_out=pynumaflow/sideinput/proto --grpc_python_out=pynumaflow/sideinput/proto pynumaflow/sideinput/proto/*.proto - python3 -m grpc_tools.protoc -I=pynumaflow/sourcer/proto --python_out=pynumaflow/sourcer/proto --grpc_python_out=pynumaflow/sourcer/proto pynumaflow/sourcer/proto/*.proto + python3 -m grpc_tools.protoc -I=pynumaflow/proto/sinker --python_out=pynumaflow/proto/sinker --grpc_python_out=pynumaflow/proto/sinker pynumaflow/proto/sinker/*.proto + python3 -m grpc_tools.protoc -I=pynumaflow/proto/mapper --python_out=pynumaflow/proto/mapper --grpc_python_out=pynumaflow/proto/mapper pynumaflow/proto/mapper/*.proto + python3 -m grpc_tools.protoc -I=pynumaflow/proto/mapstreamer --python_out=pynumaflow/proto/mapstreamer --grpc_python_out=pynumaflow/proto/mapstreamer pynumaflow/proto/mapstreamer/*.proto + python3 -m grpc_tools.protoc -I=pynumaflow/proto/reducer --python_out=pynumaflow/proto/reducer --grpc_python_out=pynumaflow/proto/reducer pynumaflow/proto/reducer/*.proto + python3 -m grpc_tools.protoc -I=pynumaflow/proto/sourcetransformer --python_out=pynumaflow/proto/sourcetransformer --grpc_python_out=pynumaflow/proto/sourcetransformer pynumaflow/proto/sourcetransformer/*.proto + python3 -m grpc_tools.protoc -I=pynumaflow/proto/sideinput --python_out=pynumaflow/proto/sideinput --grpc_python_out=pynumaflow/proto/sideinput pynumaflow/proto/sideinput/*.proto + python3 -m grpc_tools.protoc -I=pynumaflow/proto/sourcer --python_out=pynumaflow/proto/sourcer --grpc_python_out=pynumaflow/proto/sourcer pynumaflow/proto/sourcer/*.proto - sed -i '' 's/^\(import.*_pb2\)/from . \1/' pynumaflow/*/proto/*.py + sed -i '' 's/^\(import.*_pb2\)/from . 
\1/' pynumaflow/proto/*/*.py diff --git a/pynumaflow/mapper/_dtypes.py b/pynumaflow/mapper/_dtypes.py index cbddc3bc..63ff1088 100644 --- a/pynumaflow/mapper/_dtypes.py +++ b/pynumaflow/mapper/_dtypes.py @@ -171,14 +171,12 @@ class MapperClass(metaclass=ABCMeta): """ Provides an interface to write a Mapper which will be exposed over a Synchronous gRPC server. - - Args: - """ def __call__(self, *args, **kwargs): """ - Allow to call handler function directly if class instance is sent + This allows the handler function to be executed directly if + the class instance is sent as a callable. """ return self.handler(*args, **kwargs) @@ -190,4 +188,5 @@ def handler(self, keys: list[str], datum: Datum) -> Messages: pass +# MapCallable is a callable which can be used as a handler for the Map UDF MapCallable = Union[MapperClass, MapSyncCallable, MapAsyncCallable] diff --git a/pynumaflow/mapper/async_server.py b/pynumaflow/mapper/async_server.py index d902a362..ec76384f 100644 --- a/pynumaflow/mapper/async_server.py +++ b/pynumaflow/mapper/async_server.py @@ -1,52 +1,18 @@ -import logging -import multiprocessing -import os - - import grpc from google.protobuf import empty_pb2 as _empty_pb2 -from pynumaflow import setup_logging from pynumaflow.mapper._dtypes import Datum from pynumaflow.mapper._dtypes import MapAsyncCallable, MapCallable from pynumaflow.proto.mapper import map_pb2, map_pb2_grpc from pynumaflow.types import NumaflowServicerContext - -_LOGGER = setup_logging(__name__) -if os.getenv("PYTHONDEBUG"): - _LOGGER.setLevel(logging.DEBUG) - -_PROCESS_COUNT = multiprocessing.cpu_count() -MAX_THREADS = int(os.getenv("MAX_THREADS", 0)) or (_PROCESS_COUNT * 4) +from pynumaflow._constants import _LOGGER class AsyncMapper(map_pb2_grpc.MapServicer): """ - Provides an interface to write an Async Mapper - which will be exposed over gRPC. - - Args: - handler: Function callable following the type signature of MapCallable - sock_path: Path to the UNIX Domain Socket - max_message_size: The max message size in bytes the server can receive and send - max_threads: The max number of threads to be spawned; - defaults to number of processors x4 - - Example invocation: - >>> from typing import Iterator - >>> from pynumaflow.mapper import Messages, Message\ - ... Datum, AsyncMapper - ... import aiorun - ... - >>> async def map_handler(key: [str], datum: Datum) -> Messages: - ... val = datum.value - ... _ = datum.event_time - ... _ = datum.watermark - ... messages = Messages(Message(val, keys=keys)) - ... return messages - ... - >>> grpc_server = AsyncMapper(handler=map_handler) - >>> aiorun.run(grpc_server.start()) + This class is used to create a new grpc Async Map Servicer instance. + It implements the MapServicer interface from the proto map.proto file. + Provides the functionality for the required rpc methods. """ def __init__( diff --git a/pynumaflow/mapper/map.py b/pynumaflow/mapper/map.py index 11de212a..2d844bab 100644 --- a/pynumaflow/mapper/map.py +++ b/pynumaflow/mapper/map.py @@ -27,7 +27,7 @@ class MapServer(NumaflowServer): """ - Create a new grpc Server instance. + Create a new grpc Map Server instance. """ def __init__( @@ -39,13 +39,19 @@ def __init__( server_type=ServerType.Sync, ): """ - Create a new grpc Server instance. + Create a new grpc Map Server instance. A new servicer instance is created and attached to the server. The server instance is returned.
- + Args: + mapper_instance: The mapper instance to be used for Map UDF + sock_path: The UNIX socket path to be used for the server max_message_size: The max message size in bytes the server can receive and send max_threads: The max number of threads to be spawned; defaults to number of processors x4 + server_type: The type of server to be used, this can be one of the following: + - ServerType.Sync: Synchronous server + - ServerType.Async: Asynchronous server + - ServerType.Multiproc: Multiprocess server """ self.sock_path = f"unix://{sock_path}" self.max_threads = min(max_threads, int(os.getenv("MAX_THREADS", "4"))) @@ -72,7 +78,9 @@ def __init__( def start(self) -> None: """ - Starts the gRPC server on the given UNIX socket with given max threads. + Starter function for the server class; handles the server type and + starts the server accordingly. If the server type is not supported, + raises NotImplementedError. """ if self.server_type == ServerType.Sync: self.exec() @@ -81,13 +89,14 @@ def start(self) -> None: elif self.server_type == ServerType.Multiproc: self.exec_multiproc() else: - _LOGGER.error("Server type not supported", self.server_type) + _LOGGER.error("Server type not supported - %s", str(self.server_type)) raise NotImplementedError def exec(self): """ Starts the Synchronous gRPC server on the given UNIX socket with given max threads. """ + # Get the servicer instance based on the server type map_servicer = self.get_servicer( mapper_instance=self.mapper_instance, server_type=self.server_type ) @@ -96,7 +105,7 @@ def exec(self): self.sock_path, self.max_threads, ) - # sync_server_start(server=server) + # Start the server sync_server_start( servicer=map_servicer, bind_address=self.sock_path, @@ -107,11 +116,16 @@ def exec(self): def exec_multiproc(self): """ - Starts the gRPC server on the given UNIX socket with given max threads. + Starts the multiproc gRPC server on the given UNIX socket with + given max threads. """ + + # Get the servicer instance based on the server type map_servicer = self.get_servicer( mapper_instance=self.mapper_instance, server_type=self.server_type ) + + # Start the multiproc server start_multiproc_server( max_threads=self.max_threads, servicer=map_servicer, @@ -122,8 +136,14 @@ def exec_multiproc(self): async def aexec(self) -> None: """ - Starts the Async gRPC server on the given UNIX socket with given max threads.s + Starts the Async gRPC server on the given UNIX socket with + given max threads.
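The `start()` dispatch above is the crux of the new server model: one `MapServer` constructor, three run modes. A usage sketch under the same assumptions as the earlier examples (`echo_handler` is a hypothetical mapper):

```python
# Usage sketch for the ServerType dispatch above, assuming the 0.6.x API;
# echo_handler is a hypothetical mapper like the one sketched earlier.
from pynumaflow._constants import ServerType
from pynumaflow.mapper import Messages, Message, Datum, MapServer


def echo_handler(keys: list[str], datum: Datum) -> Messages:
    return Messages(Message(datum.value, keys=keys))


if __name__ == "__main__":
    # ServerType.Sync is the default; Async serves from an asyncio event loop,
    # and Multiproc runs one server process per CPU, configurable via the
    # NUM_CPU_MULTIPROC env var described in the multiproc README above.
    grpc_server = MapServer(mapper_instance=echo_handler, server_type=ServerType.Multiproc)
    grpc_server.start()
```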
""" + + # As the server is async, we need to create a new server instance in the + # same thread as the event loop so that all the async calls are made in the + # same context + # Create a new async server instance and add the servicer to it server_new = grpc.aio.server() server_new.add_insecure_port(self.sock_path) map_servicer = self.get_servicer( @@ -131,15 +151,15 @@ async def aexec(self) -> None: ) map_pb2_grpc.add_MapServicer_to_server(map_servicer, server_new) + # Start the async server await start_async_server(server_new, self.sock_path, self.max_threads, self._server_options) def get_servicer(self, mapper_instance: MapCallable, server_type: ServerType): + """Returns the servicer instance based on the server type""" if server_type == ServerType.Sync: map_servicer = Mapper(handler=mapper_instance) elif server_type == ServerType.Async: map_servicer = AsyncMapper(handler=mapper_instance) elif server_type == ServerType.Multiproc: map_servicer = Mapper(handler=mapper_instance) - else: - raise NotImplementedError return map_servicer diff --git a/pynumaflow/mapper/server.py b/pynumaflow/mapper/server.py index 83680cf9..88647dbb 100644 --- a/pynumaflow/mapper/server.py +++ b/pynumaflow/mapper/server.py @@ -1,44 +1,16 @@ -import logging -import os - from google.protobuf import empty_pb2 as _empty_pb2 -from pynumaflow import setup_logging from pynumaflow.mapper._dtypes import MapCallable from pynumaflow.proto.mapper import map_pb2, map_pb2_grpc from pynumaflow.mapper.utils import _map_fn_util from pynumaflow.types import NumaflowServicerContext -_LOGGER = setup_logging(__name__) -if os.getenv("PYTHONDEBUG"): - _LOGGER.setLevel(logging.DEBUG) - class Mapper(map_pb2_grpc.MapServicer): """ - Provides an interface to write a Mapper - which will be exposed over a Synchronous gRPC server. - - Args: - handler: Function callable following the type signature of MapCallable - max_message_size: The max message size in bytes the server can receive and send - max_threads: The max number of threads to be spawned; - defaults to number of processors x4 - - Example invocation: - >>> from typing import Iterator - >>> from pynumaflow.mapper import Messages, Message\ - ... Datum, Mapper - ... - >>> def map_handler(key: [str], datum: Datum) -> Messages: - ... val = datum.value - ... _ = datum.event_time - ... _ = datum.watermark - ... messages = Messages(Message(val, keys=keys)) - ... return messages - ... - >>> grpc_server = Mapper(handler=map_handler) - >>> grpc_server.start() + This class is used to create a new grpc Map Servicer instance. + It implements the MapServicer interface from the proto map.proto file. + Provides the functionality for the required rpc methods. 
""" def __init__( diff --git a/pynumaflow/mapstreamer/async_server.py b/pynumaflow/mapstreamer/async_server.py index 0639b4a0..7f9978bc 100644 --- a/pynumaflow/mapstreamer/async_server.py +++ b/pynumaflow/mapstreamer/async_server.py @@ -1,4 +1,3 @@ -import logging import multiprocessing import os @@ -6,15 +5,11 @@ from google.protobuf import empty_pb2 as _empty_pb2 -from pynumaflow import setup_logging from pynumaflow.mapstreamer import Datum from pynumaflow.mapstreamer._dtypes import MapStreamCallable from pynumaflow.proto.mapstreamer import mapstream_pb2_grpc, mapstream_pb2 from pynumaflow.types import NumaflowServicerContext - -_LOGGER = setup_logging(__name__) -if os.getenv("PYTHONDEBUG"): - _LOGGER.setLevel(logging.DEBUG) +from pynumaflow._constants import _LOGGER _PROCESS_COUNT = multiprocessing.cpu_count() MAX_THREADS = int(os.getenv("MAX_THREADS", 0)) or (_PROCESS_COUNT * 4) @@ -22,30 +17,10 @@ class AsyncMapStreamer(mapstream_pb2_grpc.MapStreamServicer): """ - Provides an interface to write a Map Streamer - which will be exposed over gRPC. - - Args: - handler: Function callable following the type signature of MapStreamCallable - sock_path: Path to the UNIX Domain Socket - max_message_size: The max message size in bytes the server can receive and send - max_threads: The max number of threads to be spawned; - defaults to number of processors x4 - - Example invocation: - >>> from typing import Iterator - >>> from pynumaflow.mapstreamer import Messages, Message \ - ... Datum, AsyncMapStreamer - ... import aiorun - >>> async def map_stream_handler(key: [str], datums: Datum) -> AsyncIterable[Message]: - ... val = datum.value - ... _ = datum.event_time - ... _ = datum.watermark - ... for i in range(10): - ... yield Message(val, keys=keys) - ... - >>> grpc_server = AsyncMapStreamer(handler=map_stream_handler) - >>> aiorun.run(grpc_server.start()) + This class is used to create a new grpc Map Stream Servicer instance. + It implements the MapServicer interface from the proto + mapstream_pb2_grpc.py file. + Provides the functionality for the required rpc methods. """ def __init__( diff --git a/pynumaflow/mapstreamer/mapstream.py b/pynumaflow/mapstreamer/mapstream.py index 71d16b92..e553771a 100644 --- a/pynumaflow/mapstreamer/mapstream.py +++ b/pynumaflow/mapstreamer/mapstream.py @@ -32,7 +32,18 @@ def __init__( max_threads=MAX_THREADS, server_type=ServerType.Async, ): - """ """ + """ + Create a new grpc Map Stream Server instance. + A new servicer instance is created and attached to the server. + The server instance is returned. + Args: + map_stream_instance: The map stream instance to be used for Map Stream UDF + sock_path: The UNIX socket path to be used for the server + max_message_size: The max message size in bytes the server can receive and send + max_threads: The max number of threads to be spawned; + defaults to number of processors x4 + server_type: The type of server to be used + """ self.map_stream_instance: MapStreamCallable = map_stream_instance self.sock_path = f"unix://{sock_path}" self.max_message_size = max_message_size @@ -45,13 +56,28 @@ def __init__( ] def start(self): + """ + Starter function for the Map Stream server, Handles the server type and + starts the server accordingly. If the server type is not supported, + raises NotImplementedError. 
+ Currently supported server types are: + - ServerType.Async: Asynchronous server + """ if self.server_type == ServerType.Async: aiorun.run(self.aexec()) else: - _LOGGER.error("Server type not supported", self.server_type) + _LOGGER.error("Server type not supported - %s", str(self.server_type)) raise NotImplementedError async def aexec(self): + """ + Starts the Async gRPC server on the given UNIX socket with + given max threads. + """ + # As the server is async, we need to create a new server instance in the + # same thread as the event loop so that all the async calls are made in the + # same context + # Create a new async server instance and add the servicer to it server = grpc.aio.server() server.add_insecure_port(self.sock_path) map_servicer = self.get_servicer( @@ -65,6 +91,11 @@ async def aexec(self): await start_async_server(server, self.sock_path, self.max_threads, self._server_options) def get_servicer(self, map_stream_instance: MapStreamCallable, server_type: ServerType): + """ + Returns the servicer instance based on the server type. + Currently supported server types are: + - ServerType.Async: Asynchronous server + """ if server_type == ServerType.Async: map_servicer = AsyncMapStreamer(handler=map_stream_instance) else: diff --git a/pynumaflow/proto/mapper/map_pb2.py b/pynumaflow/proto/mapper/map_pb2.py index ddb812df..edb4dbd4 100644 --- a/pynumaflow/proto/mapper/map_pb2.py +++ b/pynumaflow/proto/mapper/map_pb2.py @@ -1,12 +1,12 @@ # -*- coding: utf-8 -*- # Generated by the protocol buffer compiler. DO NOT EDIT! # source: map.proto +# Protobuf Python Version: 4.25.0 """Generated protocol buffer code.""" from google.protobuf import descriptor as _descriptor from google.protobuf import descriptor_pool as _descriptor_pool from google.protobuf import symbol_database as _symbol_database from google.protobuf.internal import builder as _builder - # @@protoc_insertion_point(imports) _sym_db = _symbol_database.Default() @@ -16,23 +16,21 @@ from google.protobuf import timestamp_pb2 as google_dot_protobuf_dot_timestamp__pb2 -DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile( - b'\n\tmap.proto\x12\x06map.v1\x1a\x1bgoogle/protobuf/empty.proto\x1a\x1fgoogle/protobuf/timestamp.proto"\x88\x01\n\nMapRequest\x12\x0c\n\x04keys\x18\x01 \x03(\t\x12\r\n\x05value\x18\x02 \x01(\x0c\x12.\n\nevent_time\x18\x03 \x01(\x0b\x32\x1a.google.protobuf.Timestamp\x12-\n\twatermark\x18\x04 \x01(\x0b\x32\x1a.google.protobuf.Timestamp"o\n\x0bMapResponse\x12+\n\x07results\x18\x01 \x03(\x0b\x32\x1a.map.v1.MapResponse.Result\x1a\x33\n\x06Result\x12\x0c\n\x04keys\x18\x01 \x03(\t\x12\r\n\x05value\x18\x02 \x01(\x0c\x12\x0c\n\x04tags\x18\x03 \x03(\t"\x1e\n\rReadyResponse\x12\r\n\x05ready\x18\x01 \x01(\x08\x32q\n\x03Map\x12\x30\n\x05MapFn\x12\x12.map.v1.MapRequest\x1a\x13.map.v1.MapResponse\x12\x38\n\x07IsReady\x12\x16.google.protobuf.Empty\x1a\x15.map.v1.ReadyResponseb\x06proto3' -) +DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\tmap.proto\x12\x06map.v1\x1a\x1bgoogle/protobuf/empty.proto\x1a\x1fgoogle/protobuf/timestamp.proto\"\x88\x01\n\nMapRequest\x12\x0c\n\x04keys\x18\x01 \x03(\t\x12\r\n\x05value\x18\x02 \x01(\x0c\x12.\n\nevent_time\x18\x03 \x01(\x0b\x32\x1a.google.protobuf.Timestamp\x12-\n\twatermark\x18\x04 \x01(\x0b\x32\x1a.google.protobuf.Timestamp\"o\n\x0bMapResponse\x12+\n\x07results\x18\x01 \x03(\x0b\x32\x1a.map.v1.MapResponse.Result\x1a\x33\n\x06Result\x12\x0c\n\x04keys\x18\x01 \x03(\t\x12\r\n\x05value\x18\x02 \x01(\x0c\x12\x0c\n\x04tags\x18\x03 
\x03(\t\"\x1e\n\rReadyResponse\x12\r\n\x05ready\x18\x01 \x01(\x08\x32q\n\x03Map\x12\x30\n\x05MapFn\x12\x12.map.v1.MapRequest\x1a\x13.map.v1.MapResponse\x12\x38\n\x07IsReady\x12\x16.google.protobuf.Empty\x1a\x15.map.v1.ReadyResponseb\x06proto3') _globals = globals() _builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, _globals) -_builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, "map_pb2", _globals) +_builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, 'map_pb2', _globals) if _descriptor._USE_C_DESCRIPTORS == False: - DESCRIPTOR._options = None - _globals["_MAPREQUEST"]._serialized_start = 84 - _globals["_MAPREQUEST"]._serialized_end = 220 - _globals["_MAPRESPONSE"]._serialized_start = 222 - _globals["_MAPRESPONSE"]._serialized_end = 333 - _globals["_MAPRESPONSE_RESULT"]._serialized_start = 282 - _globals["_MAPRESPONSE_RESULT"]._serialized_end = 333 - _globals["_READYRESPONSE"]._serialized_start = 335 - _globals["_READYRESPONSE"]._serialized_end = 365 - _globals["_MAP"]._serialized_start = 367 - _globals["_MAP"]._serialized_end = 480 + DESCRIPTOR._options = None + _globals['_MAPREQUEST']._serialized_start=84 + _globals['_MAPREQUEST']._serialized_end=220 + _globals['_MAPRESPONSE']._serialized_start=222 + _globals['_MAPRESPONSE']._serialized_end=333 + _globals['_MAPRESPONSE_RESULT']._serialized_start=282 + _globals['_MAPRESPONSE_RESULT']._serialized_end=333 + _globals['_READYRESPONSE']._serialized_start=335 + _globals['_READYRESPONSE']._serialized_end=365 + _globals['_MAP']._serialized_start=367 + _globals['_MAP']._serialized_end=480 # @@protoc_insertion_point(module_scope) diff --git a/pynumaflow/proto/mapper/map_pb2_grpc.py b/pynumaflow/proto/mapper/map_pb2_grpc.py index 6973f6e7..17345658 100644 --- a/pynumaflow/proto/mapper/map_pb2_grpc.py +++ b/pynumaflow/proto/mapper/map_pb2_grpc.py @@ -3,7 +3,7 @@ import grpc from google.protobuf import empty_pb2 as google_dot_protobuf_dot_empty__pb2 -from pynumaflow.proto.mapper import map_pb2 as map__pb2 +from . import map_pb2 as map__pb2 class MapStub(object): @@ -16,108 +16,87 @@ def __init__(self, channel): channel: A grpc.Channel. """ self.MapFn = channel.unary_unary( - "/map.v1.Map/MapFn", - request_serializer=map__pb2.MapRequest.SerializeToString, - response_deserializer=map__pb2.MapResponse.FromString, - ) + '/map.v1.Map/MapFn', + request_serializer=map__pb2.MapRequest.SerializeToString, + response_deserializer=map__pb2.MapResponse.FromString, + ) self.IsReady = channel.unary_unary( - "/map.v1.Map/IsReady", - request_serializer=google_dot_protobuf_dot_empty__pb2.Empty.SerializeToString, - response_deserializer=map__pb2.ReadyResponse.FromString, - ) + '/map.v1.Map/IsReady', + request_serializer=google_dot_protobuf_dot_empty__pb2.Empty.SerializeToString, + response_deserializer=map__pb2.ReadyResponse.FromString, + ) class MapServicer(object): """Missing associated documentation comment in .proto file.""" def MapFn(self, request, context): - """MapFn applies a function to each map request element.""" + """MapFn applies a function to each map request element. + """ context.set_code(grpc.StatusCode.UNIMPLEMENTED) - context.set_details("Method not implemented!") - raise NotImplementedError("Method not implemented!") + context.set_details('Method not implemented!') + raise NotImplementedError('Method not implemented!') def IsReady(self, request, context): - """IsReady is the heartbeat endpoint for gRPC.""" + """IsReady is the heartbeat endpoint for gRPC. 
+ """ context.set_code(grpc.StatusCode.UNIMPLEMENTED) - context.set_details("Method not implemented!") - raise NotImplementedError("Method not implemented!") + context.set_details('Method not implemented!') + raise NotImplementedError('Method not implemented!') def add_MapServicer_to_server(servicer, server): rpc_method_handlers = { - "MapFn": grpc.unary_unary_rpc_method_handler( - servicer.MapFn, - request_deserializer=map__pb2.MapRequest.FromString, - response_serializer=map__pb2.MapResponse.SerializeToString, - ), - "IsReady": grpc.unary_unary_rpc_method_handler( - servicer.IsReady, - request_deserializer=google_dot_protobuf_dot_empty__pb2.Empty.FromString, - response_serializer=map__pb2.ReadyResponse.SerializeToString, - ), + 'MapFn': grpc.unary_unary_rpc_method_handler( + servicer.MapFn, + request_deserializer=map__pb2.MapRequest.FromString, + response_serializer=map__pb2.MapResponse.SerializeToString, + ), + 'IsReady': grpc.unary_unary_rpc_method_handler( + servicer.IsReady, + request_deserializer=google_dot_protobuf_dot_empty__pb2.Empty.FromString, + response_serializer=map__pb2.ReadyResponse.SerializeToString, + ), } - generic_handler = grpc.method_handlers_generic_handler("map.v1.Map", rpc_method_handlers) + generic_handler = grpc.method_handlers_generic_handler( + 'map.v1.Map', rpc_method_handlers) server.add_generic_rpc_handlers((generic_handler,)) -# This class is part of an EXPERIMENTAL API. + # This class is part of an EXPERIMENTAL API. class Map(object): """Missing associated documentation comment in .proto file.""" @staticmethod - def MapFn( - request, - target, - options=(), - channel_credentials=None, - call_credentials=None, - insecure=False, - compression=None, - wait_for_ready=None, - timeout=None, - metadata=None, - ): - return grpc.experimental.unary_unary( - request, + def MapFn(request, target, - "/map.v1.Map/MapFn", + options=(), + channel_credentials=None, + call_credentials=None, + insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None): + return grpc.experimental.unary_unary(request, target, '/map.v1.Map/MapFn', map__pb2.MapRequest.SerializeToString, map__pb2.MapResponse.FromString, - options, - channel_credentials, - insecure, - call_credentials, - compression, - wait_for_ready, - timeout, - metadata, - ) + options, channel_credentials, + insecure, call_credentials, compression, wait_for_ready, timeout, metadata) @staticmethod - def IsReady( - request, - target, - options=(), - channel_credentials=None, - call_credentials=None, - insecure=False, - compression=None, - wait_for_ready=None, - timeout=None, - metadata=None, - ): - return grpc.experimental.unary_unary( - request, + def IsReady(request, target, - "/map.v1.Map/IsReady", + options=(), + channel_credentials=None, + call_credentials=None, + insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None): + return grpc.experimental.unary_unary(request, target, '/map.v1.Map/IsReady', google_dot_protobuf_dot_empty__pb2.Empty.SerializeToString, map__pb2.ReadyResponse.FromString, - options, - channel_credentials, - insecure, - call_credentials, - compression, - wait_for_ready, - timeout, - metadata, - ) + options, channel_credentials, + insecure, call_credentials, compression, wait_for_ready, timeout, metadata) diff --git a/pynumaflow/proto/mapstreamer/mapstream_pb2.py b/pynumaflow/proto/mapstreamer/mapstream_pb2.py index f1c2c169..7613aed0 100644 --- a/pynumaflow/proto/mapstreamer/mapstream_pb2.py +++ 
b/pynumaflow/proto/mapstreamer/mapstream_pb2.py @@ -1,12 +1,12 @@ # -*- coding: utf-8 -*- # Generated by the protocol buffer compiler. DO NOT EDIT! # source: mapstream.proto +# Protobuf Python Version: 4.25.0 """Generated protocol buffer code.""" from google.protobuf import descriptor as _descriptor from google.protobuf import descriptor_pool as _descriptor_pool from google.protobuf import symbol_database as _symbol_database from google.protobuf.internal import builder as _builder - # @@protoc_insertion_point(imports) _sym_db = _symbol_database.Default() @@ -16,23 +16,21 @@ from google.protobuf import timestamp_pb2 as google_dot_protobuf_dot_timestamp__pb2 -DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile( - b'\n\x0fmapstream.proto\x12\x0cmapstream.v1\x1a\x1bgoogle/protobuf/empty.proto\x1a\x1fgoogle/protobuf/timestamp.proto"\x8e\x01\n\x10MapStreamRequest\x12\x0c\n\x04keys\x18\x01 \x03(\t\x12\r\n\x05value\x18\x02 \x01(\x0c\x12.\n\nevent_time\x18\x03 \x01(\x0b\x32\x1a.google.protobuf.Timestamp\x12-\n\twatermark\x18\x04 \x01(\x0b\x32\x1a.google.protobuf.Timestamp"\x80\x01\n\x11MapStreamResponse\x12\x36\n\x06result\x18\x01 \x01(\x0b\x32&.mapstream.v1.MapStreamResponse.Result\x1a\x33\n\x06Result\x12\x0c\n\x04keys\x18\x01 \x03(\t\x12\r\n\x05value\x18\x02 \x01(\x0c\x12\x0c\n\x04tags\x18\x03 \x03(\t"\x1e\n\rReadyResponse\x12\r\n\x05ready\x18\x01 \x01(\x08\x32\x9d\x01\n\tMapStream\x12P\n\x0bMapStreamFn\x12\x1e.mapstream.v1.MapStreamRequest\x1a\x1f.mapstream.v1.MapStreamResponse0\x01\x12>\n\x07IsReady\x12\x16.google.protobuf.Empty\x1a\x1b.mapstream.v1.ReadyResponseb\x06proto3' -) +DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\x0fmapstream.proto\x12\x0cmapstream.v1\x1a\x1bgoogle/protobuf/empty.proto\x1a\x1fgoogle/protobuf/timestamp.proto\"\x8e\x01\n\x10MapStreamRequest\x12\x0c\n\x04keys\x18\x01 \x03(\t\x12\r\n\x05value\x18\x02 \x01(\x0c\x12.\n\nevent_time\x18\x03 \x01(\x0b\x32\x1a.google.protobuf.Timestamp\x12-\n\twatermark\x18\x04 \x01(\x0b\x32\x1a.google.protobuf.Timestamp\"\x80\x01\n\x11MapStreamResponse\x12\x36\n\x06result\x18\x01 \x01(\x0b\x32&.mapstream.v1.MapStreamResponse.Result\x1a\x33\n\x06Result\x12\x0c\n\x04keys\x18\x01 \x03(\t\x12\r\n\x05value\x18\x02 \x01(\x0c\x12\x0c\n\x04tags\x18\x03 \x03(\t\"\x1e\n\rReadyResponse\x12\r\n\x05ready\x18\x01 \x01(\x08\x32\x9d\x01\n\tMapStream\x12P\n\x0bMapStreamFn\x12\x1e.mapstream.v1.MapStreamRequest\x1a\x1f.mapstream.v1.MapStreamResponse0\x01\x12>\n\x07IsReady\x12\x16.google.protobuf.Empty\x1a\x1b.mapstream.v1.ReadyResponseb\x06proto3') _globals = globals() _builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, _globals) -_builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, "mapstream_pb2", _globals) +_builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, 'mapstream_pb2', _globals) if _descriptor._USE_C_DESCRIPTORS == False: - DESCRIPTOR._options = None - _globals["_MAPSTREAMREQUEST"]._serialized_start = 96 - _globals["_MAPSTREAMREQUEST"]._serialized_end = 238 - _globals["_MAPSTREAMRESPONSE"]._serialized_start = 241 - _globals["_MAPSTREAMRESPONSE"]._serialized_end = 369 - _globals["_MAPSTREAMRESPONSE_RESULT"]._serialized_start = 318 - _globals["_MAPSTREAMRESPONSE_RESULT"]._serialized_end = 369 - _globals["_READYRESPONSE"]._serialized_start = 371 - _globals["_READYRESPONSE"]._serialized_end = 401 - _globals["_MAPSTREAM"]._serialized_start = 404 - _globals["_MAPSTREAM"]._serialized_end = 561 + DESCRIPTOR._options = None + _globals['_MAPSTREAMREQUEST']._serialized_start=96 + _globals['_MAPSTREAMREQUEST']._serialized_end=238 + 
_globals['_MAPSTREAMRESPONSE']._serialized_start=241 + _globals['_MAPSTREAMRESPONSE']._serialized_end=369 + _globals['_MAPSTREAMRESPONSE_RESULT']._serialized_start=318 + _globals['_MAPSTREAMRESPONSE_RESULT']._serialized_end=369 + _globals['_READYRESPONSE']._serialized_start=371 + _globals['_READYRESPONSE']._serialized_end=401 + _globals['_MAPSTREAM']._serialized_start=404 + _globals['_MAPSTREAM']._serialized_end=561 # @@protoc_insertion_point(module_scope) diff --git a/pynumaflow/proto/mapstreamer/mapstream_pb2_grpc.py b/pynumaflow/proto/mapstreamer/mapstream_pb2_grpc.py index 3f2b7901..222a1614 100644 --- a/pynumaflow/proto/mapstreamer/mapstream_pb2_grpc.py +++ b/pynumaflow/proto/mapstreamer/mapstream_pb2_grpc.py @@ -3,7 +3,7 @@ import grpc from google.protobuf import empty_pb2 as google_dot_protobuf_dot_empty__pb2 -from pynumaflow.proto.mapstreamer import mapstream_pb2 as mapstream__pb2 +from . import mapstream_pb2 as mapstream__pb2 class MapStreamStub(object): @@ -16,110 +16,87 @@ def __init__(self, channel): channel: A grpc.Channel. """ self.MapStreamFn = channel.unary_stream( - "/mapstream.v1.MapStream/MapStreamFn", - request_serializer=mapstream__pb2.MapStreamRequest.SerializeToString, - response_deserializer=mapstream__pb2.MapStreamResponse.FromString, - ) + '/mapstream.v1.MapStream/MapStreamFn', + request_serializer=mapstream__pb2.MapStreamRequest.SerializeToString, + response_deserializer=mapstream__pb2.MapStreamResponse.FromString, + ) self.IsReady = channel.unary_unary( - "/mapstream.v1.MapStream/IsReady", - request_serializer=google_dot_protobuf_dot_empty__pb2.Empty.SerializeToString, - response_deserializer=mapstream__pb2.ReadyResponse.FromString, - ) + '/mapstream.v1.MapStream/IsReady', + request_serializer=google_dot_protobuf_dot_empty__pb2.Empty.SerializeToString, + response_deserializer=mapstream__pb2.ReadyResponse.FromString, + ) class MapStreamServicer(object): """Missing associated documentation comment in .proto file.""" def MapStreamFn(self, request, context): - """MapStreamFn applies a function to each request element and returns a stream.""" + """MapStreamFn applies a function to each request element and returns a stream. + """ context.set_code(grpc.StatusCode.UNIMPLEMENTED) - context.set_details("Method not implemented!") - raise NotImplementedError("Method not implemented!") + context.set_details('Method not implemented!') + raise NotImplementedError('Method not implemented!') def IsReady(self, request, context): - """IsReady is the heartbeat endpoint for gRPC.""" + """IsReady is the heartbeat endpoint for gRPC. 
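For contrast with the unary MapFn, a hedged sketch of the unary-stream MapStreamFn contract registered above, written as an async generator under grpc.aio; EchoStreamer is an illustrative name and would be wired in via add_MapStreamServicer_to_server.

from pynumaflow.proto.mapstreamer import mapstream_pb2, mapstream_pb2_grpc


class EchoStreamer(mapstream_pb2_grpc.MapStreamServicer):
    async def MapStreamFn(self, request, context):
        # Unary-stream: yield one MapStreamResponse per emitted datum.
        for key in request.keys:
            result = mapstream_pb2.MapStreamResponse.Result(keys=[key], value=request.value)
            yield mapstream_pb2.MapStreamResponse(result=result)

    async def IsReady(self, request, context):
        return mapstream_pb2.ReadyResponse(ready=True)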
+ """ context.set_code(grpc.StatusCode.UNIMPLEMENTED) - context.set_details("Method not implemented!") - raise NotImplementedError("Method not implemented!") + context.set_details('Method not implemented!') + raise NotImplementedError('Method not implemented!') def add_MapStreamServicer_to_server(servicer, server): rpc_method_handlers = { - "MapStreamFn": grpc.unary_stream_rpc_method_handler( - servicer.MapStreamFn, - request_deserializer=mapstream__pb2.MapStreamRequest.FromString, - response_serializer=mapstream__pb2.MapStreamResponse.SerializeToString, - ), - "IsReady": grpc.unary_unary_rpc_method_handler( - servicer.IsReady, - request_deserializer=google_dot_protobuf_dot_empty__pb2.Empty.FromString, - response_serializer=mapstream__pb2.ReadyResponse.SerializeToString, - ), + 'MapStreamFn': grpc.unary_stream_rpc_method_handler( + servicer.MapStreamFn, + request_deserializer=mapstream__pb2.MapStreamRequest.FromString, + response_serializer=mapstream__pb2.MapStreamResponse.SerializeToString, + ), + 'IsReady': grpc.unary_unary_rpc_method_handler( + servicer.IsReady, + request_deserializer=google_dot_protobuf_dot_empty__pb2.Empty.FromString, + response_serializer=mapstream__pb2.ReadyResponse.SerializeToString, + ), } generic_handler = grpc.method_handlers_generic_handler( - "mapstream.v1.MapStream", rpc_method_handlers - ) + 'mapstream.v1.MapStream', rpc_method_handlers) server.add_generic_rpc_handlers((generic_handler,)) -# This class is part of an EXPERIMENTAL API. + # This class is part of an EXPERIMENTAL API. class MapStream(object): """Missing associated documentation comment in .proto file.""" @staticmethod - def MapStreamFn( - request, - target, - options=(), - channel_credentials=None, - call_credentials=None, - insecure=False, - compression=None, - wait_for_ready=None, - timeout=None, - metadata=None, - ): - return grpc.experimental.unary_stream( - request, + def MapStreamFn(request, target, - "/mapstream.v1.MapStream/MapStreamFn", + options=(), + channel_credentials=None, + call_credentials=None, + insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None): + return grpc.experimental.unary_stream(request, target, '/mapstream.v1.MapStream/MapStreamFn', mapstream__pb2.MapStreamRequest.SerializeToString, mapstream__pb2.MapStreamResponse.FromString, - options, - channel_credentials, - insecure, - call_credentials, - compression, - wait_for_ready, - timeout, - metadata, - ) + options, channel_credentials, + insecure, call_credentials, compression, wait_for_ready, timeout, metadata) @staticmethod - def IsReady( - request, - target, - options=(), - channel_credentials=None, - call_credentials=None, - insecure=False, - compression=None, - wait_for_ready=None, - timeout=None, - metadata=None, - ): - return grpc.experimental.unary_unary( - request, + def IsReady(request, target, - "/mapstream.v1.MapStream/IsReady", + options=(), + channel_credentials=None, + call_credentials=None, + insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None): + return grpc.experimental.unary_unary(request, target, '/mapstream.v1.MapStream/IsReady', google_dot_protobuf_dot_empty__pb2.Empty.SerializeToString, mapstream__pb2.ReadyResponse.FromString, - options, - channel_credentials, - insecure, - call_credentials, - compression, - wait_for_ready, - timeout, - metadata, - ) + options, channel_credentials, + insecure, call_credentials, compression, wait_for_ready, timeout, metadata) diff --git a/pynumaflow/proto/reducer/reduce_pb2.py 
b/pynumaflow/proto/reducer/reduce_pb2.py index f61b8887..ec107b83 100644 --- a/pynumaflow/proto/reducer/reduce_pb2.py +++ b/pynumaflow/proto/reducer/reduce_pb2.py @@ -1,12 +1,12 @@ # -*- coding: utf-8 -*- # Generated by the protocol buffer compiler. DO NOT EDIT! # source: reduce.proto +# Protobuf Python Version: 4.25.0 """Generated protocol buffer code.""" from google.protobuf import descriptor as _descriptor from google.protobuf import descriptor_pool as _descriptor_pool from google.protobuf import symbol_database as _symbol_database from google.protobuf.internal import builder as _builder - # @@protoc_insertion_point(imports) _sym_db = _symbol_database.Default() @@ -16,23 +16,21 @@ from google.protobuf import timestamp_pb2 as google_dot_protobuf_dot_timestamp__pb2 -DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile( - b'\n\x0creduce.proto\x12\treduce.v1\x1a\x1bgoogle/protobuf/empty.proto\x1a\x1fgoogle/protobuf/timestamp.proto"\x8b\x01\n\rReduceRequest\x12\x0c\n\x04keys\x18\x01 \x03(\t\x12\r\n\x05value\x18\x02 \x01(\x0c\x12.\n\nevent_time\x18\x03 \x01(\x0b\x32\x1a.google.protobuf.Timestamp\x12-\n\twatermark\x18\x04 \x01(\x0b\x32\x1a.google.protobuf.Timestamp"x\n\x0eReduceResponse\x12\x31\n\x07results\x18\x01 \x03(\x0b\x32 .reduce.v1.ReduceResponse.Result\x1a\x33\n\x06Result\x12\x0c\n\x04keys\x18\x01 \x03(\t\x12\r\n\x05value\x18\x02 \x01(\x0c\x12\x0c\n\x04tags\x18\x03 \x03(\t"\x1e\n\rReadyResponse\x12\r\n\x05ready\x18\x01 \x01(\x08\x32\x8a\x01\n\x06Reduce\x12\x43\n\x08ReduceFn\x12\x18.reduce.v1.ReduceRequest\x1a\x19.reduce.v1.ReduceResponse(\x01\x30\x01\x12;\n\x07IsReady\x12\x16.google.protobuf.Empty\x1a\x18.reduce.v1.ReadyResponseb\x06proto3' -) +DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\x0creduce.proto\x12\treduce.v1\x1a\x1bgoogle/protobuf/empty.proto\x1a\x1fgoogle/protobuf/timestamp.proto\"\x8b\x01\n\rReduceRequest\x12\x0c\n\x04keys\x18\x01 \x03(\t\x12\r\n\x05value\x18\x02 \x01(\x0c\x12.\n\nevent_time\x18\x03 \x01(\x0b\x32\x1a.google.protobuf.Timestamp\x12-\n\twatermark\x18\x04 \x01(\x0b\x32\x1a.google.protobuf.Timestamp\"x\n\x0eReduceResponse\x12\x31\n\x07results\x18\x01 \x03(\x0b\x32 .reduce.v1.ReduceResponse.Result\x1a\x33\n\x06Result\x12\x0c\n\x04keys\x18\x01 \x03(\t\x12\r\n\x05value\x18\x02 \x01(\x0c\x12\x0c\n\x04tags\x18\x03 \x03(\t\"\x1e\n\rReadyResponse\x12\r\n\x05ready\x18\x01 \x01(\x08\x32\x8a\x01\n\x06Reduce\x12\x43\n\x08ReduceFn\x12\x18.reduce.v1.ReduceRequest\x1a\x19.reduce.v1.ReduceResponse(\x01\x30\x01\x12;\n\x07IsReady\x12\x16.google.protobuf.Empty\x1a\x18.reduce.v1.ReadyResponseb\x06proto3') _globals = globals() _builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, _globals) -_builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, "reduce_pb2", _globals) +_builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, 'reduce_pb2', _globals) if _descriptor._USE_C_DESCRIPTORS == False: - DESCRIPTOR._options = None - _globals["_REDUCEREQUEST"]._serialized_start = 90 - _globals["_REDUCEREQUEST"]._serialized_end = 229 - _globals["_REDUCERESPONSE"]._serialized_start = 231 - _globals["_REDUCERESPONSE"]._serialized_end = 351 - _globals["_REDUCERESPONSE_RESULT"]._serialized_start = 300 - _globals["_REDUCERESPONSE_RESULT"]._serialized_end = 351 - _globals["_READYRESPONSE"]._serialized_start = 353 - _globals["_READYRESPONSE"]._serialized_end = 383 - _globals["_REDUCE"]._serialized_start = 386 - _globals["_REDUCE"]._serialized_end = 524 + DESCRIPTOR._options = None + _globals['_REDUCEREQUEST']._serialized_start=90 + _globals['_REDUCEREQUEST']._serialized_end=229 + 
_globals['_REDUCERESPONSE']._serialized_start=231 + _globals['_REDUCERESPONSE']._serialized_end=351 + _globals['_REDUCERESPONSE_RESULT']._serialized_start=300 + _globals['_REDUCERESPONSE_RESULT']._serialized_end=351 + _globals['_READYRESPONSE']._serialized_start=353 + _globals['_READYRESPONSE']._serialized_end=383 + _globals['_REDUCE']._serialized_start=386 + _globals['_REDUCE']._serialized_end=524 # @@protoc_insertion_point(module_scope) diff --git a/pynumaflow/proto/reducer/reduce_pb2_grpc.py b/pynumaflow/proto/reducer/reduce_pb2_grpc.py index 1e36317e..1fd860b4 100644 --- a/pynumaflow/proto/reducer/reduce_pb2_grpc.py +++ b/pynumaflow/proto/reducer/reduce_pb2_grpc.py @@ -3,7 +3,7 @@ import grpc from google.protobuf import empty_pb2 as google_dot_protobuf_dot_empty__pb2 -from pynumaflow.proto.reducer import reduce_pb2 as reduce__pb2 +from . import reduce_pb2 as reduce__pb2 class ReduceStub(object): @@ -16,108 +16,87 @@ def __init__(self, channel): channel: A grpc.Channel. """ self.ReduceFn = channel.stream_stream( - "/reduce.v1.Reduce/ReduceFn", - request_serializer=reduce__pb2.ReduceRequest.SerializeToString, - response_deserializer=reduce__pb2.ReduceResponse.FromString, - ) + '/reduce.v1.Reduce/ReduceFn', + request_serializer=reduce__pb2.ReduceRequest.SerializeToString, + response_deserializer=reduce__pb2.ReduceResponse.FromString, + ) self.IsReady = channel.unary_unary( - "/reduce.v1.Reduce/IsReady", - request_serializer=google_dot_protobuf_dot_empty__pb2.Empty.SerializeToString, - response_deserializer=reduce__pb2.ReadyResponse.FromString, - ) + '/reduce.v1.Reduce/IsReady', + request_serializer=google_dot_protobuf_dot_empty__pb2.Empty.SerializeToString, + response_deserializer=reduce__pb2.ReadyResponse.FromString, + ) class ReduceServicer(object): """Missing associated documentation comment in .proto file.""" def ReduceFn(self, request_iterator, context): - """ReduceFn applies a reduce function to a request stream.""" + """ReduceFn applies a reduce function to a request stream. + """ context.set_code(grpc.StatusCode.UNIMPLEMENTED) - context.set_details("Method not implemented!") - raise NotImplementedError("Method not implemented!") + context.set_details('Method not implemented!') + raise NotImplementedError('Method not implemented!') def IsReady(self, request, context): - """IsReady is the heartbeat endpoint for gRPC.""" + """IsReady is the heartbeat endpoint for gRPC. 
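ReduceFn is a bidirectional-streaming RPC (channel.stream_stream above), so a handler consumes the full request stream before emitting its aggregate. A hedged sketch under grpc.aio; CountReducer is an illustrative name.

from pynumaflow.proto.reducer import reduce_pb2, reduce_pb2_grpc


class CountReducer(reduce_pb2_grpc.ReduceServicer):
    async def ReduceFn(self, request_iterator, context):
        # Stream-stream: fold the entire request stream into one aggregate,
        # then emit it as a single streamed response.
        count = 0
        async for _ in request_iterator:
            count += 1
        result = reduce_pb2.ReduceResponse.Result(keys=["count"], value=str(count).encode())
        yield reduce_pb2.ReduceResponse(results=[result])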
+ """ context.set_code(grpc.StatusCode.UNIMPLEMENTED) - context.set_details("Method not implemented!") - raise NotImplementedError("Method not implemented!") + context.set_details('Method not implemented!') + raise NotImplementedError('Method not implemented!') def add_ReduceServicer_to_server(servicer, server): rpc_method_handlers = { - "ReduceFn": grpc.stream_stream_rpc_method_handler( - servicer.ReduceFn, - request_deserializer=reduce__pb2.ReduceRequest.FromString, - response_serializer=reduce__pb2.ReduceResponse.SerializeToString, - ), - "IsReady": grpc.unary_unary_rpc_method_handler( - servicer.IsReady, - request_deserializer=google_dot_protobuf_dot_empty__pb2.Empty.FromString, - response_serializer=reduce__pb2.ReadyResponse.SerializeToString, - ), + 'ReduceFn': grpc.stream_stream_rpc_method_handler( + servicer.ReduceFn, + request_deserializer=reduce__pb2.ReduceRequest.FromString, + response_serializer=reduce__pb2.ReduceResponse.SerializeToString, + ), + 'IsReady': grpc.unary_unary_rpc_method_handler( + servicer.IsReady, + request_deserializer=google_dot_protobuf_dot_empty__pb2.Empty.FromString, + response_serializer=reduce__pb2.ReadyResponse.SerializeToString, + ), } - generic_handler = grpc.method_handlers_generic_handler("reduce.v1.Reduce", rpc_method_handlers) + generic_handler = grpc.method_handlers_generic_handler( + 'reduce.v1.Reduce', rpc_method_handlers) server.add_generic_rpc_handlers((generic_handler,)) -# This class is part of an EXPERIMENTAL API. + # This class is part of an EXPERIMENTAL API. class Reduce(object): """Missing associated documentation comment in .proto file.""" @staticmethod - def ReduceFn( - request_iterator, - target, - options=(), - channel_credentials=None, - call_credentials=None, - insecure=False, - compression=None, - wait_for_ready=None, - timeout=None, - metadata=None, - ): - return grpc.experimental.stream_stream( - request_iterator, + def ReduceFn(request_iterator, target, - "/reduce.v1.Reduce/ReduceFn", + options=(), + channel_credentials=None, + call_credentials=None, + insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None): + return grpc.experimental.stream_stream(request_iterator, target, '/reduce.v1.Reduce/ReduceFn', reduce__pb2.ReduceRequest.SerializeToString, reduce__pb2.ReduceResponse.FromString, - options, - channel_credentials, - insecure, - call_credentials, - compression, - wait_for_ready, - timeout, - metadata, - ) + options, channel_credentials, + insecure, call_credentials, compression, wait_for_ready, timeout, metadata) @staticmethod - def IsReady( - request, - target, - options=(), - channel_credentials=None, - call_credentials=None, - insecure=False, - compression=None, - wait_for_ready=None, - timeout=None, - metadata=None, - ): - return grpc.experimental.unary_unary( - request, + def IsReady(request, target, - "/reduce.v1.Reduce/IsReady", + options=(), + channel_credentials=None, + call_credentials=None, + insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None): + return grpc.experimental.unary_unary(request, target, '/reduce.v1.Reduce/IsReady', google_dot_protobuf_dot_empty__pb2.Empty.SerializeToString, reduce__pb2.ReadyResponse.FromString, - options, - channel_credentials, - insecure, - call_credentials, - compression, - wait_for_ready, - timeout, - metadata, - ) + options, channel_credentials, + insecure, call_credentials, compression, wait_for_ready, timeout, metadata) diff --git a/pynumaflow/proto/sideinput/sideinput_pb2.py 
b/pynumaflow/proto/sideinput/sideinput_pb2.py index 8278c1df..50d3de7c 100644 --- a/pynumaflow/proto/sideinput/sideinput_pb2.py +++ b/pynumaflow/proto/sideinput/sideinput_pb2.py @@ -1,12 +1,12 @@ # -*- coding: utf-8 -*- # Generated by the protocol buffer compiler. DO NOT EDIT! # source: sideinput.proto +# Protobuf Python Version: 4.25.0 """Generated protocol buffer code.""" from google.protobuf import descriptor as _descriptor from google.protobuf import descriptor_pool as _descriptor_pool from google.protobuf import symbol_database as _symbol_database from google.protobuf.internal import builder as _builder - # @@protoc_insertion_point(imports) _sym_db = _symbol_database.Default() @@ -15,19 +15,17 @@ from google.protobuf import empty_pb2 as google_dot_protobuf_dot_empty__pb2 -DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile( - b'\n\x0fsideinput.proto\x12\x0csideinput.v1\x1a\x1bgoogle/protobuf/empty.proto"8\n\x11SideInputResponse\x12\r\n\x05value\x18\x01 \x01(\x0c\x12\x14\n\x0cno_broadcast\x18\x02 \x01(\x08"\x1e\n\rReadyResponse\x12\r\n\x05ready\x18\x01 \x01(\x08\x32\x99\x01\n\tSideInput\x12L\n\x11RetrieveSideInput\x12\x16.google.protobuf.Empty\x1a\x1f.sideinput.v1.SideInputResponse\x12>\n\x07IsReady\x12\x16.google.protobuf.Empty\x1a\x1b.sideinput.v1.ReadyResponseb\x06proto3' -) +DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\x0fsideinput.proto\x12\x0csideinput.v1\x1a\x1bgoogle/protobuf/empty.proto\"8\n\x11SideInputResponse\x12\r\n\x05value\x18\x01 \x01(\x0c\x12\x14\n\x0cno_broadcast\x18\x02 \x01(\x08\"\x1e\n\rReadyResponse\x12\r\n\x05ready\x18\x01 \x01(\x08\x32\x99\x01\n\tSideInput\x12L\n\x11RetrieveSideInput\x12\x16.google.protobuf.Empty\x1a\x1f.sideinput.v1.SideInputResponse\x12>\n\x07IsReady\x12\x16.google.protobuf.Empty\x1a\x1b.sideinput.v1.ReadyResponseb\x06proto3') _globals = globals() _builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, _globals) -_builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, "sideinput_pb2", _globals) +_builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, 'sideinput_pb2', _globals) if _descriptor._USE_C_DESCRIPTORS == False: - DESCRIPTOR._options = None - _globals["_SIDEINPUTRESPONSE"]._serialized_start = 62 - _globals["_SIDEINPUTRESPONSE"]._serialized_end = 118 - _globals["_READYRESPONSE"]._serialized_start = 120 - _globals["_READYRESPONSE"]._serialized_end = 150 - _globals["_SIDEINPUT"]._serialized_start = 153 - _globals["_SIDEINPUT"]._serialized_end = 306 + DESCRIPTOR._options = None + _globals['_SIDEINPUTRESPONSE']._serialized_start=62 + _globals['_SIDEINPUTRESPONSE']._serialized_end=118 + _globals['_READYRESPONSE']._serialized_start=120 + _globals['_READYRESPONSE']._serialized_end=150 + _globals['_SIDEINPUT']._serialized_start=153 + _globals['_SIDEINPUT']._serialized_end=306 # @@protoc_insertion_point(module_scope) diff --git a/pynumaflow/proto/sideinput/sideinput_pb2_grpc.py b/pynumaflow/proto/sideinput/sideinput_pb2_grpc.py index abcd0b79..8abe64d2 100644 --- a/pynumaflow/proto/sideinput/sideinput_pb2_grpc.py +++ b/pynumaflow/proto/sideinput/sideinput_pb2_grpc.py @@ -3,7 +3,7 @@ import grpc from google.protobuf import empty_pb2 as google_dot_protobuf_dot_empty__pb2 -from pynumaflow.proto.sideinput import sideinput_pb2 as sideinput__pb2 +from . import sideinput_pb2 as sideinput__pb2 class SideInputStub(object): @@ -24,15 +24,15 @@ def __init__(self, channel): channel: A grpc.Channel. 
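A hedged client-side sketch of the SideInputStub just introduced, assuming a grpc channel over the side-input UNIX socket; the socket path here is illustrative.

import grpc
from google.protobuf import empty_pb2
from pynumaflow.proto.sideinput import sideinput_pb2_grpc

with grpc.insecure_channel("unix:///var/run/numaflow/sideinput.sock") as channel:
    stub = sideinput_pb2_grpc.SideInputStub(channel)
    reply = stub.RetrieveSideInput(empty_pb2.Empty())
    print(reply.value, reply.no_broadcast)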
""" self.RetrieveSideInput = channel.unary_unary( - "/sideinput.v1.SideInput/RetrieveSideInput", - request_serializer=google_dot_protobuf_dot_empty__pb2.Empty.SerializeToString, - response_deserializer=sideinput__pb2.SideInputResponse.FromString, - ) + '/sideinput.v1.SideInput/RetrieveSideInput', + request_serializer=google_dot_protobuf_dot_empty__pb2.Empty.SerializeToString, + response_deserializer=sideinput__pb2.SideInputResponse.FromString, + ) self.IsReady = channel.unary_unary( - "/sideinput.v1.SideInput/IsReady", - request_serializer=google_dot_protobuf_dot_empty__pb2.Empty.SerializeToString, - response_deserializer=sideinput__pb2.ReadyResponse.FromString, - ) + '/sideinput.v1.SideInput/IsReady', + request_serializer=google_dot_protobuf_dot_empty__pb2.Empty.SerializeToString, + response_deserializer=sideinput__pb2.ReadyResponse.FromString, + ) class SideInputServicer(object): @@ -47,38 +47,39 @@ class SideInputServicer(object): """ def RetrieveSideInput(self, request, context): - """RetrieveSideInput is the endpoint to retrieve the latest value of a given Side Input.""" + """RetrieveSideInput is the endpoint to retrieve the latest value of a given Side Input. + """ context.set_code(grpc.StatusCode.UNIMPLEMENTED) - context.set_details("Method not implemented!") - raise NotImplementedError("Method not implemented!") + context.set_details('Method not implemented!') + raise NotImplementedError('Method not implemented!') def IsReady(self, request, context): - """IsReady is the health check endpoint to indicate whether the service is ready to be used.""" + """IsReady is the health check endpoint to indicate whether the service is ready to be used. + """ context.set_code(grpc.StatusCode.UNIMPLEMENTED) - context.set_details("Method not implemented!") - raise NotImplementedError("Method not implemented!") + context.set_details('Method not implemented!') + raise NotImplementedError('Method not implemented!') def add_SideInputServicer_to_server(servicer, server): rpc_method_handlers = { - "RetrieveSideInput": grpc.unary_unary_rpc_method_handler( - servicer.RetrieveSideInput, - request_deserializer=google_dot_protobuf_dot_empty__pb2.Empty.FromString, - response_serializer=sideinput__pb2.SideInputResponse.SerializeToString, - ), - "IsReady": grpc.unary_unary_rpc_method_handler( - servicer.IsReady, - request_deserializer=google_dot_protobuf_dot_empty__pb2.Empty.FromString, - response_serializer=sideinput__pb2.ReadyResponse.SerializeToString, - ), + 'RetrieveSideInput': grpc.unary_unary_rpc_method_handler( + servicer.RetrieveSideInput, + request_deserializer=google_dot_protobuf_dot_empty__pb2.Empty.FromString, + response_serializer=sideinput__pb2.SideInputResponse.SerializeToString, + ), + 'IsReady': grpc.unary_unary_rpc_method_handler( + servicer.IsReady, + request_deserializer=google_dot_protobuf_dot_empty__pb2.Empty.FromString, + response_serializer=sideinput__pb2.ReadyResponse.SerializeToString, + ), } generic_handler = grpc.method_handlers_generic_handler( - "sideinput.v1.SideInput", rpc_method_handlers - ) + 'sideinput.v1.SideInput', rpc_method_handlers) server.add_generic_rpc_handlers((generic_handler,)) -# This class is part of an EXPERIMENTAL API. + # This class is part of an EXPERIMENTAL API. class SideInput(object): """SideInput is the gRPC service for user-defined Side Inputs. 
It is used to propagate changes in the values of the provided Side Inputs @@ -91,59 +92,35 @@ class SideInput(object): """ @staticmethod - def RetrieveSideInput( - request, - target, - options=(), - channel_credentials=None, - call_credentials=None, - insecure=False, - compression=None, - wait_for_ready=None, - timeout=None, - metadata=None, - ): - return grpc.experimental.unary_unary( - request, + def RetrieveSideInput(request, target, - "/sideinput.v1.SideInput/RetrieveSideInput", + options=(), + channel_credentials=None, + call_credentials=None, + insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None): + return grpc.experimental.unary_unary(request, target, '/sideinput.v1.SideInput/RetrieveSideInput', google_dot_protobuf_dot_empty__pb2.Empty.SerializeToString, sideinput__pb2.SideInputResponse.FromString, - options, - channel_credentials, - insecure, - call_credentials, - compression, - wait_for_ready, - timeout, - metadata, - ) + options, channel_credentials, + insecure, call_credentials, compression, wait_for_ready, timeout, metadata) @staticmethod - def IsReady( - request, - target, - options=(), - channel_credentials=None, - call_credentials=None, - insecure=False, - compression=None, - wait_for_ready=None, - timeout=None, - metadata=None, - ): - return grpc.experimental.unary_unary( - request, + def IsReady(request, target, - "/sideinput.v1.SideInput/IsReady", + options=(), + channel_credentials=None, + call_credentials=None, + insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None): + return grpc.experimental.unary_unary(request, target, '/sideinput.v1.SideInput/IsReady', google_dot_protobuf_dot_empty__pb2.Empty.SerializeToString, sideinput__pb2.ReadyResponse.FromString, - options, - channel_credentials, - insecure, - call_credentials, - compression, - wait_for_ready, - timeout, - metadata, - ) + options, channel_credentials, + insecure, call_credentials, compression, wait_for_ready, timeout, metadata) diff --git a/pynumaflow/proto/sinker/sink_pb2.py b/pynumaflow/proto/sinker/sink_pb2.py index b6182a45..deb73b56 100644 --- a/pynumaflow/proto/sinker/sink_pb2.py +++ b/pynumaflow/proto/sinker/sink_pb2.py @@ -1,12 +1,12 @@ # -*- coding: utf-8 -*- # Generated by the protocol buffer compiler. DO NOT EDIT! 
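And the matching server side of the SideInput service described above: a minimal sketch where the servicer returns the latest value, with no_broadcast presumably set to suppress propagating an unchanged value; ClockSideInput is an illustrative name.

import datetime
from pynumaflow.proto.sideinput import sideinput_pb2, sideinput_pb2_grpc


class ClockSideInput(sideinput_pb2_grpc.SideInputServicer):
    def RetrieveSideInput(self, request, context):
        # Setting no_broadcast=True would (presumably) skip broadcasting
        # this value to the consuming vertices.
        now = datetime.datetime.now(datetime.timezone.utc).isoformat().encode()
        return sideinput_pb2.SideInputResponse(value=now, no_broadcast=False)

    def IsReady(self, request, context):
        return sideinput_pb2.ReadyResponse(ready=True)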
# source: sink.proto +# Protobuf Python Version: 4.25.0 """Generated protocol buffer code.""" from google.protobuf import descriptor as _descriptor from google.protobuf import descriptor_pool as _descriptor_pool from google.protobuf import symbol_database as _symbol_database from google.protobuf.internal import builder as _builder - # @@protoc_insertion_point(imports) _sym_db = _symbol_database.Default() @@ -16,24 +16,22 @@ from google.protobuf import timestamp_pb2 as google_dot_protobuf_dot_timestamp__pb2 -DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile( - b'\n\nsink.proto\x12\x07sink.v1\x1a\x1bgoogle/protobuf/empty.proto\x1a\x1fgoogle/protobuf/timestamp.proto"\x95\x01\n\x0bSinkRequest\x12\x0c\n\x04keys\x18\x01 \x03(\t\x12\r\n\x05value\x18\x02 \x01(\x0c\x12.\n\nevent_time\x18\x03 \x01(\x0b\x32\x1a.google.protobuf.Timestamp\x12-\n\twatermark\x18\x04 \x01(\x0b\x32\x1a.google.protobuf.Timestamp\x12\n\n\x02id\x18\x05 \x01(\t"\x1e\n\rReadyResponse\x12\r\n\x05ready\x18\x01 \x01(\x08"u\n\x0cSinkResponse\x12-\n\x07results\x18\x01 \x03(\x0b\x32\x1c.sink.v1.SinkResponse.Result\x1a\x36\n\x06Result\x12\n\n\x02id\x18\x01 \x01(\t\x12\x0f\n\x07success\x18\x02 \x01(\x08\x12\x0f\n\x07\x65rr_msg\x18\x03 \x01(\t2z\n\x04Sink\x12\x37\n\x06SinkFn\x12\x14.sink.v1.SinkRequest\x1a\x15.sink.v1.SinkResponse(\x01\x12\x39\n\x07IsReady\x12\x16.google.protobuf.Empty\x1a\x16.sink.v1.ReadyResponseB8Z6github.com/numaproj/numaflow-go/pkg/apis/proto/sink/v1b\x06proto3' -) +DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\nsink.proto\x12\x07sink.v1\x1a\x1bgoogle/protobuf/empty.proto\x1a\x1fgoogle/protobuf/timestamp.proto\"\x95\x01\n\x0bSinkRequest\x12\x0c\n\x04keys\x18\x01 \x03(\t\x12\r\n\x05value\x18\x02 \x01(\x0c\x12.\n\nevent_time\x18\x03 \x01(\x0b\x32\x1a.google.protobuf.Timestamp\x12-\n\twatermark\x18\x04 \x01(\x0b\x32\x1a.google.protobuf.Timestamp\x12\n\n\x02id\x18\x05 \x01(\t\"\x1e\n\rReadyResponse\x12\r\n\x05ready\x18\x01 \x01(\x08\"u\n\x0cSinkResponse\x12-\n\x07results\x18\x01 \x03(\x0b\x32\x1c.sink.v1.SinkResponse.Result\x1a\x36\n\x06Result\x12\n\n\x02id\x18\x01 \x01(\t\x12\x0f\n\x07success\x18\x02 \x01(\x08\x12\x0f\n\x07\x65rr_msg\x18\x03 \x01(\t2z\n\x04Sink\x12\x37\n\x06SinkFn\x12\x14.sink.v1.SinkRequest\x1a\x15.sink.v1.SinkResponse(\x01\x12\x39\n\x07IsReady\x12\x16.google.protobuf.Empty\x1a\x16.sink.v1.ReadyResponseB8Z6github.com/numaproj/numaflow-go/pkg/apis/proto/sink/v1b\x06proto3') _globals = globals() _builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, _globals) -_builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, "sink_pb2", _globals) +_builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, 'sink_pb2', _globals) if _descriptor._USE_C_DESCRIPTORS == False: - DESCRIPTOR._options = None - DESCRIPTOR._serialized_options = b"Z6github.com/numaproj/numaflow-go/pkg/apis/proto/sink/v1" - _globals["_SINKREQUEST"]._serialized_start = 86 - _globals["_SINKREQUEST"]._serialized_end = 235 - _globals["_READYRESPONSE"]._serialized_start = 237 - _globals["_READYRESPONSE"]._serialized_end = 267 - _globals["_SINKRESPONSE"]._serialized_start = 269 - _globals["_SINKRESPONSE"]._serialized_end = 386 - _globals["_SINKRESPONSE_RESULT"]._serialized_start = 332 - _globals["_SINKRESPONSE_RESULT"]._serialized_end = 386 - _globals["_SINK"]._serialized_start = 388 - _globals["_SINK"]._serialized_end = 510 + _globals['DESCRIPTOR']._options = None + _globals['DESCRIPTOR']._serialized_options = b'Z6github.com/numaproj/numaflow-go/pkg/apis/proto/sink/v1' + _globals['_SINKREQUEST']._serialized_start=86 + 
_globals['_SINKREQUEST']._serialized_end=235 + _globals['_READYRESPONSE']._serialized_start=237 + _globals['_READYRESPONSE']._serialized_end=267 + _globals['_SINKRESPONSE']._serialized_start=269 + _globals['_SINKRESPONSE']._serialized_end=386 + _globals['_SINKRESPONSE_RESULT']._serialized_start=332 + _globals['_SINKRESPONSE_RESULT']._serialized_end=386 + _globals['_SINK']._serialized_start=388 + _globals['_SINK']._serialized_end=510 # @@protoc_insertion_point(module_scope) diff --git a/pynumaflow/proto/sinker/sink_pb2_grpc.py b/pynumaflow/proto/sinker/sink_pb2_grpc.py index 92cf473b..2f5089a9 100644 --- a/pynumaflow/proto/sinker/sink_pb2_grpc.py +++ b/pynumaflow/proto/sinker/sink_pb2_grpc.py @@ -3,7 +3,7 @@ import grpc from google.protobuf import empty_pb2 as google_dot_protobuf_dot_empty__pb2 -from pynumaflow.proto.sinker import sink_pb2 as sink__pb2 +from . import sink_pb2 as sink__pb2 class SinkStub(object): @@ -16,108 +16,87 @@ def __init__(self, channel): channel: A grpc.Channel. """ self.SinkFn = channel.stream_unary( - "/sink.v1.Sink/SinkFn", - request_serializer=sink__pb2.SinkRequest.SerializeToString, - response_deserializer=sink__pb2.SinkResponse.FromString, - ) + '/sink.v1.Sink/SinkFn', + request_serializer=sink__pb2.SinkRequest.SerializeToString, + response_deserializer=sink__pb2.SinkResponse.FromString, + ) self.IsReady = channel.unary_unary( - "/sink.v1.Sink/IsReady", - request_serializer=google_dot_protobuf_dot_empty__pb2.Empty.SerializeToString, - response_deserializer=sink__pb2.ReadyResponse.FromString, - ) + '/sink.v1.Sink/IsReady', + request_serializer=google_dot_protobuf_dot_empty__pb2.Empty.SerializeToString, + response_deserializer=sink__pb2.ReadyResponse.FromString, + ) class SinkServicer(object): """Missing associated documentation comment in .proto file.""" def SinkFn(self, request_iterator, context): - """SinkFn writes the request to a user defined sink.""" + """SinkFn writes the request to a user defined sink. + """ context.set_code(grpc.StatusCode.UNIMPLEMENTED) - context.set_details("Method not implemented!") - raise NotImplementedError("Method not implemented!") + context.set_details('Method not implemented!') + raise NotImplementedError('Method not implemented!') def IsReady(self, request, context): - """IsReady is the heartbeat endpoint for gRPC.""" + """IsReady is the heartbeat endpoint for gRPC. 
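A hedged sketch of the stream-unary SinkFn contract (channel.stream_unary above): drain the request stream, then reply once with a per-id result for each datum; PrintSink is an illustrative stand-in for a real sink.

from pynumaflow.proto.sinker import sink_pb2, sink_pb2_grpc


class PrintSink(sink_pb2_grpc.SinkServicer):
    async def SinkFn(self, request_iterator, context):
        # Stream-unary: drain the stream, then answer once with per-id results.
        results = []
        async for req in request_iterator:
            print(req.keys, req.value)  # stand-in for a real sink write
            results.append(sink_pb2.SinkResponse.Result(id=req.id, success=True))
        return sink_pb2.SinkResponse(results=results)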
+ """ context.set_code(grpc.StatusCode.UNIMPLEMENTED) - context.set_details("Method not implemented!") - raise NotImplementedError("Method not implemented!") + context.set_details('Method not implemented!') + raise NotImplementedError('Method not implemented!') def add_SinkServicer_to_server(servicer, server): rpc_method_handlers = { - "SinkFn": grpc.stream_unary_rpc_method_handler( - servicer.SinkFn, - request_deserializer=sink__pb2.SinkRequest.FromString, - response_serializer=sink__pb2.SinkResponse.SerializeToString, - ), - "IsReady": grpc.unary_unary_rpc_method_handler( - servicer.IsReady, - request_deserializer=google_dot_protobuf_dot_empty__pb2.Empty.FromString, - response_serializer=sink__pb2.ReadyResponse.SerializeToString, - ), + 'SinkFn': grpc.stream_unary_rpc_method_handler( + servicer.SinkFn, + request_deserializer=sink__pb2.SinkRequest.FromString, + response_serializer=sink__pb2.SinkResponse.SerializeToString, + ), + 'IsReady': grpc.unary_unary_rpc_method_handler( + servicer.IsReady, + request_deserializer=google_dot_protobuf_dot_empty__pb2.Empty.FromString, + response_serializer=sink__pb2.ReadyResponse.SerializeToString, + ), } - generic_handler = grpc.method_handlers_generic_handler("sink.v1.Sink", rpc_method_handlers) + generic_handler = grpc.method_handlers_generic_handler( + 'sink.v1.Sink', rpc_method_handlers) server.add_generic_rpc_handlers((generic_handler,)) -# This class is part of an EXPERIMENTAL API. + # This class is part of an EXPERIMENTAL API. class Sink(object): """Missing associated documentation comment in .proto file.""" @staticmethod - def SinkFn( - request_iterator, - target, - options=(), - channel_credentials=None, - call_credentials=None, - insecure=False, - compression=None, - wait_for_ready=None, - timeout=None, - metadata=None, - ): - return grpc.experimental.stream_unary( - request_iterator, + def SinkFn(request_iterator, target, - "/sink.v1.Sink/SinkFn", + options=(), + channel_credentials=None, + call_credentials=None, + insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None): + return grpc.experimental.stream_unary(request_iterator, target, '/sink.v1.Sink/SinkFn', sink__pb2.SinkRequest.SerializeToString, sink__pb2.SinkResponse.FromString, - options, - channel_credentials, - insecure, - call_credentials, - compression, - wait_for_ready, - timeout, - metadata, - ) + options, channel_credentials, + insecure, call_credentials, compression, wait_for_ready, timeout, metadata) @staticmethod - def IsReady( - request, - target, - options=(), - channel_credentials=None, - call_credentials=None, - insecure=False, - compression=None, - wait_for_ready=None, - timeout=None, - metadata=None, - ): - return grpc.experimental.unary_unary( - request, + def IsReady(request, target, - "/sink.v1.Sink/IsReady", + options=(), + channel_credentials=None, + call_credentials=None, + insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None): + return grpc.experimental.unary_unary(request, target, '/sink.v1.Sink/IsReady', google_dot_protobuf_dot_empty__pb2.Empty.SerializeToString, sink__pb2.ReadyResponse.FromString, - options, - channel_credentials, - insecure, - call_credentials, - compression, - wait_for_ready, - timeout, - metadata, - ) + options, channel_credentials, + insecure, call_credentials, compression, wait_for_ready, timeout, metadata) diff --git a/pynumaflow/proto/sourcer/source_pb2.py b/pynumaflow/proto/sourcer/source_pb2.py index 73c282e1..cee05043 100644 --- 
a/pynumaflow/proto/sourcer/source_pb2.py +++ b/pynumaflow/proto/sourcer/source_pb2.py @@ -1,12 +1,12 @@ # -*- coding: utf-8 -*- # Generated by the protocol buffer compiler. DO NOT EDIT! # source: source.proto +# Protobuf Python Version: 4.25.0 """Generated protocol buffer code.""" from google.protobuf import descriptor as _descriptor from google.protobuf import descriptor_pool as _descriptor_pool from google.protobuf import symbol_database as _symbol_database from google.protobuf.internal import builder as _builder - # @@protoc_insertion_point(imports) _sym_db = _symbol_database.Default() @@ -16,43 +16,41 @@ from google.protobuf import empty_pb2 as google_dot_protobuf_dot_empty__pb2 -DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile( - b'\n\x0csource.proto\x12\tsource.v1\x1a\x1fgoogle/protobuf/timestamp.proto\x1a\x1bgoogle/protobuf/empty.proto"u\n\x0bReadRequest\x12/\n\x07request\x18\x01 \x01(\x0b\x32\x1e.source.v1.ReadRequest.Request\x1a\x35\n\x07Request\x12\x13\n\x0bnum_records\x18\x01 \x01(\x04\x12\x15\n\rtimeout_in_ms\x18\x02 \x01(\r"\xba\x01\n\x0cReadResponse\x12.\n\x06result\x18\x01 \x01(\x0b\x32\x1e.source.v1.ReadResponse.Result\x1az\n\x06Result\x12\x0f\n\x07payload\x18\x01 \x01(\x0c\x12!\n\x06offset\x18\x02 \x01(\x0b\x32\x11.source.v1.Offset\x12.\n\nevent_time\x18\x03 \x01(\x0b\x32\x1a.google.protobuf.Timestamp\x12\x0c\n\x04keys\x18\x04 \x03(\t"k\n\nAckRequest\x12.\n\x07request\x18\x01 \x01(\x0b\x32\x1d.source.v1.AckRequest.Request\x1a-\n\x07Request\x12"\n\x07offsets\x18\x01 \x03(\x0b\x32\x11.source.v1.Offset"o\n\x0b\x41\x63kResponse\x12-\n\x06result\x18\x01 \x01(\x0b\x32\x1d.source.v1.AckResponse.Result\x1a\x31\n\x06Result\x12\'\n\x07success\x18\x01 \x01(\x0b\x32\x16.google.protobuf.Empty"\x1e\n\rReadyResponse\x12\r\n\x05ready\x18\x01 \x01(\x08"]\n\x0fPendingResponse\x12\x31\n\x06result\x18\x01 \x01(\x0b\x32!.source.v1.PendingResponse.Result\x1a\x17\n\x06Result\x12\r\n\x05\x63ount\x18\x01 \x01(\x03"h\n\x12PartitionsResponse\x12\x34\n\x06result\x18\x01 \x01(\x0b\x32$.source.v1.PartitionsResponse.Result\x1a\x1c\n\x06Result\x12\x12\n\npartitions\x18\x01 \x03(\x05".\n\x06Offset\x12\x0e\n\x06offset\x18\x01 \x01(\x0c\x12\x14\n\x0cpartition_id\x18\x02 \x01(\x05\x32\xc2\x02\n\x06Source\x12;\n\x06ReadFn\x12\x16.source.v1.ReadRequest\x1a\x17.source.v1.ReadResponse0\x01\x12\x36\n\x05\x41\x63kFn\x12\x15.source.v1.AckRequest\x1a\x16.source.v1.AckResponse\x12?\n\tPendingFn\x12\x16.google.protobuf.Empty\x1a\x1a.source.v1.PendingResponse\x12\x45\n\x0cPartitionsFn\x12\x16.google.protobuf.Empty\x1a\x1d.source.v1.PartitionsResponse\x12;\n\x07IsReady\x12\x16.google.protobuf.Empty\x1a\x18.source.v1.ReadyResponseb\x06proto3' -) +DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\x0csource.proto\x12\tsource.v1\x1a\x1fgoogle/protobuf/timestamp.proto\x1a\x1bgoogle/protobuf/empty.proto\"u\n\x0bReadRequest\x12/\n\x07request\x18\x01 \x01(\x0b\x32\x1e.source.v1.ReadRequest.Request\x1a\x35\n\x07Request\x12\x13\n\x0bnum_records\x18\x01 \x01(\x04\x12\x15\n\rtimeout_in_ms\x18\x02 \x01(\r\"\xba\x01\n\x0cReadResponse\x12.\n\x06result\x18\x01 \x01(\x0b\x32\x1e.source.v1.ReadResponse.Result\x1az\n\x06Result\x12\x0f\n\x07payload\x18\x01 \x01(\x0c\x12!\n\x06offset\x18\x02 \x01(\x0b\x32\x11.source.v1.Offset\x12.\n\nevent_time\x18\x03 \x01(\x0b\x32\x1a.google.protobuf.Timestamp\x12\x0c\n\x04keys\x18\x04 \x03(\t\"k\n\nAckRequest\x12.\n\x07request\x18\x01 \x01(\x0b\x32\x1d.source.v1.AckRequest.Request\x1a-\n\x07Request\x12\"\n\x07offsets\x18\x01 
\x03(\x0b\x32\x11.source.v1.Offset\"o\n\x0b\x41\x63kResponse\x12-\n\x06result\x18\x01 \x01(\x0b\x32\x1d.source.v1.AckResponse.Result\x1a\x31\n\x06Result\x12\'\n\x07success\x18\x01 \x01(\x0b\x32\x16.google.protobuf.Empty\"\x1e\n\rReadyResponse\x12\r\n\x05ready\x18\x01 \x01(\x08\"]\n\x0fPendingResponse\x12\x31\n\x06result\x18\x01 \x01(\x0b\x32!.source.v1.PendingResponse.Result\x1a\x17\n\x06Result\x12\r\n\x05\x63ount\x18\x01 \x01(\x03\"h\n\x12PartitionsResponse\x12\x34\n\x06result\x18\x01 \x01(\x0b\x32$.source.v1.PartitionsResponse.Result\x1a\x1c\n\x06Result\x12\x12\n\npartitions\x18\x01 \x03(\x05\".\n\x06Offset\x12\x0e\n\x06offset\x18\x01 \x01(\x0c\x12\x14\n\x0cpartition_id\x18\x02 \x01(\x05\x32\xc2\x02\n\x06Source\x12;\n\x06ReadFn\x12\x16.source.v1.ReadRequest\x1a\x17.source.v1.ReadResponse0\x01\x12\x36\n\x05\x41\x63kFn\x12\x15.source.v1.AckRequest\x1a\x16.source.v1.AckResponse\x12?\n\tPendingFn\x12\x16.google.protobuf.Empty\x1a\x1a.source.v1.PendingResponse\x12\x45\n\x0cPartitionsFn\x12\x16.google.protobuf.Empty\x1a\x1d.source.v1.PartitionsResponse\x12;\n\x07IsReady\x12\x16.google.protobuf.Empty\x1a\x18.source.v1.ReadyResponseb\x06proto3') _globals = globals() _builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, _globals) -_builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, "source_pb2", _globals) +_builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, 'source_pb2', _globals) if _descriptor._USE_C_DESCRIPTORS == False: - DESCRIPTOR._options = None - _globals["_READREQUEST"]._serialized_start = 89 - _globals["_READREQUEST"]._serialized_end = 206 - _globals["_READREQUEST_REQUEST"]._serialized_start = 153 - _globals["_READREQUEST_REQUEST"]._serialized_end = 206 - _globals["_READRESPONSE"]._serialized_start = 209 - _globals["_READRESPONSE"]._serialized_end = 395 - _globals["_READRESPONSE_RESULT"]._serialized_start = 273 - _globals["_READRESPONSE_RESULT"]._serialized_end = 395 - _globals["_ACKREQUEST"]._serialized_start = 397 - _globals["_ACKREQUEST"]._serialized_end = 504 - _globals["_ACKREQUEST_REQUEST"]._serialized_start = 459 - _globals["_ACKREQUEST_REQUEST"]._serialized_end = 504 - _globals["_ACKRESPONSE"]._serialized_start = 506 - _globals["_ACKRESPONSE"]._serialized_end = 617 - _globals["_ACKRESPONSE_RESULT"]._serialized_start = 568 - _globals["_ACKRESPONSE_RESULT"]._serialized_end = 617 - _globals["_READYRESPONSE"]._serialized_start = 619 - _globals["_READYRESPONSE"]._serialized_end = 649 - _globals["_PENDINGRESPONSE"]._serialized_start = 651 - _globals["_PENDINGRESPONSE"]._serialized_end = 744 - _globals["_PENDINGRESPONSE_RESULT"]._serialized_start = 721 - _globals["_PENDINGRESPONSE_RESULT"]._serialized_end = 744 - _globals["_PARTITIONSRESPONSE"]._serialized_start = 746 - _globals["_PARTITIONSRESPONSE"]._serialized_end = 850 - _globals["_PARTITIONSRESPONSE_RESULT"]._serialized_start = 822 - _globals["_PARTITIONSRESPONSE_RESULT"]._serialized_end = 850 - _globals["_OFFSET"]._serialized_start = 852 - _globals["_OFFSET"]._serialized_end = 898 - _globals["_SOURCE"]._serialized_start = 901 - _globals["_SOURCE"]._serialized_end = 1223 + DESCRIPTOR._options = None + _globals['_READREQUEST']._serialized_start=89 + _globals['_READREQUEST']._serialized_end=206 + _globals['_READREQUEST_REQUEST']._serialized_start=153 + _globals['_READREQUEST_REQUEST']._serialized_end=206 + _globals['_READRESPONSE']._serialized_start=209 + _globals['_READRESPONSE']._serialized_end=395 + _globals['_READRESPONSE_RESULT']._serialized_start=273 + _globals['_READRESPONSE_RESULT']._serialized_end=395 + 
_globals['_ACKREQUEST']._serialized_start=397 + _globals['_ACKREQUEST']._serialized_end=504 + _globals['_ACKREQUEST_REQUEST']._serialized_start=459 + _globals['_ACKREQUEST_REQUEST']._serialized_end=504 + _globals['_ACKRESPONSE']._serialized_start=506 + _globals['_ACKRESPONSE']._serialized_end=617 + _globals['_ACKRESPONSE_RESULT']._serialized_start=568 + _globals['_ACKRESPONSE_RESULT']._serialized_end=617 + _globals['_READYRESPONSE']._serialized_start=619 + _globals['_READYRESPONSE']._serialized_end=649 + _globals['_PENDINGRESPONSE']._serialized_start=651 + _globals['_PENDINGRESPONSE']._serialized_end=744 + _globals['_PENDINGRESPONSE_RESULT']._serialized_start=721 + _globals['_PENDINGRESPONSE_RESULT']._serialized_end=744 + _globals['_PARTITIONSRESPONSE']._serialized_start=746 + _globals['_PARTITIONSRESPONSE']._serialized_end=850 + _globals['_PARTITIONSRESPONSE_RESULT']._serialized_start=822 + _globals['_PARTITIONSRESPONSE_RESULT']._serialized_end=850 + _globals['_OFFSET']._serialized_start=852 + _globals['_OFFSET']._serialized_end=898 + _globals['_SOURCE']._serialized_start=901 + _globals['_SOURCE']._serialized_end=1223 # @@protoc_insertion_point(module_scope) diff --git a/pynumaflow/proto/sourcer/source_pb2_grpc.py b/pynumaflow/proto/sourcer/source_pb2_grpc.py index 3a132eea..f67127c5 100644 --- a/pynumaflow/proto/sourcer/source_pb2_grpc.py +++ b/pynumaflow/proto/sourcer/source_pb2_grpc.py @@ -16,30 +16,30 @@ def __init__(self, channel): channel: A grpc.Channel. """ self.ReadFn = channel.unary_stream( - "/source.v1.Source/ReadFn", - request_serializer=source__pb2.ReadRequest.SerializeToString, - response_deserializer=source__pb2.ReadResponse.FromString, - ) + '/source.v1.Source/ReadFn', + request_serializer=source__pb2.ReadRequest.SerializeToString, + response_deserializer=source__pb2.ReadResponse.FromString, + ) self.AckFn = channel.unary_unary( - "/source.v1.Source/AckFn", - request_serializer=source__pb2.AckRequest.SerializeToString, - response_deserializer=source__pb2.AckResponse.FromString, - ) + '/source.v1.Source/AckFn', + request_serializer=source__pb2.AckRequest.SerializeToString, + response_deserializer=source__pb2.AckResponse.FromString, + ) self.PendingFn = channel.unary_unary( - "/source.v1.Source/PendingFn", - request_serializer=google_dot_protobuf_dot_empty__pb2.Empty.SerializeToString, - response_deserializer=source__pb2.PendingResponse.FromString, - ) + '/source.v1.Source/PendingFn', + request_serializer=google_dot_protobuf_dot_empty__pb2.Empty.SerializeToString, + response_deserializer=source__pb2.PendingResponse.FromString, + ) self.PartitionsFn = channel.unary_unary( - "/source.v1.Source/PartitionsFn", - request_serializer=google_dot_protobuf_dot_empty__pb2.Empty.SerializeToString, - response_deserializer=source__pb2.PartitionsResponse.FromString, - ) + '/source.v1.Source/PartitionsFn', + request_serializer=google_dot_protobuf_dot_empty__pb2.Empty.SerializeToString, + response_deserializer=source__pb2.PartitionsResponse.FromString, + ) self.IsReady = channel.unary_unary( - "/source.v1.Source/IsReady", - request_serializer=google_dot_protobuf_dot_empty__pb2.Empty.SerializeToString, - response_deserializer=source__pb2.ReadyResponse.FromString, - ) + '/source.v1.Source/IsReady', + request_serializer=google_dot_protobuf_dot_empty__pb2.Empty.SerializeToString, + response_deserializer=source__pb2.ReadyResponse.FromString, + ) class SourceServicer(object): @@ -51,216 +51,160 @@ def ReadFn(self, request, context): If the request timeout is reached on server side, the returned 
ReadResponse will contain all the datum that have been read (which could be an empty list). """ context.set_code(grpc.StatusCode.UNIMPLEMENTED) - context.set_details("Method not implemented!") - raise NotImplementedError("Method not implemented!") + context.set_details('Method not implemented!') + raise NotImplementedError('Method not implemented!') def AckFn(self, request, context): """AckFn acknowledges a list of datum offsets. When AckFn is called, it implicitly indicates that the datum stream has been processed by the source vertex. The caller (numa) expects the AckFn to be successful, and it does not expect any errors. If there are some irrecoverable errors when the callee (UDSource) is processing the AckFn request, - then it is best to crash because there are no other retry mechanisms possible. + then it is best to crash because there are no other retry mechanisms possible. """ context.set_code(grpc.StatusCode.UNIMPLEMENTED) - context.set_details("Method not implemented!") - raise NotImplementedError("Method not implemented!") + context.set_details('Method not implemented!') + raise NotImplementedError('Method not implemented!') def PendingFn(self, request, context): - """PendingFn returns the number of pending records at the user defined source.""" + """PendingFn returns the number of pending records at the user defined source. + """ context.set_code(grpc.StatusCode.UNIMPLEMENTED) - context.set_details("Method not implemented!") - raise NotImplementedError("Method not implemented!") + context.set_details('Method not implemented!') + raise NotImplementedError('Method not implemented!') def PartitionsFn(self, request, context): - """PartitionsFn returns the list of partitions for the user defined source.""" + """PartitionsFn returns the list of partitions for the user defined source. + """ context.set_code(grpc.StatusCode.UNIMPLEMENTED) - context.set_details("Method not implemented!") - raise NotImplementedError("Method not implemented!") + context.set_details('Method not implemented!') + raise NotImplementedError('Method not implemented!') def IsReady(self, request, context): - """IsReady is the heartbeat endpoint for user defined source gRPC.""" + """IsReady is the heartbeat endpoint for user defined source gRPC. 
+ """ context.set_code(grpc.StatusCode.UNIMPLEMENTED) - context.set_details("Method not implemented!") - raise NotImplementedError("Method not implemented!") + context.set_details('Method not implemented!') + raise NotImplementedError('Method not implemented!') def add_SourceServicer_to_server(servicer, server): rpc_method_handlers = { - "ReadFn": grpc.unary_stream_rpc_method_handler( - servicer.ReadFn, - request_deserializer=source__pb2.ReadRequest.FromString, - response_serializer=source__pb2.ReadResponse.SerializeToString, - ), - "AckFn": grpc.unary_unary_rpc_method_handler( - servicer.AckFn, - request_deserializer=source__pb2.AckRequest.FromString, - response_serializer=source__pb2.AckResponse.SerializeToString, - ), - "PendingFn": grpc.unary_unary_rpc_method_handler( - servicer.PendingFn, - request_deserializer=google_dot_protobuf_dot_empty__pb2.Empty.FromString, - response_serializer=source__pb2.PendingResponse.SerializeToString, - ), - "PartitionsFn": grpc.unary_unary_rpc_method_handler( - servicer.PartitionsFn, - request_deserializer=google_dot_protobuf_dot_empty__pb2.Empty.FromString, - response_serializer=source__pb2.PartitionsResponse.SerializeToString, - ), - "IsReady": grpc.unary_unary_rpc_method_handler( - servicer.IsReady, - request_deserializer=google_dot_protobuf_dot_empty__pb2.Empty.FromString, - response_serializer=source__pb2.ReadyResponse.SerializeToString, - ), + 'ReadFn': grpc.unary_stream_rpc_method_handler( + servicer.ReadFn, + request_deserializer=source__pb2.ReadRequest.FromString, + response_serializer=source__pb2.ReadResponse.SerializeToString, + ), + 'AckFn': grpc.unary_unary_rpc_method_handler( + servicer.AckFn, + request_deserializer=source__pb2.AckRequest.FromString, + response_serializer=source__pb2.AckResponse.SerializeToString, + ), + 'PendingFn': grpc.unary_unary_rpc_method_handler( + servicer.PendingFn, + request_deserializer=google_dot_protobuf_dot_empty__pb2.Empty.FromString, + response_serializer=source__pb2.PendingResponse.SerializeToString, + ), + 'PartitionsFn': grpc.unary_unary_rpc_method_handler( + servicer.PartitionsFn, + request_deserializer=google_dot_protobuf_dot_empty__pb2.Empty.FromString, + response_serializer=source__pb2.PartitionsResponse.SerializeToString, + ), + 'IsReady': grpc.unary_unary_rpc_method_handler( + servicer.IsReady, + request_deserializer=google_dot_protobuf_dot_empty__pb2.Empty.FromString, + response_serializer=source__pb2.ReadyResponse.SerializeToString, + ), } - generic_handler = grpc.method_handlers_generic_handler("source.v1.Source", rpc_method_handlers) + generic_handler = grpc.method_handlers_generic_handler( + 'source.v1.Source', rpc_method_handlers) server.add_generic_rpc_handlers((generic_handler,)) -# This class is part of an EXPERIMENTAL API. + # This class is part of an EXPERIMENTAL API. 
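Pulling the Source contract above together: a hedged sketch of a servicer honoring those docstrings, where ReadFn may legitimately stream back an empty result set on timeout, and AckFn is expected to either succeed or crash rather than report an error. OnceSource and its payloads are illustrative, not part of this patch.

from google.protobuf import empty_pb2
from pynumaflow.proto.sourcer import source_pb2, source_pb2_grpc


class OnceSource(source_pb2_grpc.SourceServicer):
    async def ReadFn(self, request, context):
        # Stream back at most num_records datums; an empty stream is a valid
        # answer when the read timeout elapses first.
        for i in range(request.request.num_records):
            offset = source_pb2.Offset(offset=str(i).encode(), partition_id=0)
            yield source_pb2.ReadResponse(
                result=source_pb2.ReadResponse.Result(payload=b"hello", offset=offset)
            )

    async def AckFn(self, request, context):
        # Per the contract above, crash on irrecoverable errors rather than
        # reporting a failure; a successful ack carries an Empty.
        return source_pb2.AckResponse(
            result=source_pb2.AckResponse.Result(success=empty_pb2.Empty())
        )

    async def PendingFn(self, request, context):
        return source_pb2.PendingResponse(result=source_pb2.PendingResponse.Result(count=0))

    async def PartitionsFn(self, request, context):
        return source_pb2.PartitionsResponse(
            result=source_pb2.PartitionsResponse.Result(partitions=[0])
        )

    async def IsReady(self, request, context):
        return source_pb2.ReadyResponse(ready=True)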
class Source(object): """Missing associated documentation comment in .proto file.""" @staticmethod - def ReadFn( - request, - target, - options=(), - channel_credentials=None, - call_credentials=None, - insecure=False, - compression=None, - wait_for_ready=None, - timeout=None, - metadata=None, - ): - return grpc.experimental.unary_stream( - request, + def ReadFn(request, target, - "/source.v1.Source/ReadFn", + options=(), + channel_credentials=None, + call_credentials=None, + insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None): + return grpc.experimental.unary_stream(request, target, '/source.v1.Source/ReadFn', source__pb2.ReadRequest.SerializeToString, source__pb2.ReadResponse.FromString, - options, - channel_credentials, - insecure, - call_credentials, - compression, - wait_for_ready, - timeout, - metadata, - ) + options, channel_credentials, + insecure, call_credentials, compression, wait_for_ready, timeout, metadata) @staticmethod - def AckFn( - request, - target, - options=(), - channel_credentials=None, - call_credentials=None, - insecure=False, - compression=None, - wait_for_ready=None, - timeout=None, - metadata=None, - ): - return grpc.experimental.unary_unary( - request, + def AckFn(request, target, - "/source.v1.Source/AckFn", + options=(), + channel_credentials=None, + call_credentials=None, + insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None): + return grpc.experimental.unary_unary(request, target, '/source.v1.Source/AckFn', source__pb2.AckRequest.SerializeToString, source__pb2.AckResponse.FromString, - options, - channel_credentials, - insecure, - call_credentials, - compression, - wait_for_ready, - timeout, - metadata, - ) + options, channel_credentials, + insecure, call_credentials, compression, wait_for_ready, timeout, metadata) @staticmethod - def PendingFn( - request, - target, - options=(), - channel_credentials=None, - call_credentials=None, - insecure=False, - compression=None, - wait_for_ready=None, - timeout=None, - metadata=None, - ): - return grpc.experimental.unary_unary( - request, + def PendingFn(request, target, - "/source.v1.Source/PendingFn", + options=(), + channel_credentials=None, + call_credentials=None, + insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None): + return grpc.experimental.unary_unary(request, target, '/source.v1.Source/PendingFn', google_dot_protobuf_dot_empty__pb2.Empty.SerializeToString, source__pb2.PendingResponse.FromString, - options, - channel_credentials, - insecure, - call_credentials, - compression, - wait_for_ready, - timeout, - metadata, - ) + options, channel_credentials, + insecure, call_credentials, compression, wait_for_ready, timeout, metadata) @staticmethod - def PartitionsFn( - request, - target, - options=(), - channel_credentials=None, - call_credentials=None, - insecure=False, - compression=None, - wait_for_ready=None, - timeout=None, - metadata=None, - ): - return grpc.experimental.unary_unary( - request, + def PartitionsFn(request, target, - "/source.v1.Source/PartitionsFn", + options=(), + channel_credentials=None, + call_credentials=None, + insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None): + return grpc.experimental.unary_unary(request, target, '/source.v1.Source/PartitionsFn', google_dot_protobuf_dot_empty__pb2.Empty.SerializeToString, source__pb2.PartitionsResponse.FromString, - options, - channel_credentials, - insecure, - call_credentials, - 
compression, - wait_for_ready, - timeout, - metadata, - ) + options, channel_credentials, + insecure, call_credentials, compression, wait_for_ready, timeout, metadata) @staticmethod - def IsReady( - request, - target, - options=(), - channel_credentials=None, - call_credentials=None, - insecure=False, - compression=None, - wait_for_ready=None, - timeout=None, - metadata=None, - ): - return grpc.experimental.unary_unary( - request, + def IsReady(request, target, - "/source.v1.Source/IsReady", + options=(), + channel_credentials=None, + call_credentials=None, + insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None): + return grpc.experimental.unary_unary(request, target, '/source.v1.Source/IsReady', google_dot_protobuf_dot_empty__pb2.Empty.SerializeToString, source__pb2.ReadyResponse.FromString, - options, - channel_credentials, - insecure, - call_credentials, - compression, - wait_for_ready, - timeout, - metadata, - ) + options, channel_credentials, + insecure, call_credentials, compression, wait_for_ready, timeout, metadata) diff --git a/pynumaflow/proto/sourcetransformer/transform_pb2.py b/pynumaflow/proto/sourcetransformer/transform_pb2.py index 41946e02..31c0da87 100644 --- a/pynumaflow/proto/sourcetransformer/transform_pb2.py +++ b/pynumaflow/proto/sourcetransformer/transform_pb2.py @@ -1,12 +1,12 @@ # -*- coding: utf-8 -*- # Generated by the protocol buffer compiler. DO NOT EDIT! # source: transform.proto +# Protobuf Python Version: 4.25.0 """Generated protocol buffer code.""" from google.protobuf import descriptor as _descriptor from google.protobuf import descriptor_pool as _descriptor_pool from google.protobuf import symbol_database as _symbol_database from google.protobuf.internal import builder as _builder - # @@protoc_insertion_point(imports) _sym_db = _symbol_database.Default() @@ -16,23 +16,21 @@ from google.protobuf import empty_pb2 as google_dot_protobuf_dot_empty__pb2 -DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile( - b'\n\x0ftransform.proto\x12\x14sourcetransformer.v1\x1a\x1fgoogle/protobuf/timestamp.proto\x1a\x1bgoogle/protobuf/empty.proto"\x94\x01\n\x16SourceTransformRequest\x12\x0c\n\x04keys\x18\x01 \x03(\t\x12\r\n\x05value\x18\x02 \x01(\x0c\x12.\n\nevent_time\x18\x03 \x01(\x0b\x32\x1a.google.protobuf.Timestamp\x12-\n\twatermark\x18\x04 \x01(\x0b\x32\x1a.google.protobuf.Timestamp"\xc5\x01\n\x17SourceTransformResponse\x12\x45\n\x07results\x18\x01 \x03(\x0b\x32\x34.sourcetransformer.v1.SourceTransformResponse.Result\x1a\x63\n\x06Result\x12\x0c\n\x04keys\x18\x01 \x03(\t\x12\r\n\x05value\x18\x02 \x01(\x0c\x12.\n\nevent_time\x18\x03 \x01(\x0b\x32\x1a.google.protobuf.Timestamp\x12\x0c\n\x04tags\x18\x04 \x03(\t"\x1e\n\rReadyResponse\x12\r\n\x05ready\x18\x01 \x01(\x08\x32\xcb\x01\n\x0fSourceTransform\x12p\n\x11SourceTransformFn\x12,.sourcetransformer.v1.SourceTransformRequest\x1a-.sourcetransformer.v1.SourceTransformResponse\x12\x46\n\x07IsReady\x12\x16.google.protobuf.Empty\x1a#.sourcetransformer.v1.ReadyResponseb\x06proto3' -) +DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\x0ftransform.proto\x12\x14sourcetransformer.v1\x1a\x1fgoogle/protobuf/timestamp.proto\x1a\x1bgoogle/protobuf/empty.proto\"\x94\x01\n\x16SourceTransformRequest\x12\x0c\n\x04keys\x18\x01 \x03(\t\x12\r\n\x05value\x18\x02 \x01(\x0c\x12.\n\nevent_time\x18\x03 \x01(\x0b\x32\x1a.google.protobuf.Timestamp\x12-\n\twatermark\x18\x04 \x01(\x0b\x32\x1a.google.protobuf.Timestamp\"\xc5\x01\n\x17SourceTransformResponse\x12\x45\n\x07results\x18\x01 
\x03(\x0b\x32\x34.sourcetransformer.v1.SourceTransformResponse.Result\x1a\x63\n\x06Result\x12\x0c\n\x04keys\x18\x01 \x03(\t\x12\r\n\x05value\x18\x02 \x01(\x0c\x12.\n\nevent_time\x18\x03 \x01(\x0b\x32\x1a.google.protobuf.Timestamp\x12\x0c\n\x04tags\x18\x04 \x03(\t\"\x1e\n\rReadyResponse\x12\r\n\x05ready\x18\x01 \x01(\x08\x32\xcb\x01\n\x0fSourceTransform\x12p\n\x11SourceTransformFn\x12,.sourcetransformer.v1.SourceTransformRequest\x1a-.sourcetransformer.v1.SourceTransformResponse\x12\x46\n\x07IsReady\x12\x16.google.protobuf.Empty\x1a#.sourcetransformer.v1.ReadyResponseb\x06proto3') _globals = globals() _builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, _globals) -_builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, "transform_pb2", _globals) +_builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, 'transform_pb2', _globals) if _descriptor._USE_C_DESCRIPTORS == False: - DESCRIPTOR._options = None - _globals["_SOURCETRANSFORMREQUEST"]._serialized_start = 104 - _globals["_SOURCETRANSFORMREQUEST"]._serialized_end = 252 - _globals["_SOURCETRANSFORMRESPONSE"]._serialized_start = 255 - _globals["_SOURCETRANSFORMRESPONSE"]._serialized_end = 452 - _globals["_SOURCETRANSFORMRESPONSE_RESULT"]._serialized_start = 353 - _globals["_SOURCETRANSFORMRESPONSE_RESULT"]._serialized_end = 452 - _globals["_READYRESPONSE"]._serialized_start = 454 - _globals["_READYRESPONSE"]._serialized_end = 484 - _globals["_SOURCETRANSFORM"]._serialized_start = 487 - _globals["_SOURCETRANSFORM"]._serialized_end = 690 + DESCRIPTOR._options = None + _globals['_SOURCETRANSFORMREQUEST']._serialized_start=104 + _globals['_SOURCETRANSFORMREQUEST']._serialized_end=252 + _globals['_SOURCETRANSFORMRESPONSE']._serialized_start=255 + _globals['_SOURCETRANSFORMRESPONSE']._serialized_end=452 + _globals['_SOURCETRANSFORMRESPONSE_RESULT']._serialized_start=353 + _globals['_SOURCETRANSFORMRESPONSE_RESULT']._serialized_end=452 + _globals['_READYRESPONSE']._serialized_start=454 + _globals['_READYRESPONSE']._serialized_end=484 + _globals['_SOURCETRANSFORM']._serialized_start=487 + _globals['_SOURCETRANSFORM']._serialized_end=690 # @@protoc_insertion_point(module_scope) diff --git a/pynumaflow/proto/sourcetransformer/transform_pb2_grpc.py b/pynumaflow/proto/sourcetransformer/transform_pb2_grpc.py index 2e67a11b..7ec9346c 100644 --- a/pynumaflow/proto/sourcetransformer/transform_pb2_grpc.py +++ b/pynumaflow/proto/sourcetransformer/transform_pb2_grpc.py @@ -16,15 +16,15 @@ def __init__(self, channel): channel: A grpc.Channel. 
""" self.SourceTransformFn = channel.unary_unary( - "/sourcetransformer.v1.SourceTransform/SourceTransformFn", - request_serializer=transform__pb2.SourceTransformRequest.SerializeToString, - response_deserializer=transform__pb2.SourceTransformResponse.FromString, - ) + '/sourcetransformer.v1.SourceTransform/SourceTransformFn', + request_serializer=transform__pb2.SourceTransformRequest.SerializeToString, + response_deserializer=transform__pb2.SourceTransformResponse.FromString, + ) self.IsReady = channel.unary_unary( - "/sourcetransformer.v1.SourceTransform/IsReady", - request_serializer=google_dot_protobuf_dot_empty__pb2.Empty.SerializeToString, - response_deserializer=transform__pb2.ReadyResponse.FromString, - ) + '/sourcetransformer.v1.SourceTransform/IsReady', + request_serializer=google_dot_protobuf_dot_empty__pb2.Empty.SerializeToString, + response_deserializer=transform__pb2.ReadyResponse.FromString, + ) class SourceTransformServicer(object): @@ -36,93 +36,69 @@ def SourceTransformFn(self, request, context): SourceTransformFn can be used only at source vertex by source data transformer. """ context.set_code(grpc.StatusCode.UNIMPLEMENTED) - context.set_details("Method not implemented!") - raise NotImplementedError("Method not implemented!") + context.set_details('Method not implemented!') + raise NotImplementedError('Method not implemented!') def IsReady(self, request, context): - """IsReady is the heartbeat endpoint for gRPC.""" + """IsReady is the heartbeat endpoint for gRPC. + """ context.set_code(grpc.StatusCode.UNIMPLEMENTED) - context.set_details("Method not implemented!") - raise NotImplementedError("Method not implemented!") + context.set_details('Method not implemented!') + raise NotImplementedError('Method not implemented!') def add_SourceTransformServicer_to_server(servicer, server): rpc_method_handlers = { - "SourceTransformFn": grpc.unary_unary_rpc_method_handler( - servicer.SourceTransformFn, - request_deserializer=transform__pb2.SourceTransformRequest.FromString, - response_serializer=transform__pb2.SourceTransformResponse.SerializeToString, - ), - "IsReady": grpc.unary_unary_rpc_method_handler( - servicer.IsReady, - request_deserializer=google_dot_protobuf_dot_empty__pb2.Empty.FromString, - response_serializer=transform__pb2.ReadyResponse.SerializeToString, - ), + 'SourceTransformFn': grpc.unary_unary_rpc_method_handler( + servicer.SourceTransformFn, + request_deserializer=transform__pb2.SourceTransformRequest.FromString, + response_serializer=transform__pb2.SourceTransformResponse.SerializeToString, + ), + 'IsReady': grpc.unary_unary_rpc_method_handler( + servicer.IsReady, + request_deserializer=google_dot_protobuf_dot_empty__pb2.Empty.FromString, + response_serializer=transform__pb2.ReadyResponse.SerializeToString, + ), } generic_handler = grpc.method_handlers_generic_handler( - "sourcetransformer.v1.SourceTransform", rpc_method_handlers - ) + 'sourcetransformer.v1.SourceTransform', rpc_method_handlers) server.add_generic_rpc_handlers((generic_handler,)) -# This class is part of an EXPERIMENTAL API. + # This class is part of an EXPERIMENTAL API. 
class SourceTransform(object): """Missing associated documentation comment in .proto file.""" @staticmethod - def SourceTransformFn( - request, - target, - options=(), - channel_credentials=None, - call_credentials=None, - insecure=False, - compression=None, - wait_for_ready=None, - timeout=None, - metadata=None, - ): - return grpc.experimental.unary_unary( - request, + def SourceTransformFn(request, target, - "/sourcetransformer.v1.SourceTransform/SourceTransformFn", + options=(), + channel_credentials=None, + call_credentials=None, + insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None): + return grpc.experimental.unary_unary(request, target, '/sourcetransformer.v1.SourceTransform/SourceTransformFn', transform__pb2.SourceTransformRequest.SerializeToString, transform__pb2.SourceTransformResponse.FromString, - options, - channel_credentials, - insecure, - call_credentials, - compression, - wait_for_ready, - timeout, - metadata, - ) + options, channel_credentials, + insecure, call_credentials, compression, wait_for_ready, timeout, metadata) @staticmethod - def IsReady( - request, - target, - options=(), - channel_credentials=None, - call_credentials=None, - insecure=False, - compression=None, - wait_for_ready=None, - timeout=None, - metadata=None, - ): - return grpc.experimental.unary_unary( - request, + def IsReady(request, target, - "/sourcetransformer.v1.SourceTransform/IsReady", + options=(), + channel_credentials=None, + call_credentials=None, + insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None): + return grpc.experimental.unary_unary(request, target, '/sourcetransformer.v1.SourceTransform/IsReady', google_dot_protobuf_dot_empty__pb2.Empty.SerializeToString, transform__pb2.ReadyResponse.FromString, - options, - channel_credentials, - insecure, - call_credentials, - compression, - wait_for_ready, - timeout, - metadata, - ) + options, channel_credentials, + insecure, call_credentials, compression, wait_for_ready, timeout, metadata) diff --git a/pynumaflow/reducer/_dtypes.py b/pynumaflow/reducer/_dtypes.py index 4f96d03f..7cc1595b 100644 --- a/pynumaflow/reducer/_dtypes.py +++ b/pynumaflow/reducer/_dtypes.py @@ -237,14 +237,12 @@ class ReducerClass(metaclass=ABCMeta): """ Provides an interface to write a Reducer which will be exposed over a gRPC server. - - Args: - """ def __call__(self, *args, **kwargs): """ Allow to call handler function directly if class instance is sent + as the reducer_instance. """ return self.handler(*args, **kwargs) @@ -259,4 +257,5 @@ async def handler( ReduceAsyncCallable = Callable[[list[str], AsyncIterable[Datum], Metadata], Awaitable[Messages]] +# ReduceCallable is a callable which can be used as a handler for the reduce UDF. 
ReduceCallable = Union[ReduceAsyncCallable, ReducerClass] diff --git a/pynumaflow/reducer/async_server.py b/pynumaflow/reducer/async_server.py index 0a6d5b3b..e83a8106 100644 --- a/pynumaflow/reducer/async_server.py +++ b/pynumaflow/reducer/async_server.py @@ -1,7 +1,4 @@ import asyncio -import logging -import multiprocessing -import os from datetime import datetime, timezone from collections.abc import AsyncIterable @@ -9,7 +6,6 @@ import grpc from google.protobuf import empty_pb2 as _empty_pb2 -from pynumaflow import setup_logging from pynumaflow._constants import ( WIN_START_TIME, WIN_END_TIME, @@ -21,13 +17,7 @@ from pynumaflow.reducer.asynciter import NonBlockingIterator from pynumaflow.proto.reducer import reduce_pb2, reduce_pb2_grpc from pynumaflow.types import NumaflowServicerContext - -_LOGGER = setup_logging(__name__) -if os.getenv("PYTHONDEBUG"): - _LOGGER.setLevel(logging.DEBUG) - -_PROCESS_COUNT = multiprocessing.cpu_count() -MAX_THREADS = int(os.getenv("MAX_THREADS", 0)) or (_PROCESS_COUNT * 4) +from pynumaflow._constants import _LOGGER async def datum_generator( @@ -45,36 +35,9 @@ async def datum_generator( class AsyncReducer(reduce_pb2_grpc.ReduceServicer): """ - Provides an interface to write a Reduce Function - which will be exposed over gRPC. - - Args: - handler: Function callable following the type signature of ReduceCallable - sock_path: Path to the UNIX Domain Socket - max_message_size: The max message size in bytes the server can receive and send - max_threads: The max number of threads to be spawned; - defaults to number of processors x4 - - Example invocation: - >>> from typing import Iterator - >>> from pynumaflow.reducer import Messages, Message\ - ... Datum, Metadata, AsyncReducer - ... import aiorun - ... - >>> async def reduce_handler(key: list[str], datums: AsyncIterable[Datum], - >>> md: Metadata) -> Messages: - ... interval_window = md.interval_window - ... counter = 0 - ... async for _ in datums: - ... counter += 1 - ... msg = ( - ... f"counter:{counter} interval_window_start:{interval_window.start} " - ... f"interval_window_end:{interval_window.end}" - ... ) - ... return Messages(Message(value=str.encode(msg), keys=keys)) - ... - >>> grpc_server = AsyncReducer(handler=reduce_handler) - >>> aiorun.run(grpc_server.start()) + This class is used to create a new grpc Reduce servicer instance. + It implements the ReduceServicer interface from the proto reduce.proto file. + Provides the functionality for the required rpc methods. """ def __init__( diff --git a/pynumaflow/reducer/reduce.py b/pynumaflow/reducer/reduce.py index 1b3e1711..e7742486 100644 --- a/pynumaflow/reducer/reduce.py +++ b/pynumaflow/reducer/reduce.py @@ -19,6 +19,10 @@ class ReduceServer(NumaflowServer): + """ + Class for a new Reduce Server instance. + """ + def __init__( self, reducer_instance: ReduceCallable, @@ -27,6 +31,18 @@ def __init__( max_threads=MAX_THREADS, server_type=ServerType.Async, ): + """ + Create a new grpc Reduce Server instance. + A new servicer instance is created and attached to the server. + The server instance is returned.
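A minimal usage sketch for the ReduceServer introduced above, assuming a handler that follows the ReduceAsyncCallable signature; the handler body, names, and the ReduceServer export path are illustrative, not part of this patch:
>>> from collections.abc import AsyncIterable
>>> from pynumaflow.reducer import Messages, Message, Datum, Metadata, ReduceServer
>>> async def reduce_handler(keys: list[str], datums: AsyncIterable[Datum], md: Metadata) -> Messages:
...     counter = 0
...     async for _ in datums:  # count the datums in this window
...         counter += 1
...     return Messages(Message(value=str(counter).encode(), keys=keys))
>>> grpc_server = ReduceServer(reducer_instance=reduce_handler)
>>> grpc_server.start()  # the Async server type is the default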
+ Args: + reducer_instance: The reducer instance to be used for Reduce UDF + sock_path: The UNIX socket path to be used for the server + max_message_size: The max message size in bytes the server can receive and send + max_threads: The max number of threads to be spawned; + defaults to number of processors x4 + server_type: The type of server to be used + """ self.reducer_instance: ReduceCallable = reducer_instance self.sock_path = f"unix://{sock_path}" self.max_message_size = max_message_size @@ -39,13 +55,27 @@ def __init__( ] def start(self): + """ + Starter function for the Reduce server; handles the server type and + starts the server. + Currently supported server types are: + 1. Async + """ if self.server_type == ServerType.Async: aiorun.run(self.aexec()) else: - _LOGGER.error("Server type not supported", self.server_type) + _LOGGER.error("Server type not supported - %s", str(self.server_type)) raise NotImplementedError async def aexec(self): + """ + Starts the Async gRPC server on the given UNIX socket with + given max threads. + """ + # As the server is async, we need to create a new server instance in the + # same thread as the event loop so that all the async calls are made in the + # same context + # Create a new async server instance and add the servicer to it server = grpc.aio.server() server.add_insecure_port(self.sock_path) reduce_servicer = self.get_servicer( @@ -55,7 +85,6 @@ async def aexec(self): await start_async_server(server, self.sock_path, self.max_threads, self._server_options) def get_servicer(self, reducer_instance: ReduceCallable, server_type: ServerType): + """Get the servicer instance for the given server type""" if server_type == ServerType.Async: return AsyncReducer(reducer_instance) - else: - raise NotImplementedError diff --git a/pynumaflow/shared/server.py b/pynumaflow/shared/server.py index de3be5a5..dce4fdf4 100644 --- a/pynumaflow/shared/server.py +++ b/pynumaflow/shared/server.py @@ -31,50 +31,17 @@ class NumaflowServer: """ Provides an interface to write a Numaflow Server which will be exposed over gRPC. - - Members: - sock_path: Path to the UNIX Domain Socket - max_message_size: The max message size in bytes the server can receive and send - max_threads: The max number of threads to be spawned; - defaults to number of processors x4 """ @abstractmethod def start(self): """ - Start the server + Start the gRPC server """ raise NotImplementedError -# def prepare_server( -# sock_path: str, -# server_type: ServerType, -# max_threads=MAX_THREADS, -# server_options=None, -# process_count=1, -# ): -# """ -# Create a new grpc Server instance. -# A new servicer instance is created and attached to the server. -# The server instance is returned. -# -# """ -# if server_type == ServerType.Sync: -# server = _get_sync_server( -# bind_address=sock_path, threads_per_proc=max_threads, server_options=server_options -# ) -# return server -# # elif server_type == ServerType.Multiproc: -# # servers, server_ports = get_multiproc_servers( -# # max_threads=max_threads, -# # server_options=server_options, -# # process_count=process_count, -# # ) -# # return servers, server_ports - - -def write_info_file(protocol: Protocol) -> None: +def write_info_file(protocol: Protocol, info_file=SERVER_INFO_FILE_PATH) -> None: """ Write the server info file to the given path.
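A short sketch of the new info_file override on write_info_file; the Protocol import path and the target path shown are assumptions for illustration, and the default remains SERVER_INFO_FILE_PATH:
>>> from pynumaflow.info.types import Protocol
>>> from pynumaflow.shared.server import write_info_file
>>> write_info_file(Protocol.UDS, info_file="/tmp/numaflow-server-info")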
""" @@ -83,15 +50,14 @@ def write_info_file(protocol: Protocol) -> None: language=Language.PYTHON, version=get_sdk_version(), ) - info_server_write(server_info=serv_info, info_file=SERVER_INFO_FILE_PATH) + info_server_write(server_info=serv_info, info_file=info_file) def sync_server_start( servicer, bind_address: str, max_threads: int, server_options=None, udf_type: str = UDFType.Map ): """ - Starts the Synchronous server instance on the given UNIX socket with given max threads. - Wait for the server to terminate. + Utility function to start a sync grpc server instance. """ # Add the server information to the server info file, # here we just write the protocol and language information @@ -235,44 +201,6 @@ async def server_graceful_shutdown(): await server_async.wait_for_termination() -# async def __serve_async(self, server) -> None: -# async def server_graceful_shutdown(): -# """ -# Shuts down the server with 5 seconds of grace period. During the -# grace period, the server won't accept new connections and allow -# existing RPCs to continue within the grace period. -# """ -# _LOGGER.info("Starting graceful shutdown...") -# await server.stop(5) -# -# self.cleanup_coroutines.append(server_graceful_shutdown()) -# await server.wait_for_termination() -# -# -# async def start(self) -> None: -# """Starts the Async gRPC mapper on the given UNIX socket.""" -# server = grpc.aio.server(options=self._server_options) -# await self.__serve_async(server) - - -def _get_sync_server(bind_address: str, threads_per_proc: int, server_options: list): - """Get a new sync grpc server instance.""" - try: - server = grpc.server( - ThreadPoolExecutor( - max_workers=threads_per_proc, - ), - options=server_options, - ) - server.add_insecure_port(bind_address) - print("bind_address", bind_address) - _LOGGER.info("Starting new server with bind_address: %s", bind_address) - except Exception as err: - _LOGGER.critical("Failed to start server: %s", err, exc_info=True) - raise err - return server - - @contextlib.contextmanager def _reserve_port(port_num: int) -> Iterator[int]: """Find and reserve a port for all subprocesses to use.""" diff --git a/pynumaflow/sinker/_dtypes.py b/pynumaflow/sinker/_dtypes.py index 3c4b820e..f0914469 100644 --- a/pynumaflow/sinker/_dtypes.py +++ b/pynumaflow/sinker/_dtypes.py @@ -168,24 +168,24 @@ class SinkerClass(metaclass=ABCMeta): Provides an interface to write a Sinker which will be exposed over a gRPC server. - Args: - """ def __call__(self, *args, **kwargs): """ Allow to call handler function directly if class instance is sent + as the sinker_instance. """ return self.handler(*args, **kwargs) @abstractmethod def handler(self, datums: Iterator[Datum]) -> Responses: """ - Write a handler function which implements the MapCallable interface. + Write a handler function which implements the SinkCallable interface. """ pass SinkHandlerCallable = Callable[[Iterator[Datum]], Responses] AsyncSinkCallable = Callable[[AsyncIterable[Datum]], Awaitable[Responses]] +# SinkCallable is a callable which can be used as a handler for the UDSink. SinkCallable = Union[SinkerClass, SinkHandlerCallable, AsyncSinkCallable] diff --git a/pynumaflow/sinker/async_sink.py b/pynumaflow/sinker/async_sink.py index c705b6ae..e836bce2 100644 --- a/pynumaflow/sinker/async_sink.py +++ b/pynumaflow/sinker/async_sink.py @@ -35,26 +35,9 @@ async def datum_generator( class AsyncSinker(sink_pb2_grpc.SinkServicer): """ - Provides an interface to write an Async Sinker - which will be exposed over an Asyncronous gRPC server. 
- - Args: - handler: Function callable following the type signature of SinkCallable - sock_path: Path to the UNIX Domain Socket - max_message_size: The max message size in bytes the server can receive and send - max_threads: The max number of threads to be spawned; - defaults to number of processors x 4 - - Example invocation: - >>> import aiorun - >>> from pynumaflow.sinker import Datum, Responses, Response, AsyncSinker - >>> async def my_handler(datums: AsyncIterable[Datum]) -> Responses: - ... responses = Responses() - ... async for msg in datums: - ... responses.append(Response.as_success(msg.id)) - ... return responses - >>> grpc_server = AsyncSinker(handler=my_handler) - >>> aiorun.run(grpc_server.start()) + This class is used to create a new grpc Sink servicer instance. + It implements the SinkServicer interface from the proto sink.proto file. + Provides the functionality for the required rpc methods. """ def __init__( diff --git a/pynumaflow/sinker/server.py b/pynumaflow/sinker/server.py index ee928845..b166936e 100644 --- a/pynumaflow/sinker/server.py +++ b/pynumaflow/sinker/server.py @@ -34,26 +34,9 @@ def datum_generator(request_iterator: Iterable[sink_pb2.SinkRequest]) -> Iterabl class Sinker(sink_pb2_grpc.SinkServicer): """ - Provides an interface to write a Sinker - which will be exposed over gRPC. - - Args: - handler: Function callable following the type signature of SinkCallable - sock_path: Path to the UNIX Domain Socket - max_message_size: The max message size in bytes the server can receive and send - max_threads: The max number of threads to be spawned; - defaults to number of processors x 4 - - Example invocation: - >>> from typing import List - >>> from pynumaflow.sinker import Datum, Responses, Response, Sinker - >>> def my_handler(datums: Iterator[Datum]) -> Responses: - ... responses = Responses() - ... for msg in datums: - ... responses.append(Response.as_success(msg.id)) - ... return responses - >>> grpc_server = Sinker(handler=my_handler) - >>> grpc_server.start() + This class is used to create a new grpc Sink servicer instance. + It implements the SinkServicer interface from the proto sink.proto file. + Provides the functionality for the required rpc methods. """ def __init__( diff --git a/pynumaflow/sinker/sink.py b/pynumaflow/sinker/sink.py index b4f898a9..32139b53 100644 --- a/pynumaflow/sinker/sink.py +++ b/pynumaflow/sinker/sink.py @@ -22,6 +22,10 @@ class SinkServer(NumaflowServer): + """ + SinkServer is the main class to start a gRPC server for a sinker. + """ + def __init__( self, sinker_instance: SinkCallable, @@ -30,6 +34,20 @@ def __init__( max_threads=MAX_THREADS, server_type=ServerType.Sync, ): + """ + Create a new grpc Sink Server instance. + A new servicer instance is created and attached to the server. + The server instance is returned. 
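A sketch of an async sink handler matching the AsyncSinkCallable shape (it mirrors the doctest being removed above; the handler name is illustrative):
>>> from collections.abc import AsyncIterable
>>> from pynumaflow.sinker import Datum, Responses, Response
>>> async def my_handler(datums: AsyncIterable[Datum]) -> Responses:
...     responses = Responses()
...     async for msg in datums:  # ack each datum as successfully sunk
...         responses.append(Response.as_success(msg.id))
...     return responses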
+ Args: + sinker_instance: The sinker instance to be used for Sink UDF + sock_path: The UNIX socket path to be used for the server + max_message_size: The max message size in bytes the server can receive and send + max_threads: The max number of threads to be spawned; + defaults to number of processors x4 + server_type: The type of server to be used; this can be one of the following: + - ServerType.Sync: Synchronous server + - ServerType.Async: Asynchronous server + """ self.sock_path = f"unix://{sock_path}" self.max_threads = min(max_threads, int(os.getenv("MAX_THREADS", "4"))) self.max_message_size = max_message_size @@ -43,18 +61,28 @@ def __init__( ] def start(self): + """ + Starter function for the server class; handles the server type and + starts the server accordingly. If the server type is not supported, + raises NotImplementedError. + Currently supported server types are: + - ServerType.Sync: Synchronous server + - ServerType.Async: Asynchronous server + """ if self.server_type == ServerType.Sync: self.exec() elif self.server_type == ServerType.Async: aiorun.run(self.aexec()) else: - _LOGGER.error("Server type not supported", self.server_type) + _LOGGER.error("Server type not supported - %s", str(self.server_type)) raise NotImplementedError def exec(self): """ - Starts the Synchronous gRPC server on the given UNIX socket with given max threads. + Starts the Synchronous gRPC server on the + given UNIX socket with given max threads. """ + # Get the servicer instance sink_servicer = self.get_servicer( sinker_instance=self.sinker_instance, server_type=self.server_type ) @@ -63,7 +91,7 @@ def exec(self): self.sock_path, self.max_threads, ) - + # Start the server sync_server_start( servicer=sink_servicer, bind_address=self.sock_path, @@ -76,6 +104,10 @@ async def aexec(self): """ Starts the Asynchronous gRPC server on the given UNIX socket with given max threads. """ + # As the server is async, we need to create a new server instance in the + # same thread as the event loop so that all the async calls are made in the + # same context + # Create a new server instance, add the servicer to it and start the server server = grpc.aio.server() server.add_insecure_port(self.sock_path) sink_servicer = self.get_servicer( @@ -85,9 +117,10 @@ async def aexec(self): await start_async_server(server, self.sock_path, self.max_threads, self._server_options) def get_servicer(self, sinker_instance: SinkCallable, server_type: ServerType): + """ + Returns the servicer instance based on the server type.
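A usage sketch for the SinkServer wired up above, reusing my_handler from the earlier sketch; ServerType.Sync is the default, so the Async type is passed explicitly here, and the SinkServer import path is assumed from the package layout:
>>> from pynumaflow.sinker import SinkServer
>>> from pynumaflow._constants import ServerType
>>> SinkServer(sinker_instance=my_handler, server_type=ServerType.Async).start()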
+ """ if server_type == ServerType.Sync: return Sinker(sinker_instance) elif server_type == ServerType.Async: return AsyncSinker(sinker_instance) - else: - raise NotImplementedError diff --git a/pynumaflow/sourcer/async_server.py b/pynumaflow/sourcer/async_server.py index 8b68b01e..58598068 100644 --- a/pynumaflow/sourcer/async_server.py +++ b/pynumaflow/sourcer/async_server.py @@ -1,62 +1,21 @@ -import logging -import os - from collections.abc import AsyncIterable from google.protobuf import timestamp_pb2 as _timestamp_pb2 import grpc from google.protobuf import empty_pb2 as _empty_pb2 -from pynumaflow import setup_logging from pynumaflow.sourcer._dtypes import ReadRequest from pynumaflow.sourcer._dtypes import Offset, AckRequest, SourceCallable from pynumaflow.proto.sourcer import source_pb2 from pynumaflow.proto.sourcer import source_pb2_grpc from pynumaflow.types import NumaflowServicerContext - -_LOGGER = setup_logging(__name__) -if os.getenv("PYTHONDEBUG"): - _LOGGER.setLevel(logging.DEBUG) +from pynumaflow._constants import _LOGGER class AsyncSourcer(source_pb2_grpc.SourceServicer): """ - Provides an interface to write an Asynchronous Sourcer - which will be exposed over gRPC. - - Args: - read_handler: Function callable following the type signature of AsyncSourceReadCallable - ack_handler: Function handler for AckFn - pending_handler: Function handler for PendingFn - partitions_handler: Function handler for PartitionsFn - - sock_path: Path to the UNIX Domain Socket - max_message_size: The max message size in bytes the server can receive and send - max_threads: The max number of threads to be spawned; - defaults to number of processors x4 - - Example invocation: - >>> from typing import Iterator - >>> from pynumaflow.sourcer import Message, get_default_partitions \ - ... ReadRequest, AsyncSourcer, - ... import aiorun - ... async def read_handler(datum: ReadRequest) -> AsyncIterable[Message]: - ... payload = b"payload:test_mock_message" - ... keys = ["test_key"] - ... offset = mock_offset() - ... event_time = mock_event_time() - ... for i in range(10): - ... yield Message(payload=payload, keys=keys, offset=offset, event_time=event_time) - ... async def ack_handler(ack_request: AckRequest): - ... return - ... async def pending_handler() -> PendingResponse: - ... PendingResponse(count=10) - ... async def partitions_handler() -> PartitionsResponse: - ... return PartitionsResponse(partitions=get_default_partitions()) - >>> grpc_server = AsyncSourcer(read_handler=read_handler, - ... ack_handler=ack_handler, - ... pending_handler=pending_handler, - ... partitions_handler=partitions_handler) - >>> aiorun.run(grpc_server.start()) + This class is used to create a new grpc Source servicer instance. + It implements the SourceServicer interface from the proto source.proto file. + Provides the functionality for the required rpc methods. 
""" def __init__(self, source_handler: SourceCallable): diff --git a/pynumaflow/sourcer/server.py b/pynumaflow/sourcer/server.py index ae4c1cc9..e5149d01 100644 --- a/pynumaflow/sourcer/server.py +++ b/pynumaflow/sourcer/server.py @@ -1,13 +1,9 @@ -import logging -import os - from collections.abc import Iterable from google.protobuf import timestamp_pb2 as _timestamp_pb2 import grpc from google.protobuf import empty_pb2 as _empty_pb2 -from pynumaflow import setup_logging from pynumaflow.sourcer._dtypes import ReadRequest from pynumaflow.sourcer._dtypes import ( SourceReadCallable, @@ -19,42 +15,14 @@ from pynumaflow.proto.sourcer import source_pb2 from pynumaflow.proto.sourcer import source_pb2_grpc from pynumaflow.types import NumaflowServicerContext - -_LOGGER = setup_logging(__name__) -if os.getenv("PYTHONDEBUG"): - _LOGGER.setLevel(logging.DEBUG) +from pynumaflow._constants import _LOGGER class Sourcer(source_pb2_grpc.SourceServicer): """ - Provides an interface to write a Sourcer - which will be exposed over gRPC. - - Args: - source_handler: Class of the type SourcerClass which implements the UDS methods - - Example invocation: - >>> from typing import Iterator - >>> from pynumaflow.sourcer import Message, get_default_partitions, PartitionsResponse \ - ... ReadRequest, Sourcer, AckRequest, - ... def read_handler(datum: ReadRequest) -> Iterable[Message]: - ... payload = b"payload:test_mock_message" - ... keys = ["test_key"] - ... offset = mock_offset() - ... event_time = mock_event_time() - ... for i in range(10): - ... yield Message(payload=payload, keys=keys, offset=offset, event_time=event_time) - ... def ack_handler(ack_request: AckRequest): - ... return - ... def pending_handler() -> PendingResponse: - ... PendingResponse(count=10) - ... def partitions_handler() -> PartitionsResponse: - ... return PartitionsResponse(partitions=get_default_partitions()) - >>> grpc_server = Sourcer(read_handler=read_handler, - ... ack_handler=ack_handler, - ... pending_handler=pending_handler, - ... partitions_handler=partition_handler,) - >>> grpc_server.start() + This class is used to create a new grpc Source servicer instance. + It implements the SourceServicer interface from the proto source.proto file. + Provides the functionality for the required rpc methods. """ def __init__( diff --git a/pynumaflow/sourcer/source.py b/pynumaflow/sourcer/source.py index 2f1c203e..de77f28a 100644 --- a/pynumaflow/sourcer/source.py +++ b/pynumaflow/sourcer/source.py @@ -20,6 +20,10 @@ class SourceServer(NumaflowServer): + """ + Class for a new Source Server instance. + """ + def __init__( self, sourcer_instance: SourceCallable, @@ -29,13 +33,16 @@ def __init__( server_type=ServerType.Sync, ): """ - Create a new grpc Server instance. + Create a new grpc Source Server instance. A new servicer instance is created and attached to the server. The server instance is returned. - + Args: + sourcer_instance: The sourcer instance to be used for Source UDF + sock_path: The UNIX socket path to be used for the server max_message_size: The max message size in bytes the server can receive and send max_threads: The max number of threads to be spawned; - defaults to number of processors x4 + defaults to number of processors x4 + server_type: The type of server to be used """ self.sock_path = f"unix://{sock_path}" self.max_threads = min(max_threads, int(os.getenv("MAX_THREADS", "4"))) @@ -51,20 +58,25 @@ def __init__( def start(self): """ - Starts the gRPC server on the given UNIX socket with given max threads. 
+ Starter function for the Source server; handles the server type and + starts the server. + Currently supported server types: + 1. ServerType.Sync + 2. ServerType.Async """ if self.server_type == ServerType.Sync: self.exec() elif self.server_type == ServerType.Async: aiorun.run(self.aexec()) else: - _LOGGER.error("Server type not supported", self.server_type) + _LOGGER.error("Server type not supported - %s", str(self.server_type)) raise NotImplementedError def exec(self): """ Starts the Synchronous gRPC server on the given UNIX socket with given max threads. """ + # Get the servicer instance source_servicer = self.get_servicer( sourcer_instance=self.sourcer_instance, server_type=self.server_type ) @@ -73,7 +85,7 @@ def exec(self): self.sock_path, self.max_threads, ) - + # Start the sync server sync_server_start( servicer=source_servicer, bind_address=self.sock_path, @@ -86,6 +98,11 @@ async def aexec(self): """ Starts the Async gRPC server on the given UNIX socket with given max threads """ + + # As the server is async, we need to create a new server instance in the + # same thread as the event loop so that all the async calls are made in the + # same context + # Create a new async server instance and add the servicer to it server = grpc.aio.server() server.add_insecure_port(self.sock_path) source_servicer = self.get_servicer( diff --git a/pynumaflow/sourcetransformer/_dtypes.py b/pynumaflow/sourcetransformer/_dtypes.py index 2cb2987f..b4526c17 100644 --- a/pynumaflow/sourcetransformer/_dtypes.py +++ b/pynumaflow/sourcetransformer/_dtypes.py @@ -177,24 +177,25 @@ class SourceTransformerClass(metaclass=ABCMeta): """ Provides an interface to write a Source Transformer which will be exposed over a GRPC server. - - Args: - """ def __call__(self, *args, **kwargs): """ Allow to call handler function directly if class instance is sent + as the source_transformer_instance. """ return self.handler(*args, **kwargs) @abstractmethod def handler(self, keys: list[str], datum: Datum) -> Messages: """ - Write a handler function which implements the MapCallable interface. + Write a handler function which implements the + SourceTransformCallable interface. """ pass SourceTransformHandler = Callable[[list[str], Datum], Messages] +# SourceTransformCallable is the type of the handler function for the +# Source Transformer UDFunction. SourceTransformCallable = Union[SourceTransformHandler, SourceTransformerClass] diff --git a/pynumaflow/sourcetransformer/server.py b/pynumaflow/sourcetransformer/server.py index cd9c39dd..b63fa13c 100644 --- a/pynumaflow/sourcetransformer/server.py +++ b/pynumaflow/sourcetransformer/server.py @@ -1,47 +1,20 @@ -import logging -import os - import grpc from google.protobuf import empty_pb2 as _empty_pb2 from google.protobuf import timestamp_pb2 as _timestamp_pb2 -from pynumaflow import setup_logging from pynumaflow.sourcetransformer import Datum from pynumaflow.sourcetransformer._dtypes import SourceTransformCallable from pynumaflow.proto.sourcetransformer import transform_pb2 from pynumaflow.proto.sourcetransformer import transform_pb2_grpc from pynumaflow.types import NumaflowServicerContext - -_LOGGER = setup_logging(__name__) -if os.getenv("PYTHONDEBUG"): - _LOGGER.setLevel(logging.DEBUG) +from pynumaflow._constants import _LOGGER class SourceTransformer(transform_pb2_grpc.SourceTransformServicer): """ - Provides an interface to write a Source Transformer - which will be exposed over a Synchronous gRPC server.
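A usage sketch for the SourceServer covered earlier in this hunk, assuming my_sourcer is an instance implementing the read/ack/pending/partitions handlers (a SourceCallable); the instance name and import path are illustrative:
>>> from pynumaflow.sourcer import SourceServer
>>> SourceServer(sourcer_instance=my_sourcer).start()  # ServerType.Sync by default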
- - Args: - handler: Function callable following the type signature of SourceTransformCallable - sock_path: Path to the UNIX Domain Socket - max_message_size: The max message size in bytes the server can receive and send - max_threads: The max number of threads to be spawned; - defaults to number of processors x4 - - Example invocation: - >>> from typing import Iterator - >>> from pynumaflow.sourcetransformer import Messages, Message \ - ... Datum, SourceTransformer - >>> def transform_handler(key: [str], datum: Datum) -> Messages: - ... val = datum.value - ... new_event_time = datetime.time() - ... _ = datum.watermark - ... message_t_s = Messages(Message(val, event_time=new_event_time, keys=key)) - ... return message_t_s - ... - >>> grpc_server = SourceTransformer(handler=transform_handler) - >>> grpc_server.start() + This class is used to create a new grpc SourceTransform servicer instance. + It implements the SourceTransformServicer interface from the proto transform.proto file. + Provides the functionality for the required rpc methods. """ def __init__( diff --git a/pynumaflow/sourcetransformer/sourcetransform.py b/pynumaflow/sourcetransformer/sourcetransform.py index 97734532..591a0119 100644 --- a/pynumaflow/sourcetransformer/sourcetransform.py +++ b/pynumaflow/sourcetransformer/sourcetransform.py @@ -19,7 +19,9 @@ class SourceTransformServer(NumaflowServer): - """ """ + """ + Class for a new Source Transformer Server instance. + """ def __init__( self, @@ -30,13 +32,17 @@ def __init__( server_type=ServerType.Sync, ): """ - Create a new grpc Server instance. + Create a new grpc Source Transformer Server instance. A new servicer instance is created and attached to the server. The server instance is returned. - + Args: + source_transform_instance: The source transformer instance to be used for + Source Transformer UDF + sock_path: The UNIX socket path to be used for the server + max_message_size: The max message size in bytes the server can receive and send + max_threads: The max number of threads to be spawned; + defaults to number of processors x4 + server_type: The type of server to be used """ self.sock_path = f"unix://{sock_path}" self.max_threads = min(max_threads, int(os.getenv("MAX_THREADS", "4"))) @@ -63,14 +69,18 @@ def __init__( def start(self): """ - Starts the gRPC server on the given UNIX socket with given max threads. + Starter function for the Source Transformer server; + handles the server type and starts the server. + Currently supported server types: + 1. ServerType.Sync + 2. ServerType.Multiproc """ if self.server_type == ServerType.Sync: self.exec() elif self.server_type == ServerType.Multiproc: self.exec_multiproc() else: - _LOGGER.error("Server type not supported", self.server_type) + _LOGGER.error("Server type not supported - %s", str(self.server_type)) raise NotImplementedError def exec(self): @@ -85,7 +95,7 @@ def exec(self): self.sock_path, self.max_threads, ) - + # Start the sync server sync_server_start( servicer=transform_servicer, bind_address=self.sock_path, @@ -96,7 +106,8 @@ def exec_multiproc(self): """ - Starts the Multiproc gRPC server on the given UNIX socket with given max threads. + Starts the Multiproc gRPC server on the given TCP sockets + with given max threads.
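A usage sketch for the Multiproc path described above, assuming transform_handler follows SourceTransformCallable; the handler name and import paths are illustrative:
>>> from pynumaflow.sourcetransformer import SourceTransformServer
>>> from pynumaflow._constants import ServerType
>>> SourceTransformServer(
...     source_transform_instance=transform_handler, server_type=ServerType.Multiproc
... ).start()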
""" transform_servicer = self.get_servicer( source_transform_instance=self.source_transform_instance, server_type=self.server_type @@ -112,10 +123,11 @@ def exec_multiproc(self): def get_servicer( self, source_transform_instance: SourceTransformCallable, server_type: ServerType ): + """ + Returns the servicer instance for the given server type. + """ if server_type == ServerType.Sync: transform_servicer = SourceTransformer(handler=source_transform_instance) elif server_type == ServerType.Multiproc: transform_servicer = SourceTransformer(handler=source_transform_instance) - else: - raise NotImplementedError return transform_servicer diff --git a/tests/map/test_multiproc_mapper.py b/tests/map/test_multiproc_mapper.py index b3608620..62cbdfdb 100644 --- a/tests/map/test_multiproc_mapper.py +++ b/tests/map/test_multiproc_mapper.py @@ -35,44 +35,20 @@ def setUp(self) -> None: @mockenv(NUM_CPU_MULTIPROC="3") def test_multiproc_init(self) -> None: - my_server = MapServer(mapper_instance=map_handler) + my_server = MapServer(mapper_instance=map_handler, server_type=ServerType.Multiproc) self.assertEqual(my_server._process_count, 3) @patch("os.cpu_count", Mock(return_value=4)) def test_multiproc_process_count(self) -> None: - my_server = MapServer(mapper_instance=map_handler) + my_server = MapServer(mapper_instance=map_handler, server_type=ServerType.Multiproc) self.assertEqual(my_server._process_count, 4) @patch("os.cpu_count", Mock(return_value=4)) @mockenv(NUM_CPU_MULTIPROC="10") def test_max_process_count(self) -> None: - server = MapServer(mapper_instance=map_handler) + server = MapServer(mapper_instance=map_handler, server_type=ServerType.Multiproc) self.assertEqual(server._process_count, 8) - # # To test the reuse property for the grpc servers which allow multiple - # # bindings to the same server - # def test_reuse_port(self): - # serv_options = [("grpc.so_reuseaddr", 1)] - # - # server = MapServer(mapper_instance=map_handler) - # - # with server._reserve_port(0) as port: - # print(port) - # bind_address = f"localhost:{port}" - # server1 = grpc.server(thread_pool=None, options=serv_options) - # map_pb2_grpc.add_MapServicer_to_server(server, server1) - # server1.add_insecure_port(bind_address) - # - # # so_reuseport=0 -> the bind should raise an error - # server2 = grpc.server(thread_pool=None, options=(("grpc.so_reuseport", 0),)) - # map_pb2_grpc.add_MapServicer_to_server(server, server2) - # self.assertRaises(RuntimeError, server2.add_insecure_port, bind_address) - # - # # so_reuseport=1 -> should allow server to bind to port again - # server3 = grpc.server(thread_pool=None, options=(("grpc.so_reuseport", 1),)) - # map_pb2_grpc.add_MapServicer_to_server(server, server3) - # server3.add_insecure_port(bind_address) - def test_udf_map_err(self): my_server = MapServer(mapper_instance=err_map_handler) my_servicer = my_server.get_servicer( @@ -155,7 +131,7 @@ def test_map_forward_message(self): def test_invalid_input(self): with self.assertRaises(TypeError): - MapServer() + MapServer(server_type=ServerType.Multiproc) if __name__ == "__main__": diff --git a/tests/map/test_sync_mapper.py b/tests/map/test_sync_mapper.py index d0ebe1b1..af046692 100644 --- a/tests/map/test_sync_mapper.py +++ b/tests/map/test_sync_mapper.py @@ -7,8 +7,6 @@ from grpc_testing import server_from_dictionary, strict_real_time from pynumaflow._constants import ServerType -from pynumaflow.mapper.server import Mapper - from pynumaflow.mapper import MapServer from pynumaflow.proto.mapper import map_pb2 from tests.map.utils 
import map_handler, err_map_handler @@ -39,7 +37,10 @@ def test_init_with_args(self) -> None: self.assertEqual(my_servicer.max_message_size, 1024 * 1024 * 5) def test_udf_map_err(self): - my_servicer = Mapper(handler=err_map_handler) + my_server = MapServer(mapper_instance=err_map_handler) + my_servicer = my_server.get_servicer( + mapper_instance=my_server.mapper_instance, server_type=ServerType.Sync + ) services = {map_pb2.DESCRIPTOR.services_by_name["Map"]: my_servicer} self.test_server = server_from_dictionary(services, strict_real_time()) @@ -66,7 +67,10 @@ def test_udf_map_err(self): self.assertEqual(grpc.StatusCode.UNKNOWN, code) def test_udf_map_error_response(self): - my_servicer = Mapper(handler=err_map_handler) + my_server = MapServer(mapper_instance=err_map_handler) + my_servicer = my_server.get_servicer( + mapper_instance=my_server.mapper_instance, server_type=my_server.server_type + ) services = {map_pb2.DESCRIPTOR.services_by_name["Map"]: my_servicer} self.test_server = server_from_dictionary(services, strict_real_time()) @@ -144,7 +148,9 @@ def test_map_forward_message(self): def test_invalid_input(self): with self.assertRaises(TypeError): - Mapper() + MapServer() + with self.assertRaises(NotImplementedError): + MapServer(mapper_instance=map_handler, server_type="ERORR").start() if __name__ == "__main__": diff --git a/tests/mapstream/test_async_map_stream_err.py b/tests/mapstream/test_async_map_stream_err.py index 10927163..7e7d5890 100644 --- a/tests/mapstream/test_async_map_stream_err.py +++ b/tests/mapstream/test_async_map_stream_err.py @@ -105,6 +105,10 @@ def __stub(self): def test_invalid_input(self): with self.assertRaises(TypeError): MapStreamServer(server_type=ServerType.Async) + with self.assertRaises(NotImplementedError): + MapStreamServer( + map_stream_instance=err_async_map_stream_handler, server_type="ERORR" + ).start() if __name__ == "__main__": diff --git a/tests/reduce/test_async_reduce.py b/tests/reduce/test_async_reduce.py index 329e3561..98ba6083 100644 --- a/tests/reduce/test_async_reduce.py +++ b/tests/reduce/test_async_reduce.py @@ -237,6 +237,12 @@ def test_is_ready(self) -> None: def __stub(self): return reduce_pb2_grpc.ReduceStub(_channel) + def test_error_init(self): + with self.assertRaises(TypeError): + ReduceServer() + with self.assertRaises(NotImplementedError): + ReduceServer(reducer_instance=async_reduce_handler, server_type="ERORR").start() + if __name__ == "__main__": logging.basicConfig(level=logging.DEBUG) diff --git a/tests/sink/test_async_sink.py b/tests/sink/test_async_sink.py index db2ea793..2fc22809 100644 --- a/tests/sink/test_async_sink.py +++ b/tests/sink/test_async_sink.py @@ -161,6 +161,12 @@ def test_sink_err(self) -> None: def __stub(self): return sink_pb2_grpc.SinkStub(_channel) + def test_invalid_server_type(self) -> None: + with self.assertRaises(TypeError): + SinkServer(server_type=ServerType.Async) + with self.assertRaises(NotImplementedError): + SinkServer(sinker_instance=udsink_handler, server_type="ERORR").start() + if __name__ == "__main__": logging.basicConfig(level=logging.DEBUG) diff --git a/tests/sink/test_server.py b/tests/sink/test_server.py index 28b8af3e..8678469c 100644 --- a/tests/sink/test_server.py +++ b/tests/sink/test_server.py @@ -162,6 +162,12 @@ def test_forward_message(self): self.assertEqual("mock sink message error", response.results[1].err_msg) self.assertEqual(code, StatusCode.OK) + def test_invalid_init(self): + with self.assertRaises(TypeError): + SinkServer() + with 
self.assertRaises(NotImplementedError): + SinkServer(sinker_instance=udsink_handler, server_type="ERORR").start() + if __name__ == "__main__": unittest.main() diff --git a/tests/source/test_sync_source_err.py b/tests/source/test_sync_source_err.py index b2464e2e..d92cada1 100644 --- a/tests/source/test_sync_source_err.py +++ b/tests/source/test_sync_source_err.py @@ -103,6 +103,8 @@ def test_source_partition(self): def test_invalid_input(self): with self.assertRaises(TypeError): SourceServer() + with self.assertRaises(NotImplementedError): + SourceServer(sourcer_instance=SyncSourceError(), server_type="random").start() if __name__ == "__main__": diff --git a/tests/sourcetransform/test_sync_server.py b/tests/sourcetransform/test_sync_server.py index f340878d..7ca38141 100644 --- a/tests/sourcetransform/test_sync_server.py +++ b/tests/sourcetransform/test_sync_server.py @@ -145,6 +145,10 @@ def test_mapt_assign_new_event_time(self, test_server=None): def test_invalid_input(self): with self.assertRaises(TypeError): SourceTransformServer() + with self.assertRaises(NotImplementedError): + SourceTransformServer( + source_transform_instance=transform_handler, server_type=ServerType.Async + ).start() if __name__ == "__main__": From bfde53532efd571418368678162b307f9b6c21dd Mon Sep 17 00:00:00 2001 From: Sidhant Kohli Date: Wed, 10 Jan 2024 15:15:28 -0800 Subject: [PATCH 47/78] lint Signed-off-by: Sidhant Kohli --- pynumaflow/proto/mapper/map_pb2.py | 29 +- pynumaflow/proto/mapper/map_pb2_grpc.py | 127 +++++--- pynumaflow/proto/mapstreamer/mapstream_pb2.py | 29 +- .../proto/mapstreamer/mapstream_pb2_grpc.py | 127 +++++--- pynumaflow/proto/reducer/reduce_pb2.py | 29 +- pynumaflow/proto/reducer/reduce_pb2_grpc.py | 127 +++++--- pynumaflow/proto/sideinput/sideinput_pb2.py | 21 +- .../proto/sideinput/sideinput_pb2_grpc.py | 127 +++++--- pynumaflow/proto/sinker/sink_pb2.py | 33 +- pynumaflow/proto/sinker/sink_pb2_grpc.py | 127 +++++--- pynumaflow/proto/sourcer/source_pb2.py | 69 ++-- pynumaflow/proto/sourcer/source_pb2_grpc.py | 306 +++++++++++------- .../proto/sourcetransformer/transform_pb2.py | 29 +- .../sourcetransformer/transform_pb2_grpc.py | 124 ++++--- 14 files changed, 758 insertions(+), 546 deletions(-) diff --git a/pynumaflow/proto/mapper/map_pb2.py b/pynumaflow/proto/mapper/map_pb2.py index edb4dbd4..881e4fb3 100644 --- a/pynumaflow/proto/mapper/map_pb2.py +++ b/pynumaflow/proto/mapper/map_pb2.py @@ -7,6 +7,7 @@ from google.protobuf import descriptor_pool as _descriptor_pool from google.protobuf import symbol_database as _symbol_database from google.protobuf.internal import builder as _builder + # @@protoc_insertion_point(imports) _sym_db = _symbol_database.Default() @@ -16,21 +17,23 @@ from google.protobuf import timestamp_pb2 as google_dot_protobuf_dot_timestamp__pb2 -DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\tmap.proto\x12\x06map.v1\x1a\x1bgoogle/protobuf/empty.proto\x1a\x1fgoogle/protobuf/timestamp.proto\"\x88\x01\n\nMapRequest\x12\x0c\n\x04keys\x18\x01 \x03(\t\x12\r\n\x05value\x18\x02 \x01(\x0c\x12.\n\nevent_time\x18\x03 \x01(\x0b\x32\x1a.google.protobuf.Timestamp\x12-\n\twatermark\x18\x04 \x01(\x0b\x32\x1a.google.protobuf.Timestamp\"o\n\x0bMapResponse\x12+\n\x07results\x18\x01 \x03(\x0b\x32\x1a.map.v1.MapResponse.Result\x1a\x33\n\x06Result\x12\x0c\n\x04keys\x18\x01 \x03(\t\x12\r\n\x05value\x18\x02 \x01(\x0c\x12\x0c\n\x04tags\x18\x03 \x03(\t\"\x1e\n\rReadyResponse\x12\r\n\x05ready\x18\x01 
\x01(\x08\x32q\n\x03Map\x12\x30\n\x05MapFn\x12\x12.map.v1.MapRequest\x1a\x13.map.v1.MapResponse\x12\x38\n\x07IsReady\x12\x16.google.protobuf.Empty\x1a\x15.map.v1.ReadyResponseb\x06proto3') +DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile( + b'\n\tmap.proto\x12\x06map.v1\x1a\x1bgoogle/protobuf/empty.proto\x1a\x1fgoogle/protobuf/timestamp.proto"\x88\x01\n\nMapRequest\x12\x0c\n\x04keys\x18\x01 \x03(\t\x12\r\n\x05value\x18\x02 \x01(\x0c\x12.\n\nevent_time\x18\x03 \x01(\x0b\x32\x1a.google.protobuf.Timestamp\x12-\n\twatermark\x18\x04 \x01(\x0b\x32\x1a.google.protobuf.Timestamp"o\n\x0bMapResponse\x12+\n\x07results\x18\x01 \x03(\x0b\x32\x1a.map.v1.MapResponse.Result\x1a\x33\n\x06Result\x12\x0c\n\x04keys\x18\x01 \x03(\t\x12\r\n\x05value\x18\x02 \x01(\x0c\x12\x0c\n\x04tags\x18\x03 \x03(\t"\x1e\n\rReadyResponse\x12\r\n\x05ready\x18\x01 \x01(\x08\x32q\n\x03Map\x12\x30\n\x05MapFn\x12\x12.map.v1.MapRequest\x1a\x13.map.v1.MapResponse\x12\x38\n\x07IsReady\x12\x16.google.protobuf.Empty\x1a\x15.map.v1.ReadyResponseb\x06proto3' +) _globals = globals() _builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, _globals) -_builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, 'map_pb2', _globals) +_builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, "map_pb2", _globals) if _descriptor._USE_C_DESCRIPTORS == False: - DESCRIPTOR._options = None - _globals['_MAPREQUEST']._serialized_start=84 - _globals['_MAPREQUEST']._serialized_end=220 - _globals['_MAPRESPONSE']._serialized_start=222 - _globals['_MAPRESPONSE']._serialized_end=333 - _globals['_MAPRESPONSE_RESULT']._serialized_start=282 - _globals['_MAPRESPONSE_RESULT']._serialized_end=333 - _globals['_READYRESPONSE']._serialized_start=335 - _globals['_READYRESPONSE']._serialized_end=365 - _globals['_MAP']._serialized_start=367 - _globals['_MAP']._serialized_end=480 + DESCRIPTOR._options = None + _globals["_MAPREQUEST"]._serialized_start = 84 + _globals["_MAPREQUEST"]._serialized_end = 220 + _globals["_MAPRESPONSE"]._serialized_start = 222 + _globals["_MAPRESPONSE"]._serialized_end = 333 + _globals["_MAPRESPONSE_RESULT"]._serialized_start = 282 + _globals["_MAPRESPONSE_RESULT"]._serialized_end = 333 + _globals["_READYRESPONSE"]._serialized_start = 335 + _globals["_READYRESPONSE"]._serialized_end = 365 + _globals["_MAP"]._serialized_start = 367 + _globals["_MAP"]._serialized_end = 480 # @@protoc_insertion_point(module_scope) diff --git a/pynumaflow/proto/mapper/map_pb2_grpc.py b/pynumaflow/proto/mapper/map_pb2_grpc.py index 17345658..da8edc68 100644 --- a/pynumaflow/proto/mapper/map_pb2_grpc.py +++ b/pynumaflow/proto/mapper/map_pb2_grpc.py @@ -16,87 +16,108 @@ def __init__(self, channel): channel: A grpc.Channel. 
""" self.MapFn = channel.unary_unary( - '/map.v1.Map/MapFn', - request_serializer=map__pb2.MapRequest.SerializeToString, - response_deserializer=map__pb2.MapResponse.FromString, - ) + "/map.v1.Map/MapFn", + request_serializer=map__pb2.MapRequest.SerializeToString, + response_deserializer=map__pb2.MapResponse.FromString, + ) self.IsReady = channel.unary_unary( - '/map.v1.Map/IsReady', - request_serializer=google_dot_protobuf_dot_empty__pb2.Empty.SerializeToString, - response_deserializer=map__pb2.ReadyResponse.FromString, - ) + "/map.v1.Map/IsReady", + request_serializer=google_dot_protobuf_dot_empty__pb2.Empty.SerializeToString, + response_deserializer=map__pb2.ReadyResponse.FromString, + ) class MapServicer(object): """Missing associated documentation comment in .proto file.""" def MapFn(self, request, context): - """MapFn applies a function to each map request element. - """ + """MapFn applies a function to each map request element.""" context.set_code(grpc.StatusCode.UNIMPLEMENTED) - context.set_details('Method not implemented!') - raise NotImplementedError('Method not implemented!') + context.set_details("Method not implemented!") + raise NotImplementedError("Method not implemented!") def IsReady(self, request, context): - """IsReady is the heartbeat endpoint for gRPC. - """ + """IsReady is the heartbeat endpoint for gRPC.""" context.set_code(grpc.StatusCode.UNIMPLEMENTED) - context.set_details('Method not implemented!') - raise NotImplementedError('Method not implemented!') + context.set_details("Method not implemented!") + raise NotImplementedError("Method not implemented!") def add_MapServicer_to_server(servicer, server): rpc_method_handlers = { - 'MapFn': grpc.unary_unary_rpc_method_handler( - servicer.MapFn, - request_deserializer=map__pb2.MapRequest.FromString, - response_serializer=map__pb2.MapResponse.SerializeToString, - ), - 'IsReady': grpc.unary_unary_rpc_method_handler( - servicer.IsReady, - request_deserializer=google_dot_protobuf_dot_empty__pb2.Empty.FromString, - response_serializer=map__pb2.ReadyResponse.SerializeToString, - ), + "MapFn": grpc.unary_unary_rpc_method_handler( + servicer.MapFn, + request_deserializer=map__pb2.MapRequest.FromString, + response_serializer=map__pb2.MapResponse.SerializeToString, + ), + "IsReady": grpc.unary_unary_rpc_method_handler( + servicer.IsReady, + request_deserializer=google_dot_protobuf_dot_empty__pb2.Empty.FromString, + response_serializer=map__pb2.ReadyResponse.SerializeToString, + ), } - generic_handler = grpc.method_handlers_generic_handler( - 'map.v1.Map', rpc_method_handlers) + generic_handler = grpc.method_handlers_generic_handler("map.v1.Map", rpc_method_handlers) server.add_generic_rpc_handlers((generic_handler,)) - # This class is part of an EXPERIMENTAL API. +# This class is part of an EXPERIMENTAL API. 
class Map(object): """Missing associated documentation comment in .proto file.""" @staticmethod - def MapFn(request, + def MapFn( + request, + target, + options=(), + channel_credentials=None, + call_credentials=None, + insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None, + ): + return grpc.experimental.unary_unary( + request, target, - options=(), - channel_credentials=None, - call_credentials=None, - insecure=False, - compression=None, - wait_for_ready=None, - timeout=None, - metadata=None): - return grpc.experimental.unary_unary(request, target, '/map.v1.Map/MapFn', + "/map.v1.Map/MapFn", map__pb2.MapRequest.SerializeToString, map__pb2.MapResponse.FromString, - options, channel_credentials, - insecure, call_credentials, compression, wait_for_ready, timeout, metadata) + options, + channel_credentials, + insecure, + call_credentials, + compression, + wait_for_ready, + timeout, + metadata, + ) @staticmethod - def IsReady(request, + def IsReady( + request, + target, + options=(), + channel_credentials=None, + call_credentials=None, + insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None, + ): + return grpc.experimental.unary_unary( + request, target, - options=(), - channel_credentials=None, - call_credentials=None, - insecure=False, - compression=None, - wait_for_ready=None, - timeout=None, - metadata=None): - return grpc.experimental.unary_unary(request, target, '/map.v1.Map/IsReady', + "/map.v1.Map/IsReady", google_dot_protobuf_dot_empty__pb2.Empty.SerializeToString, map__pb2.ReadyResponse.FromString, - options, channel_credentials, - insecure, call_credentials, compression, wait_for_ready, timeout, metadata) + options, + channel_credentials, + insecure, + call_credentials, + compression, + wait_for_ready, + timeout, + metadata, + ) diff --git a/pynumaflow/proto/mapstreamer/mapstream_pb2.py b/pynumaflow/proto/mapstreamer/mapstream_pb2.py index 7613aed0..abbdf0a0 100644 --- a/pynumaflow/proto/mapstreamer/mapstream_pb2.py +++ b/pynumaflow/proto/mapstreamer/mapstream_pb2.py @@ -7,6 +7,7 @@ from google.protobuf import descriptor_pool as _descriptor_pool from google.protobuf import symbol_database as _symbol_database from google.protobuf.internal import builder as _builder + # @@protoc_insertion_point(imports) _sym_db = _symbol_database.Default() @@ -16,21 +17,23 @@ from google.protobuf import timestamp_pb2 as google_dot_protobuf_dot_timestamp__pb2 -DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\x0fmapstream.proto\x12\x0cmapstream.v1\x1a\x1bgoogle/protobuf/empty.proto\x1a\x1fgoogle/protobuf/timestamp.proto\"\x8e\x01\n\x10MapStreamRequest\x12\x0c\n\x04keys\x18\x01 \x03(\t\x12\r\n\x05value\x18\x02 \x01(\x0c\x12.\n\nevent_time\x18\x03 \x01(\x0b\x32\x1a.google.protobuf.Timestamp\x12-\n\twatermark\x18\x04 \x01(\x0b\x32\x1a.google.protobuf.Timestamp\"\x80\x01\n\x11MapStreamResponse\x12\x36\n\x06result\x18\x01 \x01(\x0b\x32&.mapstream.v1.MapStreamResponse.Result\x1a\x33\n\x06Result\x12\x0c\n\x04keys\x18\x01 \x03(\t\x12\r\n\x05value\x18\x02 \x01(\x0c\x12\x0c\n\x04tags\x18\x03 \x03(\t\"\x1e\n\rReadyResponse\x12\r\n\x05ready\x18\x01 \x01(\x08\x32\x9d\x01\n\tMapStream\x12P\n\x0bMapStreamFn\x12\x1e.mapstream.v1.MapStreamRequest\x1a\x1f.mapstream.v1.MapStreamResponse0\x01\x12>\n\x07IsReady\x12\x16.google.protobuf.Empty\x1a\x1b.mapstream.v1.ReadyResponseb\x06proto3') +DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile( + 
b'\n\x0fmapstream.proto\x12\x0cmapstream.v1\x1a\x1bgoogle/protobuf/empty.proto\x1a\x1fgoogle/protobuf/timestamp.proto"\x8e\x01\n\x10MapStreamRequest\x12\x0c\n\x04keys\x18\x01 \x03(\t\x12\r\n\x05value\x18\x02 \x01(\x0c\x12.\n\nevent_time\x18\x03 \x01(\x0b\x32\x1a.google.protobuf.Timestamp\x12-\n\twatermark\x18\x04 \x01(\x0b\x32\x1a.google.protobuf.Timestamp"\x80\x01\n\x11MapStreamResponse\x12\x36\n\x06result\x18\x01 \x01(\x0b\x32&.mapstream.v1.MapStreamResponse.Result\x1a\x33\n\x06Result\x12\x0c\n\x04keys\x18\x01 \x03(\t\x12\r\n\x05value\x18\x02 \x01(\x0c\x12\x0c\n\x04tags\x18\x03 \x03(\t"\x1e\n\rReadyResponse\x12\r\n\x05ready\x18\x01 \x01(\x08\x32\x9d\x01\n\tMapStream\x12P\n\x0bMapStreamFn\x12\x1e.mapstream.v1.MapStreamRequest\x1a\x1f.mapstream.v1.MapStreamResponse0\x01\x12>\n\x07IsReady\x12\x16.google.protobuf.Empty\x1a\x1b.mapstream.v1.ReadyResponseb\x06proto3' +) _globals = globals() _builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, _globals) -_builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, 'mapstream_pb2', _globals) +_builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, "mapstream_pb2", _globals) if _descriptor._USE_C_DESCRIPTORS == False: - DESCRIPTOR._options = None - _globals['_MAPSTREAMREQUEST']._serialized_start=96 - _globals['_MAPSTREAMREQUEST']._serialized_end=238 - _globals['_MAPSTREAMRESPONSE']._serialized_start=241 - _globals['_MAPSTREAMRESPONSE']._serialized_end=369 - _globals['_MAPSTREAMRESPONSE_RESULT']._serialized_start=318 - _globals['_MAPSTREAMRESPONSE_RESULT']._serialized_end=369 - _globals['_READYRESPONSE']._serialized_start=371 - _globals['_READYRESPONSE']._serialized_end=401 - _globals['_MAPSTREAM']._serialized_start=404 - _globals['_MAPSTREAM']._serialized_end=561 + DESCRIPTOR._options = None + _globals["_MAPSTREAMREQUEST"]._serialized_start = 96 + _globals["_MAPSTREAMREQUEST"]._serialized_end = 238 + _globals["_MAPSTREAMRESPONSE"]._serialized_start = 241 + _globals["_MAPSTREAMRESPONSE"]._serialized_end = 369 + _globals["_MAPSTREAMRESPONSE_RESULT"]._serialized_start = 318 + _globals["_MAPSTREAMRESPONSE_RESULT"]._serialized_end = 369 + _globals["_READYRESPONSE"]._serialized_start = 371 + _globals["_READYRESPONSE"]._serialized_end = 401 + _globals["_MAPSTREAM"]._serialized_start = 404 + _globals["_MAPSTREAM"]._serialized_end = 561 # @@protoc_insertion_point(module_scope) diff --git a/pynumaflow/proto/mapstreamer/mapstream_pb2_grpc.py b/pynumaflow/proto/mapstreamer/mapstream_pb2_grpc.py index 222a1614..305c8e05 100644 --- a/pynumaflow/proto/mapstreamer/mapstream_pb2_grpc.py +++ b/pynumaflow/proto/mapstreamer/mapstream_pb2_grpc.py @@ -16,87 +16,110 @@ def __init__(self, channel): channel: A grpc.Channel. 
""" self.MapStreamFn = channel.unary_stream( - '/mapstream.v1.MapStream/MapStreamFn', - request_serializer=mapstream__pb2.MapStreamRequest.SerializeToString, - response_deserializer=mapstream__pb2.MapStreamResponse.FromString, - ) + "/mapstream.v1.MapStream/MapStreamFn", + request_serializer=mapstream__pb2.MapStreamRequest.SerializeToString, + response_deserializer=mapstream__pb2.MapStreamResponse.FromString, + ) self.IsReady = channel.unary_unary( - '/mapstream.v1.MapStream/IsReady', - request_serializer=google_dot_protobuf_dot_empty__pb2.Empty.SerializeToString, - response_deserializer=mapstream__pb2.ReadyResponse.FromString, - ) + "/mapstream.v1.MapStream/IsReady", + request_serializer=google_dot_protobuf_dot_empty__pb2.Empty.SerializeToString, + response_deserializer=mapstream__pb2.ReadyResponse.FromString, + ) class MapStreamServicer(object): """Missing associated documentation comment in .proto file.""" def MapStreamFn(self, request, context): - """MapStreamFn applies a function to each request element and returns a stream. - """ + """MapStreamFn applies a function to each request element and returns a stream.""" context.set_code(grpc.StatusCode.UNIMPLEMENTED) - context.set_details('Method not implemented!') - raise NotImplementedError('Method not implemented!') + context.set_details("Method not implemented!") + raise NotImplementedError("Method not implemented!") def IsReady(self, request, context): - """IsReady is the heartbeat endpoint for gRPC. - """ + """IsReady is the heartbeat endpoint for gRPC.""" context.set_code(grpc.StatusCode.UNIMPLEMENTED) - context.set_details('Method not implemented!') - raise NotImplementedError('Method not implemented!') + context.set_details("Method not implemented!") + raise NotImplementedError("Method not implemented!") def add_MapStreamServicer_to_server(servicer, server): rpc_method_handlers = { - 'MapStreamFn': grpc.unary_stream_rpc_method_handler( - servicer.MapStreamFn, - request_deserializer=mapstream__pb2.MapStreamRequest.FromString, - response_serializer=mapstream__pb2.MapStreamResponse.SerializeToString, - ), - 'IsReady': grpc.unary_unary_rpc_method_handler( - servicer.IsReady, - request_deserializer=google_dot_protobuf_dot_empty__pb2.Empty.FromString, - response_serializer=mapstream__pb2.ReadyResponse.SerializeToString, - ), + "MapStreamFn": grpc.unary_stream_rpc_method_handler( + servicer.MapStreamFn, + request_deserializer=mapstream__pb2.MapStreamRequest.FromString, + response_serializer=mapstream__pb2.MapStreamResponse.SerializeToString, + ), + "IsReady": grpc.unary_unary_rpc_method_handler( + servicer.IsReady, + request_deserializer=google_dot_protobuf_dot_empty__pb2.Empty.FromString, + response_serializer=mapstream__pb2.ReadyResponse.SerializeToString, + ), } generic_handler = grpc.method_handlers_generic_handler( - 'mapstream.v1.MapStream', rpc_method_handlers) + "mapstream.v1.MapStream", rpc_method_handlers + ) server.add_generic_rpc_handlers((generic_handler,)) - # This class is part of an EXPERIMENTAL API. +# This class is part of an EXPERIMENTAL API. 
class MapStream(object): """Missing associated documentation comment in .proto file.""" @staticmethod - def MapStreamFn(request, + def MapStreamFn( + request, + target, + options=(), + channel_credentials=None, + call_credentials=None, + insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None, + ): + return grpc.experimental.unary_stream( + request, target, - options=(), - channel_credentials=None, - call_credentials=None, - insecure=False, - compression=None, - wait_for_ready=None, - timeout=None, - metadata=None): - return grpc.experimental.unary_stream(request, target, '/mapstream.v1.MapStream/MapStreamFn', + "/mapstream.v1.MapStream/MapStreamFn", mapstream__pb2.MapStreamRequest.SerializeToString, mapstream__pb2.MapStreamResponse.FromString, - options, channel_credentials, - insecure, call_credentials, compression, wait_for_ready, timeout, metadata) + options, + channel_credentials, + insecure, + call_credentials, + compression, + wait_for_ready, + timeout, + metadata, + ) @staticmethod - def IsReady(request, + def IsReady( + request, + target, + options=(), + channel_credentials=None, + call_credentials=None, + insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None, + ): + return grpc.experimental.unary_unary( + request, target, - options=(), - channel_credentials=None, - call_credentials=None, - insecure=False, - compression=None, - wait_for_ready=None, - timeout=None, - metadata=None): - return grpc.experimental.unary_unary(request, target, '/mapstream.v1.MapStream/IsReady', + "/mapstream.v1.MapStream/IsReady", google_dot_protobuf_dot_empty__pb2.Empty.SerializeToString, mapstream__pb2.ReadyResponse.FromString, - options, channel_credentials, - insecure, call_credentials, compression, wait_for_ready, timeout, metadata) + options, + channel_credentials, + insecure, + call_credentials, + compression, + wait_for_ready, + timeout, + metadata, + ) diff --git a/pynumaflow/proto/reducer/reduce_pb2.py b/pynumaflow/proto/reducer/reduce_pb2.py index ec107b83..e5b2aceb 100644 --- a/pynumaflow/proto/reducer/reduce_pb2.py +++ b/pynumaflow/proto/reducer/reduce_pb2.py @@ -7,6 +7,7 @@ from google.protobuf import descriptor_pool as _descriptor_pool from google.protobuf import symbol_database as _symbol_database from google.protobuf.internal import builder as _builder + # @@protoc_insertion_point(imports) _sym_db = _symbol_database.Default() @@ -16,21 +17,23 @@ from google.protobuf import timestamp_pb2 as google_dot_protobuf_dot_timestamp__pb2 -DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\x0creduce.proto\x12\treduce.v1\x1a\x1bgoogle/protobuf/empty.proto\x1a\x1fgoogle/protobuf/timestamp.proto\"\x8b\x01\n\rReduceRequest\x12\x0c\n\x04keys\x18\x01 \x03(\t\x12\r\n\x05value\x18\x02 \x01(\x0c\x12.\n\nevent_time\x18\x03 \x01(\x0b\x32\x1a.google.protobuf.Timestamp\x12-\n\twatermark\x18\x04 \x01(\x0b\x32\x1a.google.protobuf.Timestamp\"x\n\x0eReduceResponse\x12\x31\n\x07results\x18\x01 \x03(\x0b\x32 .reduce.v1.ReduceResponse.Result\x1a\x33\n\x06Result\x12\x0c\n\x04keys\x18\x01 \x03(\t\x12\r\n\x05value\x18\x02 \x01(\x0c\x12\x0c\n\x04tags\x18\x03 \x03(\t\"\x1e\n\rReadyResponse\x12\r\n\x05ready\x18\x01 \x01(\x08\x32\x8a\x01\n\x06Reduce\x12\x43\n\x08ReduceFn\x12\x18.reduce.v1.ReduceRequest\x1a\x19.reduce.v1.ReduceResponse(\x01\x30\x01\x12;\n\x07IsReady\x12\x16.google.protobuf.Empty\x1a\x18.reduce.v1.ReadyResponseb\x06proto3') +DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile( + 
b'\n\x0creduce.proto\x12\treduce.v1\x1a\x1bgoogle/protobuf/empty.proto\x1a\x1fgoogle/protobuf/timestamp.proto"\x8b\x01\n\rReduceRequest\x12\x0c\n\x04keys\x18\x01 \x03(\t\x12\r\n\x05value\x18\x02 \x01(\x0c\x12.\n\nevent_time\x18\x03 \x01(\x0b\x32\x1a.google.protobuf.Timestamp\x12-\n\twatermark\x18\x04 \x01(\x0b\x32\x1a.google.protobuf.Timestamp"x\n\x0eReduceResponse\x12\x31\n\x07results\x18\x01 \x03(\x0b\x32 .reduce.v1.ReduceResponse.Result\x1a\x33\n\x06Result\x12\x0c\n\x04keys\x18\x01 \x03(\t\x12\r\n\x05value\x18\x02 \x01(\x0c\x12\x0c\n\x04tags\x18\x03 \x03(\t"\x1e\n\rReadyResponse\x12\r\n\x05ready\x18\x01 \x01(\x08\x32\x8a\x01\n\x06Reduce\x12\x43\n\x08ReduceFn\x12\x18.reduce.v1.ReduceRequest\x1a\x19.reduce.v1.ReduceResponse(\x01\x30\x01\x12;\n\x07IsReady\x12\x16.google.protobuf.Empty\x1a\x18.reduce.v1.ReadyResponseb\x06proto3' +) _globals = globals() _builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, _globals) -_builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, 'reduce_pb2', _globals) +_builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, "reduce_pb2", _globals) if _descriptor._USE_C_DESCRIPTORS == False: - DESCRIPTOR._options = None - _globals['_REDUCEREQUEST']._serialized_start=90 - _globals['_REDUCEREQUEST']._serialized_end=229 - _globals['_REDUCERESPONSE']._serialized_start=231 - _globals['_REDUCERESPONSE']._serialized_end=351 - _globals['_REDUCERESPONSE_RESULT']._serialized_start=300 - _globals['_REDUCERESPONSE_RESULT']._serialized_end=351 - _globals['_READYRESPONSE']._serialized_start=353 - _globals['_READYRESPONSE']._serialized_end=383 - _globals['_REDUCE']._serialized_start=386 - _globals['_REDUCE']._serialized_end=524 + DESCRIPTOR._options = None + _globals["_REDUCEREQUEST"]._serialized_start = 90 + _globals["_REDUCEREQUEST"]._serialized_end = 229 + _globals["_REDUCERESPONSE"]._serialized_start = 231 + _globals["_REDUCERESPONSE"]._serialized_end = 351 + _globals["_REDUCERESPONSE_RESULT"]._serialized_start = 300 + _globals["_REDUCERESPONSE_RESULT"]._serialized_end = 351 + _globals["_READYRESPONSE"]._serialized_start = 353 + _globals["_READYRESPONSE"]._serialized_end = 383 + _globals["_REDUCE"]._serialized_start = 386 + _globals["_REDUCE"]._serialized_end = 524 # @@protoc_insertion_point(module_scope) diff --git a/pynumaflow/proto/reducer/reduce_pb2_grpc.py b/pynumaflow/proto/reducer/reduce_pb2_grpc.py index 1fd860b4..5a0a15f6 100644 --- a/pynumaflow/proto/reducer/reduce_pb2_grpc.py +++ b/pynumaflow/proto/reducer/reduce_pb2_grpc.py @@ -16,87 +16,108 @@ def __init__(self, channel): channel: A grpc.Channel. """ self.ReduceFn = channel.stream_stream( - '/reduce.v1.Reduce/ReduceFn', - request_serializer=reduce__pb2.ReduceRequest.SerializeToString, - response_deserializer=reduce__pb2.ReduceResponse.FromString, - ) + "/reduce.v1.Reduce/ReduceFn", + request_serializer=reduce__pb2.ReduceRequest.SerializeToString, + response_deserializer=reduce__pb2.ReduceResponse.FromString, + ) self.IsReady = channel.unary_unary( - '/reduce.v1.Reduce/IsReady', - request_serializer=google_dot_protobuf_dot_empty__pb2.Empty.SerializeToString, - response_deserializer=reduce__pb2.ReadyResponse.FromString, - ) + "/reduce.v1.Reduce/IsReady", + request_serializer=google_dot_protobuf_dot_empty__pb2.Empty.SerializeToString, + response_deserializer=reduce__pb2.ReadyResponse.FromString, + ) class ReduceServicer(object): """Missing associated documentation comment in .proto file.""" def ReduceFn(self, request_iterator, context): - """ReduceFn applies a reduce function to a request stream. 
- """ + """ReduceFn applies a reduce function to a request stream.""" context.set_code(grpc.StatusCode.UNIMPLEMENTED) - context.set_details('Method not implemented!') - raise NotImplementedError('Method not implemented!') + context.set_details("Method not implemented!") + raise NotImplementedError("Method not implemented!") def IsReady(self, request, context): - """IsReady is the heartbeat endpoint for gRPC. - """ + """IsReady is the heartbeat endpoint for gRPC.""" context.set_code(grpc.StatusCode.UNIMPLEMENTED) - context.set_details('Method not implemented!') - raise NotImplementedError('Method not implemented!') + context.set_details("Method not implemented!") + raise NotImplementedError("Method not implemented!") def add_ReduceServicer_to_server(servicer, server): rpc_method_handlers = { - 'ReduceFn': grpc.stream_stream_rpc_method_handler( - servicer.ReduceFn, - request_deserializer=reduce__pb2.ReduceRequest.FromString, - response_serializer=reduce__pb2.ReduceResponse.SerializeToString, - ), - 'IsReady': grpc.unary_unary_rpc_method_handler( - servicer.IsReady, - request_deserializer=google_dot_protobuf_dot_empty__pb2.Empty.FromString, - response_serializer=reduce__pb2.ReadyResponse.SerializeToString, - ), + "ReduceFn": grpc.stream_stream_rpc_method_handler( + servicer.ReduceFn, + request_deserializer=reduce__pb2.ReduceRequest.FromString, + response_serializer=reduce__pb2.ReduceResponse.SerializeToString, + ), + "IsReady": grpc.unary_unary_rpc_method_handler( + servicer.IsReady, + request_deserializer=google_dot_protobuf_dot_empty__pb2.Empty.FromString, + response_serializer=reduce__pb2.ReadyResponse.SerializeToString, + ), } - generic_handler = grpc.method_handlers_generic_handler( - 'reduce.v1.Reduce', rpc_method_handlers) + generic_handler = grpc.method_handlers_generic_handler("reduce.v1.Reduce", rpc_method_handlers) server.add_generic_rpc_handlers((generic_handler,)) - # This class is part of an EXPERIMENTAL API. +# This class is part of an EXPERIMENTAL API. 
class Reduce(object): """Missing associated documentation comment in .proto file.""" @staticmethod - def ReduceFn(request_iterator, + def ReduceFn( + request_iterator, + target, + options=(), + channel_credentials=None, + call_credentials=None, + insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None, + ): + return grpc.experimental.stream_stream( + request_iterator, target, - options=(), - channel_credentials=None, - call_credentials=None, - insecure=False, - compression=None, - wait_for_ready=None, - timeout=None, - metadata=None): - return grpc.experimental.stream_stream(request_iterator, target, '/reduce.v1.Reduce/ReduceFn', + "/reduce.v1.Reduce/ReduceFn", reduce__pb2.ReduceRequest.SerializeToString, reduce__pb2.ReduceResponse.FromString, - options, channel_credentials, - insecure, call_credentials, compression, wait_for_ready, timeout, metadata) + options, + channel_credentials, + insecure, + call_credentials, + compression, + wait_for_ready, + timeout, + metadata, + ) @staticmethod - def IsReady(request, + def IsReady( + request, + target, + options=(), + channel_credentials=None, + call_credentials=None, + insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None, + ): + return grpc.experimental.unary_unary( + request, target, - options=(), - channel_credentials=None, - call_credentials=None, - insecure=False, - compression=None, - wait_for_ready=None, - timeout=None, - metadata=None): - return grpc.experimental.unary_unary(request, target, '/reduce.v1.Reduce/IsReady', + "/reduce.v1.Reduce/IsReady", google_dot_protobuf_dot_empty__pb2.Empty.SerializeToString, reduce__pb2.ReadyResponse.FromString, - options, channel_credentials, - insecure, call_credentials, compression, wait_for_ready, timeout, metadata) + options, + channel_credentials, + insecure, + call_credentials, + compression, + wait_for_ready, + timeout, + metadata, + ) diff --git a/pynumaflow/proto/sideinput/sideinput_pb2.py b/pynumaflow/proto/sideinput/sideinput_pb2.py index 50d3de7c..82983082 100644 --- a/pynumaflow/proto/sideinput/sideinput_pb2.py +++ b/pynumaflow/proto/sideinput/sideinput_pb2.py @@ -7,6 +7,7 @@ from google.protobuf import descriptor_pool as _descriptor_pool from google.protobuf import symbol_database as _symbol_database from google.protobuf.internal import builder as _builder + # @@protoc_insertion_point(imports) _sym_db = _symbol_database.Default() @@ -15,17 +16,19 @@ from google.protobuf import empty_pb2 as google_dot_protobuf_dot_empty__pb2 -DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\x0fsideinput.proto\x12\x0csideinput.v1\x1a\x1bgoogle/protobuf/empty.proto\"8\n\x11SideInputResponse\x12\r\n\x05value\x18\x01 \x01(\x0c\x12\x14\n\x0cno_broadcast\x18\x02 \x01(\x08\"\x1e\n\rReadyResponse\x12\r\n\x05ready\x18\x01 \x01(\x08\x32\x99\x01\n\tSideInput\x12L\n\x11RetrieveSideInput\x12\x16.google.protobuf.Empty\x1a\x1f.sideinput.v1.SideInputResponse\x12>\n\x07IsReady\x12\x16.google.protobuf.Empty\x1a\x1b.sideinput.v1.ReadyResponseb\x06proto3') +DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile( + b'\n\x0fsideinput.proto\x12\x0csideinput.v1\x1a\x1bgoogle/protobuf/empty.proto"8\n\x11SideInputResponse\x12\r\n\x05value\x18\x01 \x01(\x0c\x12\x14\n\x0cno_broadcast\x18\x02 \x01(\x08"\x1e\n\rReadyResponse\x12\r\n\x05ready\x18\x01 
\x01(\x08\x32\x99\x01\n\tSideInput\x12L\n\x11RetrieveSideInput\x12\x16.google.protobuf.Empty\x1a\x1f.sideinput.v1.SideInputResponse\x12>\n\x07IsReady\x12\x16.google.protobuf.Empty\x1a\x1b.sideinput.v1.ReadyResponseb\x06proto3' +) _globals = globals() _builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, _globals) -_builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, 'sideinput_pb2', _globals) +_builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, "sideinput_pb2", _globals) if _descriptor._USE_C_DESCRIPTORS == False: - DESCRIPTOR._options = None - _globals['_SIDEINPUTRESPONSE']._serialized_start=62 - _globals['_SIDEINPUTRESPONSE']._serialized_end=118 - _globals['_READYRESPONSE']._serialized_start=120 - _globals['_READYRESPONSE']._serialized_end=150 - _globals['_SIDEINPUT']._serialized_start=153 - _globals['_SIDEINPUT']._serialized_end=306 + DESCRIPTOR._options = None + _globals["_SIDEINPUTRESPONSE"]._serialized_start = 62 + _globals["_SIDEINPUTRESPONSE"]._serialized_end = 118 + _globals["_READYRESPONSE"]._serialized_start = 120 + _globals["_READYRESPONSE"]._serialized_end = 150 + _globals["_SIDEINPUT"]._serialized_start = 153 + _globals["_SIDEINPUT"]._serialized_end = 306 # @@protoc_insertion_point(module_scope) diff --git a/pynumaflow/proto/sideinput/sideinput_pb2_grpc.py b/pynumaflow/proto/sideinput/sideinput_pb2_grpc.py index 8abe64d2..72ea87ed 100644 --- a/pynumaflow/proto/sideinput/sideinput_pb2_grpc.py +++ b/pynumaflow/proto/sideinput/sideinput_pb2_grpc.py @@ -24,15 +24,15 @@ def __init__(self, channel): channel: A grpc.Channel. """ self.RetrieveSideInput = channel.unary_unary( - '/sideinput.v1.SideInput/RetrieveSideInput', - request_serializer=google_dot_protobuf_dot_empty__pb2.Empty.SerializeToString, - response_deserializer=sideinput__pb2.SideInputResponse.FromString, - ) + "/sideinput.v1.SideInput/RetrieveSideInput", + request_serializer=google_dot_protobuf_dot_empty__pb2.Empty.SerializeToString, + response_deserializer=sideinput__pb2.SideInputResponse.FromString, + ) self.IsReady = channel.unary_unary( - '/sideinput.v1.SideInput/IsReady', - request_serializer=google_dot_protobuf_dot_empty__pb2.Empty.SerializeToString, - response_deserializer=sideinput__pb2.ReadyResponse.FromString, - ) + "/sideinput.v1.SideInput/IsReady", + request_serializer=google_dot_protobuf_dot_empty__pb2.Empty.SerializeToString, + response_deserializer=sideinput__pb2.ReadyResponse.FromString, + ) class SideInputServicer(object): @@ -47,39 +47,38 @@ class SideInputServicer(object): """ def RetrieveSideInput(self, request, context): - """RetrieveSideInput is the endpoint to retrieve the latest value of a given Side Input. - """ + """RetrieveSideInput is the endpoint to retrieve the latest value of a given Side Input.""" context.set_code(grpc.StatusCode.UNIMPLEMENTED) - context.set_details('Method not implemented!') - raise NotImplementedError('Method not implemented!') + context.set_details("Method not implemented!") + raise NotImplementedError("Method not implemented!") def IsReady(self, request, context): - """IsReady is the health check endpoint to indicate whether the service is ready to be used. 
- """ + """IsReady is the health check endpoint to indicate whether the service is ready to be used.""" context.set_code(grpc.StatusCode.UNIMPLEMENTED) - context.set_details('Method not implemented!') - raise NotImplementedError('Method not implemented!') + context.set_details("Method not implemented!") + raise NotImplementedError("Method not implemented!") def add_SideInputServicer_to_server(servicer, server): rpc_method_handlers = { - 'RetrieveSideInput': grpc.unary_unary_rpc_method_handler( - servicer.RetrieveSideInput, - request_deserializer=google_dot_protobuf_dot_empty__pb2.Empty.FromString, - response_serializer=sideinput__pb2.SideInputResponse.SerializeToString, - ), - 'IsReady': grpc.unary_unary_rpc_method_handler( - servicer.IsReady, - request_deserializer=google_dot_protobuf_dot_empty__pb2.Empty.FromString, - response_serializer=sideinput__pb2.ReadyResponse.SerializeToString, - ), + "RetrieveSideInput": grpc.unary_unary_rpc_method_handler( + servicer.RetrieveSideInput, + request_deserializer=google_dot_protobuf_dot_empty__pb2.Empty.FromString, + response_serializer=sideinput__pb2.SideInputResponse.SerializeToString, + ), + "IsReady": grpc.unary_unary_rpc_method_handler( + servicer.IsReady, + request_deserializer=google_dot_protobuf_dot_empty__pb2.Empty.FromString, + response_serializer=sideinput__pb2.ReadyResponse.SerializeToString, + ), } generic_handler = grpc.method_handlers_generic_handler( - 'sideinput.v1.SideInput', rpc_method_handlers) + "sideinput.v1.SideInput", rpc_method_handlers + ) server.add_generic_rpc_handlers((generic_handler,)) - # This class is part of an EXPERIMENTAL API. +# This class is part of an EXPERIMENTAL API. class SideInput(object): """SideInput is the gRPC service for user-defined Side Inputs. It is used to propagate changes in the values of the provided Side Inputs @@ -92,35 +91,59 @@ class SideInput(object): """ @staticmethod - def RetrieveSideInput(request, + def RetrieveSideInput( + request, + target, + options=(), + channel_credentials=None, + call_credentials=None, + insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None, + ): + return grpc.experimental.unary_unary( + request, target, - options=(), - channel_credentials=None, - call_credentials=None, - insecure=False, - compression=None, - wait_for_ready=None, - timeout=None, - metadata=None): - return grpc.experimental.unary_unary(request, target, '/sideinput.v1.SideInput/RetrieveSideInput', + "/sideinput.v1.SideInput/RetrieveSideInput", google_dot_protobuf_dot_empty__pb2.Empty.SerializeToString, sideinput__pb2.SideInputResponse.FromString, - options, channel_credentials, - insecure, call_credentials, compression, wait_for_ready, timeout, metadata) + options, + channel_credentials, + insecure, + call_credentials, + compression, + wait_for_ready, + timeout, + metadata, + ) @staticmethod - def IsReady(request, + def IsReady( + request, + target, + options=(), + channel_credentials=None, + call_credentials=None, + insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None, + ): + return grpc.experimental.unary_unary( + request, target, - options=(), - channel_credentials=None, - call_credentials=None, - insecure=False, - compression=None, - wait_for_ready=None, - timeout=None, - metadata=None): - return grpc.experimental.unary_unary(request, target, '/sideinput.v1.SideInput/IsReady', + "/sideinput.v1.SideInput/IsReady", google_dot_protobuf_dot_empty__pb2.Empty.SerializeToString, sideinput__pb2.ReadyResponse.FromString, - options, 
channel_credentials, - insecure, call_credentials, compression, wait_for_ready, timeout, metadata) + options, + channel_credentials, + insecure, + call_credentials, + compression, + wait_for_ready, + timeout, + metadata, + ) diff --git a/pynumaflow/proto/sinker/sink_pb2.py b/pynumaflow/proto/sinker/sink_pb2.py index deb73b56..eada281c 100644 --- a/pynumaflow/proto/sinker/sink_pb2.py +++ b/pynumaflow/proto/sinker/sink_pb2.py @@ -7,6 +7,7 @@ from google.protobuf import descriptor_pool as _descriptor_pool from google.protobuf import symbol_database as _symbol_database from google.protobuf.internal import builder as _builder + # @@protoc_insertion_point(imports) _sym_db = _symbol_database.Default() @@ -16,22 +17,26 @@ from google.protobuf import timestamp_pb2 as google_dot_protobuf_dot_timestamp__pb2 -DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\nsink.proto\x12\x07sink.v1\x1a\x1bgoogle/protobuf/empty.proto\x1a\x1fgoogle/protobuf/timestamp.proto\"\x95\x01\n\x0bSinkRequest\x12\x0c\n\x04keys\x18\x01 \x03(\t\x12\r\n\x05value\x18\x02 \x01(\x0c\x12.\n\nevent_time\x18\x03 \x01(\x0b\x32\x1a.google.protobuf.Timestamp\x12-\n\twatermark\x18\x04 \x01(\x0b\x32\x1a.google.protobuf.Timestamp\x12\n\n\x02id\x18\x05 \x01(\t\"\x1e\n\rReadyResponse\x12\r\n\x05ready\x18\x01 \x01(\x08\"u\n\x0cSinkResponse\x12-\n\x07results\x18\x01 \x03(\x0b\x32\x1c.sink.v1.SinkResponse.Result\x1a\x36\n\x06Result\x12\n\n\x02id\x18\x01 \x01(\t\x12\x0f\n\x07success\x18\x02 \x01(\x08\x12\x0f\n\x07\x65rr_msg\x18\x03 \x01(\t2z\n\x04Sink\x12\x37\n\x06SinkFn\x12\x14.sink.v1.SinkRequest\x1a\x15.sink.v1.SinkResponse(\x01\x12\x39\n\x07IsReady\x12\x16.google.protobuf.Empty\x1a\x16.sink.v1.ReadyResponseB8Z6github.com/numaproj/numaflow-go/pkg/apis/proto/sink/v1b\x06proto3') +DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile( + b'\n\nsink.proto\x12\x07sink.v1\x1a\x1bgoogle/protobuf/empty.proto\x1a\x1fgoogle/protobuf/timestamp.proto"\x95\x01\n\x0bSinkRequest\x12\x0c\n\x04keys\x18\x01 \x03(\t\x12\r\n\x05value\x18\x02 \x01(\x0c\x12.\n\nevent_time\x18\x03 \x01(\x0b\x32\x1a.google.protobuf.Timestamp\x12-\n\twatermark\x18\x04 \x01(\x0b\x32\x1a.google.protobuf.Timestamp\x12\n\n\x02id\x18\x05 \x01(\t"\x1e\n\rReadyResponse\x12\r\n\x05ready\x18\x01 \x01(\x08"u\n\x0cSinkResponse\x12-\n\x07results\x18\x01 \x03(\x0b\x32\x1c.sink.v1.SinkResponse.Result\x1a\x36\n\x06Result\x12\n\n\x02id\x18\x01 \x01(\t\x12\x0f\n\x07success\x18\x02 \x01(\x08\x12\x0f\n\x07\x65rr_msg\x18\x03 \x01(\t2z\n\x04Sink\x12\x37\n\x06SinkFn\x12\x14.sink.v1.SinkRequest\x1a\x15.sink.v1.SinkResponse(\x01\x12\x39\n\x07IsReady\x12\x16.google.protobuf.Empty\x1a\x16.sink.v1.ReadyResponseB8Z6github.com/numaproj/numaflow-go/pkg/apis/proto/sink/v1b\x06proto3' +) _globals = globals() _builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, _globals) -_builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, 'sink_pb2', _globals) +_builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, "sink_pb2", _globals) if _descriptor._USE_C_DESCRIPTORS == False: - _globals['DESCRIPTOR']._options = None - _globals['DESCRIPTOR']._serialized_options = b'Z6github.com/numaproj/numaflow-go/pkg/apis/proto/sink/v1' - _globals['_SINKREQUEST']._serialized_start=86 - _globals['_SINKREQUEST']._serialized_end=235 - _globals['_READYRESPONSE']._serialized_start=237 - _globals['_READYRESPONSE']._serialized_end=267 - _globals['_SINKRESPONSE']._serialized_start=269 - _globals['_SINKRESPONSE']._serialized_end=386 - _globals['_SINKRESPONSE_RESULT']._serialized_start=332 - 
_globals['_SINKRESPONSE_RESULT']._serialized_end=386 - _globals['_SINK']._serialized_start=388 - _globals['_SINK']._serialized_end=510 + _globals["DESCRIPTOR"]._options = None + _globals[ + "DESCRIPTOR" + ]._serialized_options = b"Z6github.com/numaproj/numaflow-go/pkg/apis/proto/sink/v1" + _globals["_SINKREQUEST"]._serialized_start = 86 + _globals["_SINKREQUEST"]._serialized_end = 235 + _globals["_READYRESPONSE"]._serialized_start = 237 + _globals["_READYRESPONSE"]._serialized_end = 267 + _globals["_SINKRESPONSE"]._serialized_start = 269 + _globals["_SINKRESPONSE"]._serialized_end = 386 + _globals["_SINKRESPONSE_RESULT"]._serialized_start = 332 + _globals["_SINKRESPONSE_RESULT"]._serialized_end = 386 + _globals["_SINK"]._serialized_start = 388 + _globals["_SINK"]._serialized_end = 510 # @@protoc_insertion_point(module_scope) diff --git a/pynumaflow/proto/sinker/sink_pb2_grpc.py b/pynumaflow/proto/sinker/sink_pb2_grpc.py index 2f5089a9..ef673e9d 100644 --- a/pynumaflow/proto/sinker/sink_pb2_grpc.py +++ b/pynumaflow/proto/sinker/sink_pb2_grpc.py @@ -16,87 +16,108 @@ def __init__(self, channel): channel: A grpc.Channel. """ self.SinkFn = channel.stream_unary( - '/sink.v1.Sink/SinkFn', - request_serializer=sink__pb2.SinkRequest.SerializeToString, - response_deserializer=sink__pb2.SinkResponse.FromString, - ) + "/sink.v1.Sink/SinkFn", + request_serializer=sink__pb2.SinkRequest.SerializeToString, + response_deserializer=sink__pb2.SinkResponse.FromString, + ) self.IsReady = channel.unary_unary( - '/sink.v1.Sink/IsReady', - request_serializer=google_dot_protobuf_dot_empty__pb2.Empty.SerializeToString, - response_deserializer=sink__pb2.ReadyResponse.FromString, - ) + "/sink.v1.Sink/IsReady", + request_serializer=google_dot_protobuf_dot_empty__pb2.Empty.SerializeToString, + response_deserializer=sink__pb2.ReadyResponse.FromString, + ) class SinkServicer(object): """Missing associated documentation comment in .proto file.""" def SinkFn(self, request_iterator, context): - """SinkFn writes the request to a user defined sink. - """ + """SinkFn writes the request to a user defined sink.""" context.set_code(grpc.StatusCode.UNIMPLEMENTED) - context.set_details('Method not implemented!') - raise NotImplementedError('Method not implemented!') + context.set_details("Method not implemented!") + raise NotImplementedError("Method not implemented!") def IsReady(self, request, context): - """IsReady is the heartbeat endpoint for gRPC. 
- """ + """IsReady is the heartbeat endpoint for gRPC.""" context.set_code(grpc.StatusCode.UNIMPLEMENTED) - context.set_details('Method not implemented!') - raise NotImplementedError('Method not implemented!') + context.set_details("Method not implemented!") + raise NotImplementedError("Method not implemented!") def add_SinkServicer_to_server(servicer, server): rpc_method_handlers = { - 'SinkFn': grpc.stream_unary_rpc_method_handler( - servicer.SinkFn, - request_deserializer=sink__pb2.SinkRequest.FromString, - response_serializer=sink__pb2.SinkResponse.SerializeToString, - ), - 'IsReady': grpc.unary_unary_rpc_method_handler( - servicer.IsReady, - request_deserializer=google_dot_protobuf_dot_empty__pb2.Empty.FromString, - response_serializer=sink__pb2.ReadyResponse.SerializeToString, - ), + "SinkFn": grpc.stream_unary_rpc_method_handler( + servicer.SinkFn, + request_deserializer=sink__pb2.SinkRequest.FromString, + response_serializer=sink__pb2.SinkResponse.SerializeToString, + ), + "IsReady": grpc.unary_unary_rpc_method_handler( + servicer.IsReady, + request_deserializer=google_dot_protobuf_dot_empty__pb2.Empty.FromString, + response_serializer=sink__pb2.ReadyResponse.SerializeToString, + ), } - generic_handler = grpc.method_handlers_generic_handler( - 'sink.v1.Sink', rpc_method_handlers) + generic_handler = grpc.method_handlers_generic_handler("sink.v1.Sink", rpc_method_handlers) server.add_generic_rpc_handlers((generic_handler,)) - # This class is part of an EXPERIMENTAL API. +# This class is part of an EXPERIMENTAL API. class Sink(object): """Missing associated documentation comment in .proto file.""" @staticmethod - def SinkFn(request_iterator, + def SinkFn( + request_iterator, + target, + options=(), + channel_credentials=None, + call_credentials=None, + insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None, + ): + return grpc.experimental.stream_unary( + request_iterator, target, - options=(), - channel_credentials=None, - call_credentials=None, - insecure=False, - compression=None, - wait_for_ready=None, - timeout=None, - metadata=None): - return grpc.experimental.stream_unary(request_iterator, target, '/sink.v1.Sink/SinkFn', + "/sink.v1.Sink/SinkFn", sink__pb2.SinkRequest.SerializeToString, sink__pb2.SinkResponse.FromString, - options, channel_credentials, - insecure, call_credentials, compression, wait_for_ready, timeout, metadata) + options, + channel_credentials, + insecure, + call_credentials, + compression, + wait_for_ready, + timeout, + metadata, + ) @staticmethod - def IsReady(request, + def IsReady( + request, + target, + options=(), + channel_credentials=None, + call_credentials=None, + insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None, + ): + return grpc.experimental.unary_unary( + request, target, - options=(), - channel_credentials=None, - call_credentials=None, - insecure=False, - compression=None, - wait_for_ready=None, - timeout=None, - metadata=None): - return grpc.experimental.unary_unary(request, target, '/sink.v1.Sink/IsReady', + "/sink.v1.Sink/IsReady", google_dot_protobuf_dot_empty__pb2.Empty.SerializeToString, sink__pb2.ReadyResponse.FromString, - options, channel_credentials, - insecure, call_credentials, compression, wait_for_ready, timeout, metadata) + options, + channel_credentials, + insecure, + call_credentials, + compression, + wait_for_ready, + timeout, + metadata, + ) diff --git a/pynumaflow/proto/sourcer/source_pb2.py b/pynumaflow/proto/sourcer/source_pb2.py index 
cee05043..10fe18d7 100644 --- a/pynumaflow/proto/sourcer/source_pb2.py +++ b/pynumaflow/proto/sourcer/source_pb2.py @@ -7,6 +7,7 @@ from google.protobuf import descriptor_pool as _descriptor_pool from google.protobuf import symbol_database as _symbol_database from google.protobuf.internal import builder as _builder + # @@protoc_insertion_point(imports) _sym_db = _symbol_database.Default() @@ -16,41 +17,43 @@ from google.protobuf import empty_pb2 as google_dot_protobuf_dot_empty__pb2 -DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\x0csource.proto\x12\tsource.v1\x1a\x1fgoogle/protobuf/timestamp.proto\x1a\x1bgoogle/protobuf/empty.proto\"u\n\x0bReadRequest\x12/\n\x07request\x18\x01 \x01(\x0b\x32\x1e.source.v1.ReadRequest.Request\x1a\x35\n\x07Request\x12\x13\n\x0bnum_records\x18\x01 \x01(\x04\x12\x15\n\rtimeout_in_ms\x18\x02 \x01(\r\"\xba\x01\n\x0cReadResponse\x12.\n\x06result\x18\x01 \x01(\x0b\x32\x1e.source.v1.ReadResponse.Result\x1az\n\x06Result\x12\x0f\n\x07payload\x18\x01 \x01(\x0c\x12!\n\x06offset\x18\x02 \x01(\x0b\x32\x11.source.v1.Offset\x12.\n\nevent_time\x18\x03 \x01(\x0b\x32\x1a.google.protobuf.Timestamp\x12\x0c\n\x04keys\x18\x04 \x03(\t\"k\n\nAckRequest\x12.\n\x07request\x18\x01 \x01(\x0b\x32\x1d.source.v1.AckRequest.Request\x1a-\n\x07Request\x12\"\n\x07offsets\x18\x01 \x03(\x0b\x32\x11.source.v1.Offset\"o\n\x0b\x41\x63kResponse\x12-\n\x06result\x18\x01 \x01(\x0b\x32\x1d.source.v1.AckResponse.Result\x1a\x31\n\x06Result\x12\'\n\x07success\x18\x01 \x01(\x0b\x32\x16.google.protobuf.Empty\"\x1e\n\rReadyResponse\x12\r\n\x05ready\x18\x01 \x01(\x08\"]\n\x0fPendingResponse\x12\x31\n\x06result\x18\x01 \x01(\x0b\x32!.source.v1.PendingResponse.Result\x1a\x17\n\x06Result\x12\r\n\x05\x63ount\x18\x01 \x01(\x03\"h\n\x12PartitionsResponse\x12\x34\n\x06result\x18\x01 \x01(\x0b\x32$.source.v1.PartitionsResponse.Result\x1a\x1c\n\x06Result\x12\x12\n\npartitions\x18\x01 \x03(\x05\".\n\x06Offset\x12\x0e\n\x06offset\x18\x01 \x01(\x0c\x12\x14\n\x0cpartition_id\x18\x02 \x01(\x05\x32\xc2\x02\n\x06Source\x12;\n\x06ReadFn\x12\x16.source.v1.ReadRequest\x1a\x17.source.v1.ReadResponse0\x01\x12\x36\n\x05\x41\x63kFn\x12\x15.source.v1.AckRequest\x1a\x16.source.v1.AckResponse\x12?\n\tPendingFn\x12\x16.google.protobuf.Empty\x1a\x1a.source.v1.PendingResponse\x12\x45\n\x0cPartitionsFn\x12\x16.google.protobuf.Empty\x1a\x1d.source.v1.PartitionsResponse\x12;\n\x07IsReady\x12\x16.google.protobuf.Empty\x1a\x18.source.v1.ReadyResponseb\x06proto3') +DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile( + b'\n\x0csource.proto\x12\tsource.v1\x1a\x1fgoogle/protobuf/timestamp.proto\x1a\x1bgoogle/protobuf/empty.proto"u\n\x0bReadRequest\x12/\n\x07request\x18\x01 \x01(\x0b\x32\x1e.source.v1.ReadRequest.Request\x1a\x35\n\x07Request\x12\x13\n\x0bnum_records\x18\x01 \x01(\x04\x12\x15\n\rtimeout_in_ms\x18\x02 \x01(\r"\xba\x01\n\x0cReadResponse\x12.\n\x06result\x18\x01 \x01(\x0b\x32\x1e.source.v1.ReadResponse.Result\x1az\n\x06Result\x12\x0f\n\x07payload\x18\x01 \x01(\x0c\x12!\n\x06offset\x18\x02 \x01(\x0b\x32\x11.source.v1.Offset\x12.\n\nevent_time\x18\x03 \x01(\x0b\x32\x1a.google.protobuf.Timestamp\x12\x0c\n\x04keys\x18\x04 \x03(\t"k\n\nAckRequest\x12.\n\x07request\x18\x01 \x01(\x0b\x32\x1d.source.v1.AckRequest.Request\x1a-\n\x07Request\x12"\n\x07offsets\x18\x01 \x03(\x0b\x32\x11.source.v1.Offset"o\n\x0b\x41\x63kResponse\x12-\n\x06result\x18\x01 \x01(\x0b\x32\x1d.source.v1.AckResponse.Result\x1a\x31\n\x06Result\x12\'\n\x07success\x18\x01 
\x01(\x0b\x32\x16.google.protobuf.Empty"\x1e\n\rReadyResponse\x12\r\n\x05ready\x18\x01 \x01(\x08"]\n\x0fPendingResponse\x12\x31\n\x06result\x18\x01 \x01(\x0b\x32!.source.v1.PendingResponse.Result\x1a\x17\n\x06Result\x12\r\n\x05\x63ount\x18\x01 \x01(\x03"h\n\x12PartitionsResponse\x12\x34\n\x06result\x18\x01 \x01(\x0b\x32$.source.v1.PartitionsResponse.Result\x1a\x1c\n\x06Result\x12\x12\n\npartitions\x18\x01 \x03(\x05".\n\x06Offset\x12\x0e\n\x06offset\x18\x01 \x01(\x0c\x12\x14\n\x0cpartition_id\x18\x02 \x01(\x05\x32\xc2\x02\n\x06Source\x12;\n\x06ReadFn\x12\x16.source.v1.ReadRequest\x1a\x17.source.v1.ReadResponse0\x01\x12\x36\n\x05\x41\x63kFn\x12\x15.source.v1.AckRequest\x1a\x16.source.v1.AckResponse\x12?\n\tPendingFn\x12\x16.google.protobuf.Empty\x1a\x1a.source.v1.PendingResponse\x12\x45\n\x0cPartitionsFn\x12\x16.google.protobuf.Empty\x1a\x1d.source.v1.PartitionsResponse\x12;\n\x07IsReady\x12\x16.google.protobuf.Empty\x1a\x18.source.v1.ReadyResponseb\x06proto3' +) _globals = globals() _builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, _globals) -_builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, 'source_pb2', _globals) +_builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, "source_pb2", _globals) if _descriptor._USE_C_DESCRIPTORS == False: - DESCRIPTOR._options = None - _globals['_READREQUEST']._serialized_start=89 - _globals['_READREQUEST']._serialized_end=206 - _globals['_READREQUEST_REQUEST']._serialized_start=153 - _globals['_READREQUEST_REQUEST']._serialized_end=206 - _globals['_READRESPONSE']._serialized_start=209 - _globals['_READRESPONSE']._serialized_end=395 - _globals['_READRESPONSE_RESULT']._serialized_start=273 - _globals['_READRESPONSE_RESULT']._serialized_end=395 - _globals['_ACKREQUEST']._serialized_start=397 - _globals['_ACKREQUEST']._serialized_end=504 - _globals['_ACKREQUEST_REQUEST']._serialized_start=459 - _globals['_ACKREQUEST_REQUEST']._serialized_end=504 - _globals['_ACKRESPONSE']._serialized_start=506 - _globals['_ACKRESPONSE']._serialized_end=617 - _globals['_ACKRESPONSE_RESULT']._serialized_start=568 - _globals['_ACKRESPONSE_RESULT']._serialized_end=617 - _globals['_READYRESPONSE']._serialized_start=619 - _globals['_READYRESPONSE']._serialized_end=649 - _globals['_PENDINGRESPONSE']._serialized_start=651 - _globals['_PENDINGRESPONSE']._serialized_end=744 - _globals['_PENDINGRESPONSE_RESULT']._serialized_start=721 - _globals['_PENDINGRESPONSE_RESULT']._serialized_end=744 - _globals['_PARTITIONSRESPONSE']._serialized_start=746 - _globals['_PARTITIONSRESPONSE']._serialized_end=850 - _globals['_PARTITIONSRESPONSE_RESULT']._serialized_start=822 - _globals['_PARTITIONSRESPONSE_RESULT']._serialized_end=850 - _globals['_OFFSET']._serialized_start=852 - _globals['_OFFSET']._serialized_end=898 - _globals['_SOURCE']._serialized_start=901 - _globals['_SOURCE']._serialized_end=1223 + DESCRIPTOR._options = None + _globals["_READREQUEST"]._serialized_start = 89 + _globals["_READREQUEST"]._serialized_end = 206 + _globals["_READREQUEST_REQUEST"]._serialized_start = 153 + _globals["_READREQUEST_REQUEST"]._serialized_end = 206 + _globals["_READRESPONSE"]._serialized_start = 209 + _globals["_READRESPONSE"]._serialized_end = 395 + _globals["_READRESPONSE_RESULT"]._serialized_start = 273 + _globals["_READRESPONSE_RESULT"]._serialized_end = 395 + _globals["_ACKREQUEST"]._serialized_start = 397 + _globals["_ACKREQUEST"]._serialized_end = 504 + _globals["_ACKREQUEST_REQUEST"]._serialized_start = 459 + _globals["_ACKREQUEST_REQUEST"]._serialized_end = 504 + 
_globals["_ACKRESPONSE"]._serialized_start = 506 + _globals["_ACKRESPONSE"]._serialized_end = 617 + _globals["_ACKRESPONSE_RESULT"]._serialized_start = 568 + _globals["_ACKRESPONSE_RESULT"]._serialized_end = 617 + _globals["_READYRESPONSE"]._serialized_start = 619 + _globals["_READYRESPONSE"]._serialized_end = 649 + _globals["_PENDINGRESPONSE"]._serialized_start = 651 + _globals["_PENDINGRESPONSE"]._serialized_end = 744 + _globals["_PENDINGRESPONSE_RESULT"]._serialized_start = 721 + _globals["_PENDINGRESPONSE_RESULT"]._serialized_end = 744 + _globals["_PARTITIONSRESPONSE"]._serialized_start = 746 + _globals["_PARTITIONSRESPONSE"]._serialized_end = 850 + _globals["_PARTITIONSRESPONSE_RESULT"]._serialized_start = 822 + _globals["_PARTITIONSRESPONSE_RESULT"]._serialized_end = 850 + _globals["_OFFSET"]._serialized_start = 852 + _globals["_OFFSET"]._serialized_end = 898 + _globals["_SOURCE"]._serialized_start = 901 + _globals["_SOURCE"]._serialized_end = 1223 # @@protoc_insertion_point(module_scope) diff --git a/pynumaflow/proto/sourcer/source_pb2_grpc.py b/pynumaflow/proto/sourcer/source_pb2_grpc.py index f67127c5..3a132eea 100644 --- a/pynumaflow/proto/sourcer/source_pb2_grpc.py +++ b/pynumaflow/proto/sourcer/source_pb2_grpc.py @@ -16,30 +16,30 @@ def __init__(self, channel): channel: A grpc.Channel. """ self.ReadFn = channel.unary_stream( - '/source.v1.Source/ReadFn', - request_serializer=source__pb2.ReadRequest.SerializeToString, - response_deserializer=source__pb2.ReadResponse.FromString, - ) + "/source.v1.Source/ReadFn", + request_serializer=source__pb2.ReadRequest.SerializeToString, + response_deserializer=source__pb2.ReadResponse.FromString, + ) self.AckFn = channel.unary_unary( - '/source.v1.Source/AckFn', - request_serializer=source__pb2.AckRequest.SerializeToString, - response_deserializer=source__pb2.AckResponse.FromString, - ) + "/source.v1.Source/AckFn", + request_serializer=source__pb2.AckRequest.SerializeToString, + response_deserializer=source__pb2.AckResponse.FromString, + ) self.PendingFn = channel.unary_unary( - '/source.v1.Source/PendingFn', - request_serializer=google_dot_protobuf_dot_empty__pb2.Empty.SerializeToString, - response_deserializer=source__pb2.PendingResponse.FromString, - ) + "/source.v1.Source/PendingFn", + request_serializer=google_dot_protobuf_dot_empty__pb2.Empty.SerializeToString, + response_deserializer=source__pb2.PendingResponse.FromString, + ) self.PartitionsFn = channel.unary_unary( - '/source.v1.Source/PartitionsFn', - request_serializer=google_dot_protobuf_dot_empty__pb2.Empty.SerializeToString, - response_deserializer=source__pb2.PartitionsResponse.FromString, - ) + "/source.v1.Source/PartitionsFn", + request_serializer=google_dot_protobuf_dot_empty__pb2.Empty.SerializeToString, + response_deserializer=source__pb2.PartitionsResponse.FromString, + ) self.IsReady = channel.unary_unary( - '/source.v1.Source/IsReady', - request_serializer=google_dot_protobuf_dot_empty__pb2.Empty.SerializeToString, - response_deserializer=source__pb2.ReadyResponse.FromString, - ) + "/source.v1.Source/IsReady", + request_serializer=google_dot_protobuf_dot_empty__pb2.Empty.SerializeToString, + response_deserializer=source__pb2.ReadyResponse.FromString, + ) class SourceServicer(object): @@ -51,160 +51,216 @@ def ReadFn(self, request, context): If the request timeout is reached on server side, the returned ReadResponse will contain all the datum that have been read (which could be an empty list). 
""" context.set_code(grpc.StatusCode.UNIMPLEMENTED) - context.set_details('Method not implemented!') - raise NotImplementedError('Method not implemented!') + context.set_details("Method not implemented!") + raise NotImplementedError("Method not implemented!") def AckFn(self, request, context): """AckFn acknowledges a list of datum offsets. When AckFn is called, it implicitly indicates that the datum stream has been processed by the source vertex. The caller (numa) expects the AckFn to be successful, and it does not expect any errors. If there are some irrecoverable errors when the callee (UDSource) is processing the AckFn request, - then it is best to crash because there are no other retry mechanisms possible. + then it is best to crash because there are no other retry mechanisms possible. """ context.set_code(grpc.StatusCode.UNIMPLEMENTED) - context.set_details('Method not implemented!') - raise NotImplementedError('Method not implemented!') + context.set_details("Method not implemented!") + raise NotImplementedError("Method not implemented!") def PendingFn(self, request, context): - """PendingFn returns the number of pending records at the user defined source. - """ + """PendingFn returns the number of pending records at the user defined source.""" context.set_code(grpc.StatusCode.UNIMPLEMENTED) - context.set_details('Method not implemented!') - raise NotImplementedError('Method not implemented!') + context.set_details("Method not implemented!") + raise NotImplementedError("Method not implemented!") def PartitionsFn(self, request, context): - """PartitionsFn returns the list of partitions for the user defined source. - """ + """PartitionsFn returns the list of partitions for the user defined source.""" context.set_code(grpc.StatusCode.UNIMPLEMENTED) - context.set_details('Method not implemented!') - raise NotImplementedError('Method not implemented!') + context.set_details("Method not implemented!") + raise NotImplementedError("Method not implemented!") def IsReady(self, request, context): - """IsReady is the heartbeat endpoint for user defined source gRPC. 
- """ + """IsReady is the heartbeat endpoint for user defined source gRPC.""" context.set_code(grpc.StatusCode.UNIMPLEMENTED) - context.set_details('Method not implemented!') - raise NotImplementedError('Method not implemented!') + context.set_details("Method not implemented!") + raise NotImplementedError("Method not implemented!") def add_SourceServicer_to_server(servicer, server): rpc_method_handlers = { - 'ReadFn': grpc.unary_stream_rpc_method_handler( - servicer.ReadFn, - request_deserializer=source__pb2.ReadRequest.FromString, - response_serializer=source__pb2.ReadResponse.SerializeToString, - ), - 'AckFn': grpc.unary_unary_rpc_method_handler( - servicer.AckFn, - request_deserializer=source__pb2.AckRequest.FromString, - response_serializer=source__pb2.AckResponse.SerializeToString, - ), - 'PendingFn': grpc.unary_unary_rpc_method_handler( - servicer.PendingFn, - request_deserializer=google_dot_protobuf_dot_empty__pb2.Empty.FromString, - response_serializer=source__pb2.PendingResponse.SerializeToString, - ), - 'PartitionsFn': grpc.unary_unary_rpc_method_handler( - servicer.PartitionsFn, - request_deserializer=google_dot_protobuf_dot_empty__pb2.Empty.FromString, - response_serializer=source__pb2.PartitionsResponse.SerializeToString, - ), - 'IsReady': grpc.unary_unary_rpc_method_handler( - servicer.IsReady, - request_deserializer=google_dot_protobuf_dot_empty__pb2.Empty.FromString, - response_serializer=source__pb2.ReadyResponse.SerializeToString, - ), + "ReadFn": grpc.unary_stream_rpc_method_handler( + servicer.ReadFn, + request_deserializer=source__pb2.ReadRequest.FromString, + response_serializer=source__pb2.ReadResponse.SerializeToString, + ), + "AckFn": grpc.unary_unary_rpc_method_handler( + servicer.AckFn, + request_deserializer=source__pb2.AckRequest.FromString, + response_serializer=source__pb2.AckResponse.SerializeToString, + ), + "PendingFn": grpc.unary_unary_rpc_method_handler( + servicer.PendingFn, + request_deserializer=google_dot_protobuf_dot_empty__pb2.Empty.FromString, + response_serializer=source__pb2.PendingResponse.SerializeToString, + ), + "PartitionsFn": grpc.unary_unary_rpc_method_handler( + servicer.PartitionsFn, + request_deserializer=google_dot_protobuf_dot_empty__pb2.Empty.FromString, + response_serializer=source__pb2.PartitionsResponse.SerializeToString, + ), + "IsReady": grpc.unary_unary_rpc_method_handler( + servicer.IsReady, + request_deserializer=google_dot_protobuf_dot_empty__pb2.Empty.FromString, + response_serializer=source__pb2.ReadyResponse.SerializeToString, + ), } - generic_handler = grpc.method_handlers_generic_handler( - 'source.v1.Source', rpc_method_handlers) + generic_handler = grpc.method_handlers_generic_handler("source.v1.Source", rpc_method_handlers) server.add_generic_rpc_handlers((generic_handler,)) - # This class is part of an EXPERIMENTAL API. +# This class is part of an EXPERIMENTAL API. 
class Source(object): """Missing associated documentation comment in .proto file.""" @staticmethod - def ReadFn(request, + def ReadFn( + request, + target, + options=(), + channel_credentials=None, + call_credentials=None, + insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None, + ): + return grpc.experimental.unary_stream( + request, target, - options=(), - channel_credentials=None, - call_credentials=None, - insecure=False, - compression=None, - wait_for_ready=None, - timeout=None, - metadata=None): - return grpc.experimental.unary_stream(request, target, '/source.v1.Source/ReadFn', + "/source.v1.Source/ReadFn", source__pb2.ReadRequest.SerializeToString, source__pb2.ReadResponse.FromString, - options, channel_credentials, - insecure, call_credentials, compression, wait_for_ready, timeout, metadata) + options, + channel_credentials, + insecure, + call_credentials, + compression, + wait_for_ready, + timeout, + metadata, + ) @staticmethod - def AckFn(request, + def AckFn( + request, + target, + options=(), + channel_credentials=None, + call_credentials=None, + insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None, + ): + return grpc.experimental.unary_unary( + request, target, - options=(), - channel_credentials=None, - call_credentials=None, - insecure=False, - compression=None, - wait_for_ready=None, - timeout=None, - metadata=None): - return grpc.experimental.unary_unary(request, target, '/source.v1.Source/AckFn', + "/source.v1.Source/AckFn", source__pb2.AckRequest.SerializeToString, source__pb2.AckResponse.FromString, - options, channel_credentials, - insecure, call_credentials, compression, wait_for_ready, timeout, metadata) + options, + channel_credentials, + insecure, + call_credentials, + compression, + wait_for_ready, + timeout, + metadata, + ) @staticmethod - def PendingFn(request, + def PendingFn( + request, + target, + options=(), + channel_credentials=None, + call_credentials=None, + insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None, + ): + return grpc.experimental.unary_unary( + request, target, - options=(), - channel_credentials=None, - call_credentials=None, - insecure=False, - compression=None, - wait_for_ready=None, - timeout=None, - metadata=None): - return grpc.experimental.unary_unary(request, target, '/source.v1.Source/PendingFn', + "/source.v1.Source/PendingFn", google_dot_protobuf_dot_empty__pb2.Empty.SerializeToString, source__pb2.PendingResponse.FromString, - options, channel_credentials, - insecure, call_credentials, compression, wait_for_ready, timeout, metadata) + options, + channel_credentials, + insecure, + call_credentials, + compression, + wait_for_ready, + timeout, + metadata, + ) @staticmethod - def PartitionsFn(request, + def PartitionsFn( + request, + target, + options=(), + channel_credentials=None, + call_credentials=None, + insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None, + ): + return grpc.experimental.unary_unary( + request, target, - options=(), - channel_credentials=None, - call_credentials=None, - insecure=False, - compression=None, - wait_for_ready=None, - timeout=None, - metadata=None): - return grpc.experimental.unary_unary(request, target, '/source.v1.Source/PartitionsFn', + "/source.v1.Source/PartitionsFn", google_dot_protobuf_dot_empty__pb2.Empty.SerializeToString, source__pb2.PartitionsResponse.FromString, - options, channel_credentials, - insecure, call_credentials, compression, 
wait_for_ready, timeout, metadata) + options, + channel_credentials, + insecure, + call_credentials, + compression, + wait_for_ready, + timeout, + metadata, + ) @staticmethod - def IsReady(request, + def IsReady( + request, + target, + options=(), + channel_credentials=None, + call_credentials=None, + insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None, + ): + return grpc.experimental.unary_unary( + request, target, - options=(), - channel_credentials=None, - call_credentials=None, - insecure=False, - compression=None, - wait_for_ready=None, - timeout=None, - metadata=None): - return grpc.experimental.unary_unary(request, target, '/source.v1.Source/IsReady', + "/source.v1.Source/IsReady", google_dot_protobuf_dot_empty__pb2.Empty.SerializeToString, source__pb2.ReadyResponse.FromString, - options, channel_credentials, - insecure, call_credentials, compression, wait_for_ready, timeout, metadata) + options, + channel_credentials, + insecure, + call_credentials, + compression, + wait_for_ready, + timeout, + metadata, + ) diff --git a/pynumaflow/proto/sourcetransformer/transform_pb2.py b/pynumaflow/proto/sourcetransformer/transform_pb2.py index 31c0da87..2f96e5fb 100644 --- a/pynumaflow/proto/sourcetransformer/transform_pb2.py +++ b/pynumaflow/proto/sourcetransformer/transform_pb2.py @@ -7,6 +7,7 @@ from google.protobuf import descriptor_pool as _descriptor_pool from google.protobuf import symbol_database as _symbol_database from google.protobuf.internal import builder as _builder + # @@protoc_insertion_point(imports) _sym_db = _symbol_database.Default() @@ -16,21 +17,23 @@ from google.protobuf import empty_pb2 as google_dot_protobuf_dot_empty__pb2 -DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\x0ftransform.proto\x12\x14sourcetransformer.v1\x1a\x1fgoogle/protobuf/timestamp.proto\x1a\x1bgoogle/protobuf/empty.proto\"\x94\x01\n\x16SourceTransformRequest\x12\x0c\n\x04keys\x18\x01 \x03(\t\x12\r\n\x05value\x18\x02 \x01(\x0c\x12.\n\nevent_time\x18\x03 \x01(\x0b\x32\x1a.google.protobuf.Timestamp\x12-\n\twatermark\x18\x04 \x01(\x0b\x32\x1a.google.protobuf.Timestamp\"\xc5\x01\n\x17SourceTransformResponse\x12\x45\n\x07results\x18\x01 \x03(\x0b\x32\x34.sourcetransformer.v1.SourceTransformResponse.Result\x1a\x63\n\x06Result\x12\x0c\n\x04keys\x18\x01 \x03(\t\x12\r\n\x05value\x18\x02 \x01(\x0c\x12.\n\nevent_time\x18\x03 \x01(\x0b\x32\x1a.google.protobuf.Timestamp\x12\x0c\n\x04tags\x18\x04 \x03(\t\"\x1e\n\rReadyResponse\x12\r\n\x05ready\x18\x01 \x01(\x08\x32\xcb\x01\n\x0fSourceTransform\x12p\n\x11SourceTransformFn\x12,.sourcetransformer.v1.SourceTransformRequest\x1a-.sourcetransformer.v1.SourceTransformResponse\x12\x46\n\x07IsReady\x12\x16.google.protobuf.Empty\x1a#.sourcetransformer.v1.ReadyResponseb\x06proto3') +DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile( + b'\n\x0ftransform.proto\x12\x14sourcetransformer.v1\x1a\x1fgoogle/protobuf/timestamp.proto\x1a\x1bgoogle/protobuf/empty.proto"\x94\x01\n\x16SourceTransformRequest\x12\x0c\n\x04keys\x18\x01 \x03(\t\x12\r\n\x05value\x18\x02 \x01(\x0c\x12.\n\nevent_time\x18\x03 \x01(\x0b\x32\x1a.google.protobuf.Timestamp\x12-\n\twatermark\x18\x04 \x01(\x0b\x32\x1a.google.protobuf.Timestamp"\xc5\x01\n\x17SourceTransformResponse\x12\x45\n\x07results\x18\x01 \x03(\x0b\x32\x34.sourcetransformer.v1.SourceTransformResponse.Result\x1a\x63\n\x06Result\x12\x0c\n\x04keys\x18\x01 \x03(\t\x12\r\n\x05value\x18\x02 \x01(\x0c\x12.\n\nevent_time\x18\x03 \x01(\x0b\x32\x1a.google.protobuf.Timestamp\x12\x0c\n\x04tags\x18\x04 
\x03(\t"\x1e\n\rReadyResponse\x12\r\n\x05ready\x18\x01 \x01(\x08\x32\xcb\x01\n\x0fSourceTransform\x12p\n\x11SourceTransformFn\x12,.sourcetransformer.v1.SourceTransformRequest\x1a-.sourcetransformer.v1.SourceTransformResponse\x12\x46\n\x07IsReady\x12\x16.google.protobuf.Empty\x1a#.sourcetransformer.v1.ReadyResponseb\x06proto3' +) _globals = globals() _builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, _globals) -_builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, 'transform_pb2', _globals) +_builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, "transform_pb2", _globals) if _descriptor._USE_C_DESCRIPTORS == False: - DESCRIPTOR._options = None - _globals['_SOURCETRANSFORMREQUEST']._serialized_start=104 - _globals['_SOURCETRANSFORMREQUEST']._serialized_end=252 - _globals['_SOURCETRANSFORMRESPONSE']._serialized_start=255 - _globals['_SOURCETRANSFORMRESPONSE']._serialized_end=452 - _globals['_SOURCETRANSFORMRESPONSE_RESULT']._serialized_start=353 - _globals['_SOURCETRANSFORMRESPONSE_RESULT']._serialized_end=452 - _globals['_READYRESPONSE']._serialized_start=454 - _globals['_READYRESPONSE']._serialized_end=484 - _globals['_SOURCETRANSFORM']._serialized_start=487 - _globals['_SOURCETRANSFORM']._serialized_end=690 + DESCRIPTOR._options = None + _globals["_SOURCETRANSFORMREQUEST"]._serialized_start = 104 + _globals["_SOURCETRANSFORMREQUEST"]._serialized_end = 252 + _globals["_SOURCETRANSFORMRESPONSE"]._serialized_start = 255 + _globals["_SOURCETRANSFORMRESPONSE"]._serialized_end = 452 + _globals["_SOURCETRANSFORMRESPONSE_RESULT"]._serialized_start = 353 + _globals["_SOURCETRANSFORMRESPONSE_RESULT"]._serialized_end = 452 + _globals["_READYRESPONSE"]._serialized_start = 454 + _globals["_READYRESPONSE"]._serialized_end = 484 + _globals["_SOURCETRANSFORM"]._serialized_start = 487 + _globals["_SOURCETRANSFORM"]._serialized_end = 690 # @@protoc_insertion_point(module_scope) diff --git a/pynumaflow/proto/sourcetransformer/transform_pb2_grpc.py b/pynumaflow/proto/sourcetransformer/transform_pb2_grpc.py index 7ec9346c..2e67a11b 100644 --- a/pynumaflow/proto/sourcetransformer/transform_pb2_grpc.py +++ b/pynumaflow/proto/sourcetransformer/transform_pb2_grpc.py @@ -16,15 +16,15 @@ def __init__(self, channel): channel: A grpc.Channel. """ self.SourceTransformFn = channel.unary_unary( - '/sourcetransformer.v1.SourceTransform/SourceTransformFn', - request_serializer=transform__pb2.SourceTransformRequest.SerializeToString, - response_deserializer=transform__pb2.SourceTransformResponse.FromString, - ) + "/sourcetransformer.v1.SourceTransform/SourceTransformFn", + request_serializer=transform__pb2.SourceTransformRequest.SerializeToString, + response_deserializer=transform__pb2.SourceTransformResponse.FromString, + ) self.IsReady = channel.unary_unary( - '/sourcetransformer.v1.SourceTransform/IsReady', - request_serializer=google_dot_protobuf_dot_empty__pb2.Empty.SerializeToString, - response_deserializer=transform__pb2.ReadyResponse.FromString, - ) + "/sourcetransformer.v1.SourceTransform/IsReady", + request_serializer=google_dot_protobuf_dot_empty__pb2.Empty.SerializeToString, + response_deserializer=transform__pb2.ReadyResponse.FromString, + ) class SourceTransformServicer(object): @@ -36,69 +36,93 @@ def SourceTransformFn(self, request, context): SourceTransformFn can be used only at source vertex by source data transformer. 
""" context.set_code(grpc.StatusCode.UNIMPLEMENTED) - context.set_details('Method not implemented!') - raise NotImplementedError('Method not implemented!') + context.set_details("Method not implemented!") + raise NotImplementedError("Method not implemented!") def IsReady(self, request, context): - """IsReady is the heartbeat endpoint for gRPC. - """ + """IsReady is the heartbeat endpoint for gRPC.""" context.set_code(grpc.StatusCode.UNIMPLEMENTED) - context.set_details('Method not implemented!') - raise NotImplementedError('Method not implemented!') + context.set_details("Method not implemented!") + raise NotImplementedError("Method not implemented!") def add_SourceTransformServicer_to_server(servicer, server): rpc_method_handlers = { - 'SourceTransformFn': grpc.unary_unary_rpc_method_handler( - servicer.SourceTransformFn, - request_deserializer=transform__pb2.SourceTransformRequest.FromString, - response_serializer=transform__pb2.SourceTransformResponse.SerializeToString, - ), - 'IsReady': grpc.unary_unary_rpc_method_handler( - servicer.IsReady, - request_deserializer=google_dot_protobuf_dot_empty__pb2.Empty.FromString, - response_serializer=transform__pb2.ReadyResponse.SerializeToString, - ), + "SourceTransformFn": grpc.unary_unary_rpc_method_handler( + servicer.SourceTransformFn, + request_deserializer=transform__pb2.SourceTransformRequest.FromString, + response_serializer=transform__pb2.SourceTransformResponse.SerializeToString, + ), + "IsReady": grpc.unary_unary_rpc_method_handler( + servicer.IsReady, + request_deserializer=google_dot_protobuf_dot_empty__pb2.Empty.FromString, + response_serializer=transform__pb2.ReadyResponse.SerializeToString, + ), } generic_handler = grpc.method_handlers_generic_handler( - 'sourcetransformer.v1.SourceTransform', rpc_method_handlers) + "sourcetransformer.v1.SourceTransform", rpc_method_handlers + ) server.add_generic_rpc_handlers((generic_handler,)) - # This class is part of an EXPERIMENTAL API. +# This class is part of an EXPERIMENTAL API. 
class SourceTransform(object): """Missing associated documentation comment in .proto file.""" @staticmethod - def SourceTransformFn(request, + def SourceTransformFn( + request, + target, + options=(), + channel_credentials=None, + call_credentials=None, + insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None, + ): + return grpc.experimental.unary_unary( + request, target, - options=(), - channel_credentials=None, - call_credentials=None, - insecure=False, - compression=None, - wait_for_ready=None, - timeout=None, - metadata=None): - return grpc.experimental.unary_unary(request, target, '/sourcetransformer.v1.SourceTransform/SourceTransformFn', + "/sourcetransformer.v1.SourceTransform/SourceTransformFn", transform__pb2.SourceTransformRequest.SerializeToString, transform__pb2.SourceTransformResponse.FromString, - options, channel_credentials, - insecure, call_credentials, compression, wait_for_ready, timeout, metadata) + options, + channel_credentials, + insecure, + call_credentials, + compression, + wait_for_ready, + timeout, + metadata, + ) @staticmethod - def IsReady(request, + def IsReady( + request, + target, + options=(), + channel_credentials=None, + call_credentials=None, + insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None, + ): + return grpc.experimental.unary_unary( + request, target, - options=(), - channel_credentials=None, - call_credentials=None, - insecure=False, - compression=None, - wait_for_ready=None, - timeout=None, - metadata=None): - return grpc.experimental.unary_unary(request, target, '/sourcetransformer.v1.SourceTransform/IsReady', + "/sourcetransformer.v1.SourceTransform/IsReady", google_dot_protobuf_dot_empty__pb2.Empty.SerializeToString, transform__pb2.ReadyResponse.FromString, - options, channel_credentials, - insecure, call_credentials, compression, wait_for_ready, timeout, metadata) + options, + channel_credentials, + insecure, + call_credentials, + compression, + wait_for_ready, + timeout, + metadata, + ) From ebecebfe6db5f610f7397aceb125120da3aa3389 Mon Sep 17 00:00:00 2001 From: Sidhant Kohli Date: Thu, 11 Jan 2024 00:15:33 -0800 Subject: [PATCH 48/78] lint Signed-off-by: Sidhant Kohli --- .coveragerc | 12 ++++++++---- pynumaflow/shared/server.py | 18 +++++++++++++++--- 2 files changed, 23 insertions(+), 7 deletions(-) diff --git a/.coveragerc b/.coveragerc index 95134092..c3dc24a6 100644 --- a/.coveragerc +++ b/.coveragerc @@ -5,14 +5,18 @@ source = pynumaflow omit = pynumaflow/tests/* examples/* + pynumaflow/proto/* + pynumaflow/shared/server.py [report] exclude_lines = - def start - def start_async - def __serve_async - def start_multiproc + def sync_server_start def _run_server + def start_multiproc_server + async def start_async_server def _reserve_port if os.getenv("PYTHONDEBUG"): _LOGGER.setLevel(logging.DEBUG) + def exec_multiproc + def exec + async def aexec diff --git a/pynumaflow/shared/server.py b/pynumaflow/shared/server.py index dce4fdf4..232dc229 100644 --- a/pynumaflow/shared/server.py +++ b/pynumaflow/shared/server.py @@ -54,7 +54,12 @@ def write_info_file(protocol: Protocol, info_file=SERVER_INFO_FILE_PATH) -> None def sync_server_start( - servicer, bind_address: str, max_threads: int, server_options=None, udf_type: str = UDFType.Map + servicer, + bind_address: str, + max_threads: int, + server_options=None, + udf_type: str = UDFType.Map, + server_info_file=SERVER_INFO_FILE_PATH, ): """ Utility function to start a sync grpc server instance. 
@@ -74,11 +79,18 @@ def sync_server_start( server_options=server_options, udf_type=udf_type, server_info=server_info, + server_info_file=server_info_file, ) def _run_server( - servicer, bind_address: str, threads_per_proc, server_options, udf_type: str, server_info=None + servicer, + bind_address: str, + threads_per_proc, + server_options, + udf_type: str, + server_info=None, + server_info_file=SERVER_INFO_FILE_PATH, ) -> None: """ Starts the Synchronous server instance on the given UNIX socket @@ -108,7 +120,7 @@ def _run_server( # Add the server information to the server info file if provided if server_info: - info_server_write(server_info=server_info, info_file=SERVER_INFO_FILE_PATH) + info_server_write(server_info=server_info, info_file=server_info_file) _LOGGER.info("GRPC Server listening on: %s %d", bind_address, os.getpid()) server.wait_for_termination() From b11d086937b112648965f9c64fbc0cebe63b8667 Mon Sep 17 00:00:00 2001 From: Sidhant Kohli Date: Thu, 11 Jan 2024 01:16:35 -0800 Subject: [PATCH 49/78] README Signed-off-by: Sidhant Kohli --- README.md | 114 ++++++++++++++++++++++++++++------ tests/map/test_sync_mapper.py | 5 +- tests/map/utils.py | 16 ++++- 3 files changed, 114 insertions(+), 21 deletions(-) diff --git a/README.md b/README.md index f9d99129..f41252fd 100644 --- a/README.md +++ b/README.md @@ -46,18 +46,25 @@ pre-commit install ### Map ```python -from pynumaflow.mapper import Messages, Message, Datum, Mapper +from pynumaflow.mapper import Messages, Message, Datum, MapServer -def my_handler(keys: list[str], datum: Datum) -> Messages: +def handler(keys: list[str], datum: Datum) -> Messages: val = datum.value _ = datum.event_time _ = datum.watermark - return Messages(Message(value=val, keys=keys)) + strs = val.decode("utf-8").split(",") + messages = Messages() + if len(strs) == 0: + messages.append(Message.to_drop()) + return messages + for s in strs: + messages.append(Message(str.encode(s))) + return messages if __name__ == "__main__": - grpc_server = Mapper(handler=my_handler) + grpc_server = MapServer(mapper_instance=handler) grpc_server.start() ``` ### SourceTransformer - Map with event time assignment capability @@ -66,7 +73,7 @@ SourceTransformer is only supported at source vertex to enable (a) early data fi ```python from datetime import datetime -from pynumaflow.sourcetransformer import Messages, Message, Datum, SourceTransformer +from pynumaflow.sourcetransformer import Messages, Message, Datum, SourceTransformServer def transform_handler(keys: list[str], datum: Datum) -> Messages: @@ -78,21 +85,18 @@ def transform_handler(keys: list[str], datum: Datum) -> Messages: if __name__ == "__main__": - grpc_server = SourceTransformer(handler=transform_handler) + grpc_server = SourceTransformServer(source_transform_instance=transform_handler) grpc_server.start() ``` ### Reduce ```python -import aiorun -from typing import Iterator, List -from pynumaflow.reducer import Messages, Message, Datum, Metadata, AsyncReducer +from typing import AsyncIterable +from pynumaflow.reducer import Messages, Message, Datum, Metadata, ReduceServer -async def my_handler( - keys: List[str], datums: Iterator[Datum], md: Metadata -) -> Messages: +async def reduce_handler(keys: list[str], datums: AsyncIterable[Datum], md: Metadata) -> Messages: interval_window = md.interval_window counter = 0 async for _ in datums: @@ -101,12 +105,12 @@ async def my_handler( f"counter:{counter} interval_window_start:{interval_window.start} " f"interval_window_end:{interval_window.end}" ) - return 
Messages(Message(str.encode(msg), keys))
+    return Messages(Message(str.encode(msg), keys=keys))
 
 
 if __name__ == "__main__":
-    grpc_server = AsyncReducer(handler=my_handler)
-    aiorun.run(grpc_server.start())
+    grpc_server = ReduceServer(reducer_instance=reduce_handler)
+    grpc_server.start()
 ```
 
 ### Sample Image
@@ -117,7 +121,7 @@ under [examples](examples/map/forward_message).
 
 ```python
 from typing import Iterator
-from pynumaflow.sinker import Datum, Responses, Response, Sinker
+from pynumaflow.sinker import Datum, Responses, Response, SinkServer
 
 
 def my_handler(datums: Iterator[Datum]) -> Responses:
@@ -129,11 +133,85 @@ def my_handler(datums: Iterator[Datum]) -> Responses:
 
 
 if __name__ == "__main__":
-    grpc_server = Sinker(my_handler)
+    grpc_server = SinkServer(sinker_instance=my_handler)
     grpc_server.start()
 ```
 
 ### Sample Image
 
 A sample UDSink [Dockerfile](examples/sink/log/Dockerfile) is provided
-under [examples](examples/sink/log).
\ No newline at end of file
+under [examples](examples/sink/log).
+
+## Class-based handlers
+
+We can also implement UDFs and UDSinks using class-based handlers.
+
+Class-based handlers are useful when we want to maintain state across multiple invocations of the handler.
+
+Here we can pass the class instance to the server and the server will invoke the handler methods on the instance.
+
+To use a class-based handler, the user needs to inherit the base class of the UDF/UDSink
+and implement the required methods in the class.
+
+For example, for a Mapper, the user needs to inherit the [MapperClass](pynumaflow/mapper/_dtypes.py#170) class and then implement the [handler](pynumaflow/mapper/_dtypes.py#170) method.
+
+### Map
+
+```python
+from pynumaflow.mapper import Messages, Message, Datum, MapServer, MapperClass
+
+class MyHandler(MapperClass):
+    def handler(self, keys: list[str], datum: Datum) -> Messages:
+        val = datum.value
+        _ = datum.event_time
+        _ = datum.watermark
+        strs = val.decode("utf-8").split(",")
+        messages = Messages()
+        if len(strs) == 0:
+            messages.append(Message.to_drop())
+            return messages
+        for s in strs:
+            messages.append(Message(str.encode(s)))
+        return messages
+
+if __name__ == "__main__":
+    class_instance = MyHandler()
+    grpc_server = MapServer(mapper_instance=class_instance)
+    grpc_server.start()
+```
+
+
+## Server Types
+
+Different server types are supported for the different kinds of UDFs and UDSinks.
+
+Each server type offers different functionality and targets different use cases.
+
+Currently, we support the following server types:
+1) SyncServer
+2) AsyncServer
+3) MultiProcessServer
+
+Not all of the above are supported for all UDFs and UDSinks.
+
+To use a server type, the user needs to pass the desired server type to the server constructor.
+
+The ```ServerType``` class can be imported from the package and used for this.
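+
+For example, the import used by the snippets below (a minimal sketch; this assumes `ServerType` is re-exported by the UDF package in use, as the mapper package's `__init__.py` does):
+
+```python
+from pynumaflow.mapper import MapServer, ServerType
+```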
+ + +### SyncServer +``` +grpc_server = MapServer(mapper_instance=handler, server_type=ServerType.Sync) +``` + +### AsyncServer +``` +grpc_server = MapServer(mapper_instance=handler, server_type=ServerType.Async) +``` + +### MultiProcessServer +``` +grpc_server = MapServer(mapper_instance=handler, server_type=ServerType.MultiProc) +``` + + diff --git a/tests/map/test_sync_mapper.py b/tests/map/test_sync_mapper.py index af046692..afe0b6a5 100644 --- a/tests/map/test_sync_mapper.py +++ b/tests/map/test_sync_mapper.py @@ -9,7 +9,7 @@ from pynumaflow.mapper import MapServer from pynumaflow.proto.mapper import map_pb2 -from tests.map.utils import map_handler, err_map_handler +from tests.map.utils import map_handler, err_map_handler, ExampleMap from tests.testing_utils import ( mock_event_time, mock_watermark, @@ -19,7 +19,8 @@ class TestSyncMapper(unittest.TestCase): def setUp(self) -> None: - my_server = MapServer(mapper_instance=map_handler) + class_instance = ExampleMap() + my_server = MapServer(mapper_instance=class_instance) my_servicer = my_server.get_servicer( mapper_instance=map_handler, server_type=ServerType.Sync ) diff --git a/tests/map/utils.py b/tests/map/utils.py index ef5d7c21..fb7a36be 100644 --- a/tests/map/utils.py +++ b/tests/map/utils.py @@ -1,10 +1,24 @@ -from pynumaflow.mapper import Datum, Messages, Message +from pynumaflow.mapper import Datum, Messages, Message, MapperClass async def async_map_error_fn(keys: list[str], datum: Datum) -> Messages: raise ValueError("error invoking map") +class ExampleMap(MapperClass): + def handler(self, keys: list[str], datum: Datum) -> Messages: + val = datum.value + msg = "payload:{} event_time:{} watermark:{}".format( + val.decode("utf-8"), + datum.event_time, + datum.watermark, + ) + val = bytes(msg, encoding="utf-8") + messages = Messages() + messages.append(Message(val, keys=keys)) + return messages + + def map_handler(keys: list[str], datum: Datum) -> Messages: val = datum.value msg = "payload:{} event_time:{} watermark:{}".format( From b2e7be31d59723a881b86b310df13e98d2e893b0 Mon Sep 17 00:00:00 2001 From: Sidhant Kohli Date: Thu, 11 Jan 2024 11:24:54 -0800 Subject: [PATCH 50/78] SideInput Signed-off-by: Sidhant Kohli --- .../sideinput/simple-sideinput/example.py | 4 +- .../sideinput/simple-sideinput/udf/example.py | 2 +- pynumaflow/_constants.py | 1 + pynumaflow/mapper/map.py | 4 + pynumaflow/shared/server.py | 34 +++++--- pynumaflow/sideinput/__init__.py | 7 +- pynumaflow/sideinput/_dtypes.py | 28 ++++++- pynumaflow/sideinput/server.py | 74 +---------------- pynumaflow/sideinput/sideinput.py | 83 +++++++++++++++++++ tests/sideinput/test_side_input_server.py | 28 ++++--- 10 files changed, 164 insertions(+), 101 deletions(-) create mode 100644 pynumaflow/sideinput/sideinput.py diff --git a/examples/sideinput/simple-sideinput/example.py b/examples/sideinput/simple-sideinput/example.py index 8cb73a62..cb7c18f9 100644 --- a/examples/sideinput/simple-sideinput/example.py +++ b/examples/sideinput/simple-sideinput/example.py @@ -1,5 +1,5 @@ import datetime -from pynumaflow.sideinput import Response, SideInput +from pynumaflow.sideinput import Response, SideInputServer counter = 0 @@ -22,5 +22,5 @@ def my_handler() -> Response: if __name__ == "__main__": - grpc_server = SideInput(handler=my_handler) + grpc_server = SideInputServer(side_input_instance=my_handler) grpc_server.start() diff --git a/examples/sideinput/simple-sideinput/udf/example.py b/examples/sideinput/simple-sideinput/udf/example.py index ba1f7fc9..e9eaa464 100644 --- 
a/examples/sideinput/simple-sideinput/udf/example.py +++ b/examples/sideinput/simple-sideinput/udf/example.py @@ -14,7 +14,7 @@ def watcher(): """ This function is used to watch the side input directory for changes. """ - path = sideinputsdk.SideInput.SIDE_INPUT_DIR_PATH + path = sideinputsdk.SIDE_INPUT_DIR_PATH for changes in watch(path): print(changes) diff --git a/pynumaflow/_constants.py b/pynumaflow/_constants.py index 0330d70b..883e8731 100644 --- a/pynumaflow/_constants.py +++ b/pynumaflow/_constants.py @@ -14,6 +14,7 @@ MULTIPROC_MAP_SOCK_ADDR = "0.0.0.0" SIDE_INPUT_SOCK_PATH = "/var/run/numaflow/sideinput.sock" SOURCE_SOCK_PATH = "/var/run/numaflow/source.sock" +SIDE_INPUT_DIR_PATH = "/var/numaflow/side-inputs" # TODO: need to make sure the DATUM_KEY value is the same as # https://github.com/numaproj/numaflow-go/blob/main/pkg/function/configs.go#L6 diff --git a/pynumaflow/mapper/map.py b/pynumaflow/mapper/map.py index 2d844bab..3013103c 100644 --- a/pynumaflow/mapper/map.py +++ b/pynumaflow/mapper/map.py @@ -81,6 +81,10 @@ def start(self) -> None: Starter function for the server class, Handles the server type and starts the server accordingly. If the server type is not supported, raises NotImplementedError. + Currently supported server types are: + - ServerType.Sync: Synchronous server + - ServerType.Async: Asynchronous server + - ServerType.Multiproc: Multiprocess server """ if self.server_type == ServerType.Sync: self.exec() diff --git a/pynumaflow/shared/server.py b/pynumaflow/shared/server.py index 232dc229..853cd4da 100644 --- a/pynumaflow/shared/server.py +++ b/pynumaflow/shared/server.py @@ -2,7 +2,7 @@ import multiprocessing import os import socket -from abc import abstractmethod +from abc import ABCMeta, abstractmethod from collections.abc import Iterator from concurrent.futures import ThreadPoolExecutor @@ -22,12 +22,13 @@ METADATA_ENVS, ) from pynumaflow.proto.mapper import map_pb2_grpc +from pynumaflow.proto.sideinput import sideinput_pb2_grpc from pynumaflow.proto.sinker import sink_pb2_grpc from pynumaflow.proto.sourcer import source_pb2_grpc from pynumaflow.proto.sourcetransformer import transform_pb2_grpc -class NumaflowServer: +class NumaflowServer(metaclass=ABCMeta): """ Provides an interface to write a Numaflow Server which will be exposed over gRPC. @@ -38,7 +39,7 @@ def start(self): """ Start the gRPC server """ - raise NotImplementedError + pass def write_info_file(protocol: Protocol, info_file=SERVER_INFO_FILE_PATH) -> None: @@ -59,18 +60,21 @@ def sync_server_start( max_threads: int, server_options=None, udf_type: str = UDFType.Map, - server_info_file=SERVER_INFO_FILE_PATH, + add_info_server=True, ): """ Utility function to start a sync grpc server instance. 
""" # Add the server information to the server info file, # here we just write the protocol and language information - server_info = ServerInfo( - protocol=Protocol.UDS, - language=Language.PYTHON, - version=get_sdk_version(), - ) + if add_info_server: + server_info = ServerInfo( + protocol=Protocol.UDS, + language=Language.PYTHON, + version=get_sdk_version(), + ) + else: + server_info = None # Run a sync server instances _run_server( servicer=servicer, @@ -79,7 +83,6 @@ def sync_server_start( server_options=server_options, udf_type=udf_type, server_info=server_info, - server_info_file=server_info_file, ) @@ -112,6 +115,8 @@ def _run_server( transform_pb2_grpc.add_SourceTransformServicer_to_server(servicer, server) elif udf_type == UDFType.Source: source_pb2_grpc.add_SourceServicer_to_server(servicer, server) + elif udf_type == UDFType.SideInput: + sideinput_pb2_grpc.add_SideInputServicer_to_server(servicer, server) # bind the server to the UDS/TCP socket server.add_insecure_port(bind_address) @@ -119,7 +124,7 @@ def _run_server( server.start() # Add the server information to the server info file if provided - if server_info: + if server_info and server_info_file: info_server_write(server_info=server_info, info_file=server_info_file) _LOGGER.info("GRPC Server listening on: %s %d", bind_address, os.getpid()) @@ -191,7 +196,12 @@ async def start_async_server( # Add the server information to the server info file # Here we just write the protocol and language information - write_info_file(Protocol.UDS) + serv_info = ServerInfo( + protocol=Protocol.UDS, + language=Language.PYTHON, + version=get_sdk_version(), + ) + info_server_write(server_info=serv_info, info_file=SERVER_INFO_FILE_PATH) # Log the server start _LOGGER.info( diff --git a/pynumaflow/sideinput/__init__.py b/pynumaflow/sideinput/__init__.py index 8a3c36f3..c8713710 100644 --- a/pynumaflow/sideinput/__init__.py +++ b/pynumaflow/sideinput/__init__.py @@ -1,4 +1,5 @@ -from pynumaflow.sideinput._dtypes import Response -from pynumaflow.sideinput.server import SideInput +from pynumaflow._constants import SIDE_INPUT_DIR_PATH +from pynumaflow.sideinput._dtypes import Response, SideInputClass +from pynumaflow.sideinput.sideinput import SideInputServer -__all__ = ["Response", "SideInput"] +__all__ = ["Response", "SideInputClass", "SideInputServer", "SIDE_INPUT_DIR_PATH"] diff --git a/pynumaflow/sideinput/_dtypes.py b/pynumaflow/sideinput/_dtypes.py index 86826578..2225af14 100644 --- a/pynumaflow/sideinput/_dtypes.py +++ b/pynumaflow/sideinput/_dtypes.py @@ -1,5 +1,6 @@ +from abc import ABCMeta, abstractmethod from dataclasses import dataclass -from typing import TypeVar +from typing import TypeVar, Callable, Union R = TypeVar("R", bound="Response") @@ -36,3 +37,28 @@ def no_broadcast_message(cls: type[R]) -> R: This event will not be broadcasted. """ return Response(value=b"", no_broadcast=True) + + +class SideInputClass(metaclass=ABCMeta): + """ + Provides an interface to write a SideInput Class + which will be exposed over gRPC. + """ + + def __call__(self, *args, **kwargs): + """ + This allows to execute the handler function directly if + class instance is sent as a callable. + """ + return self.retrieve_handler(*args, **kwargs) + + @abstractmethod + def retrieve_handler(self) -> Response: + """ + This function is called when a Side Input request is received. 
+ """ + pass + + +RetrieverHandlerCallable = Callable[[], Response] +RetrieverCallable = Union[SideInputClass, RetrieverHandlerCallable] diff --git a/pynumaflow/sideinput/server.py b/pynumaflow/sideinput/server.py index 7d700a61..458365b7 100644 --- a/pynumaflow/sideinput/server.py +++ b/pynumaflow/sideinput/server.py @@ -1,71 +1,20 @@ -import logging -import multiprocessing -import os -from concurrent.futures import ThreadPoolExecutor -from typing import Callable - import grpc from google.protobuf import empty_pb2 as _empty_pb2 -from pynumaflow import setup_logging from pynumaflow._constants import ( - MAX_MESSAGE_SIZE, - SIDE_INPUT_SOCK_PATH, + _LOGGER, ) -from pynumaflow.sideinput import Response from pynumaflow.proto.sideinput import sideinput_pb2_grpc, sideinput_pb2 +from pynumaflow.sideinput._dtypes import RetrieverCallable from pynumaflow.types import NumaflowServicerContext -_LOGGER = setup_logging(__name__) -if os.getenv("PYTHONDEBUG"): - _LOGGER.setLevel(logging.DEBUG) - -RetrieverCallable = Callable[[], Response] -_PROCESS_COUNT = multiprocessing.cpu_count() -MAX_THREADS = int(os.getenv("MAX_THREADS", 0)) or (_PROCESS_COUNT * 4) - class SideInput(sideinput_pb2_grpc.SideInputServicer): - """ - Provides an interface to write a User Defined Side Input (UDSideInput) - which will be exposed over gRPC. - - Args: - handler: Function callable following the type signature of RetrieverCallable - sock_path: Path to the UNIX Domain Socket - max_message_size: The max message size in bytes the server can receive and send - max_threads: The max number of threads to be spawned; - defaults to number of processors x 4 - - Example invocation: - >>> from typing import List - >>> from pynumaflow.sideinput import Response, SideInput - >>> def my_handler() -> Response: - ... response = Response.broadcast_message(b"hello") - ... return response - >>> grpc_server = SideInput(my_handler) - >>> grpc_server.start() - """ - - SIDE_INPUT_DIR_PATH = "/var/numaflow/side-inputs" - def __init__( self, handler: RetrieverCallable, - sock_path=SIDE_INPUT_SOCK_PATH, - max_message_size=MAX_MESSAGE_SIZE, - max_threads=MAX_THREADS, ): self.__retrieve_handler: RetrieverCallable = handler - self.sock_path = f"unix://{sock_path}" - self._max_message_size = max_message_size - self._max_threads = max_threads - self.cleanup_coroutines = [] - - self._server_options = [ - ("grpc.max_send_message_length", self._max_message_size), - ("grpc.max_receive_message_length", self._max_message_size), - ] def RetrieveSideInput( self, request: _empty_pb2.Empty, context: NumaflowServicerContext @@ -94,22 +43,3 @@ def IsReady( The pascal case function name comes from the proto sideinput_pb2_grpc.py file. """ return sideinput_pb2.ReadyResponse(ready=True) - - def start(self) -> None: - """ - Starts the gRPC server on the given UNIX socket with given max threads. 
- """ - server = grpc.server( - ThreadPoolExecutor(max_workers=self._max_threads), options=self._server_options - ) - sideinput_pb2_grpc.add_SideInputServicer_to_server( - SideInput(self.__retrieve_handler), server - ) - server.add_insecure_port(self.sock_path) - server.start() - _LOGGER.info( - "Side Input gRPC Server listening on: %s with max threads: %s", - self.sock_path, - self._max_threads, - ) - server.wait_for_termination() diff --git a/pynumaflow/sideinput/sideinput.py b/pynumaflow/sideinput/sideinput.py new file mode 100644 index 00000000..3076e2d7 --- /dev/null +++ b/pynumaflow/sideinput/sideinput.py @@ -0,0 +1,83 @@ +import os +from pynumaflow.shared import NumaflowServer +from pynumaflow.shared.server import sync_server_start +from pynumaflow.sideinput._dtypes import RetrieverCallable +from pynumaflow.sideinput.server import SideInput +from pynumaflow._constants import ( + MAX_THREADS, + MAX_MESSAGE_SIZE, + SIDE_INPUT_SOCK_PATH, + ServerType, + _LOGGER, + UDFType, + SIDE_INPUT_DIR_PATH, +) + + +class SideInputServer(NumaflowServer): + """Server for side input""" + + def __init__( + self, + side_input_instance: RetrieverCallable, + sock_path=SIDE_INPUT_SOCK_PATH, + max_message_size=MAX_MESSAGE_SIZE, + max_threads=MAX_THREADS, + server_type=ServerType.Sync, + side_input_dir_path=SIDE_INPUT_DIR_PATH, + ): + self.sock_path = f"unix://{sock_path}" + self.max_threads = min(max_threads, int(os.getenv("MAX_THREADS", "4"))) + self.max_message_size = max_message_size + + self._server_options = [ + ("grpc.max_send_message_length", self.max_message_size), + ("grpc.max_receive_message_length", self.max_message_size), + ] + + self.side_input_instance = side_input_instance + self.server_type = server_type + self.side_input_dir_path = side_input_dir_path + + def start(self): + """Starter function for the server class, Handles the server type and + starts the server accordingly. If the server type is not supported, + raises NotImplementedError. + Currently supported server types: + 1) ServerType.Sync + """ + if self.server_type == ServerType.Sync: + return self.exec() + else: + _LOGGER.error("Server type not supported - %s", str(self.server_type)) + raise NotImplementedError + + def exec(self): + """ + Starts the Synchronous gRPC server on the given UNIX socket with given max threads. 
+ """ + # Get the servicer instance based on the server type + side_input_servicer = self.get_servicer( + side_input_instance=self.side_input_instance, server_type=self.server_type + ) + _LOGGER.info( + "Side Input GRPC Server listening on: %s with max threads: %s", + self.sock_path, + self.max_threads, + ) + # Start the server + sync_server_start( + servicer=side_input_servicer, + bind_address=self.sock_path, + max_threads=self.max_threads, + server_options=self._server_options, + udf_type=UDFType.SideInput, + add_info_server=False, + ) + + def get_servicer(self, side_input_instance, server_type): + """ + Returns the servicer instance based on the server type + """ + if server_type == ServerType.Sync: + return SideInput(side_input_instance) diff --git a/tests/sideinput/test_side_input_server.py b/tests/sideinput/test_side_input_server.py index 4d8db881..c172d27c 100644 --- a/tests/sideinput/test_side_input_server.py +++ b/tests/sideinput/test_side_input_server.py @@ -4,10 +4,9 @@ from google.protobuf import empty_pb2 as _empty_pb2 from grpc import StatusCode from grpc_testing import server_from_dictionary, strict_real_time -from pynumaflow.sideinput import SideInput from pynumaflow.proto.sideinput import sideinput_pb2 -from pynumaflow.sideinput import Response +from pynumaflow.sideinput import Response, SideInputServer def retrieve_side_input_handler() -> Response: @@ -34,7 +33,10 @@ class TestServer(unittest.TestCase): """ def setUp(self) -> None: - my_service = SideInput(handler=retrieve_side_input_handler) + server = SideInputServer(side_input_instance=retrieve_side_input_handler) + my_service = server.get_servicer( + side_input_instance=server.side_input_instance, server_type=server.server_type + ) services = {sideinput_pb2.DESCRIPTOR.services_by_name["SideInput"]: my_service} self.test_server = server_from_dictionary(services, strict_real_time()) @@ -42,20 +44,23 @@ def test_init_with_args(self) -> None: """ Test the initialization of the SideInput class, """ - my_servicer = SideInput( - handler=retrieve_side_input_handler, + my_servicer = SideInputServer( + side_input_instance=retrieve_side_input_handler, sock_path="/tmp/test_side_input.sock", max_message_size=1024 * 1024 * 5, ) self.assertEqual(my_servicer.sock_path, "unix:///tmp/test_side_input.sock") - self.assertEqual(my_servicer._max_message_size, 1024 * 1024 * 5) + self.assertEqual(my_servicer.max_message_size, 1024 * 1024 * 5) def test_side_input_err(self): """ Test the error case for the RetrieveSideInput method, """ - my_servicer = SideInput(handler=err_retrieve_handler) - services = {sideinput_pb2.DESCRIPTOR.services_by_name["SideInput"]: my_servicer} + server = SideInputServer(side_input_instance=err_retrieve_handler) + my_service = server.get_servicer( + side_input_instance=server.side_input_instance, server_type=server.server_type + ) + services = {sideinput_pb2.DESCRIPTOR.services_by_name["SideInput"]: my_service} self.test_server = server_from_dictionary(services, strict_real_time()) method = self.test_server.invoke_unary_unary( @@ -115,7 +120,10 @@ def test_side_input_no_broadcast(self): Test the no_broadcast_message method, where we expect the no_broadcast flag to be True. 
""" - my_servicer = SideInput(handler=retrieve_no_broadcast_handler) + server = SideInputServer(side_input_instance=retrieve_no_broadcast_handler) + my_servicer = server.get_servicer( + side_input_instance=server.side_input_instance, server_type=server.server_type + ) services = {sideinput_pb2.DESCRIPTOR.services_by_name["SideInput"]: my_servicer} self.test_server = server_from_dictionary(services, strict_real_time()) @@ -137,7 +145,7 @@ def test_side_input_no_broadcast(self): def test_invalid_input(self): with self.assertRaises(TypeError): - SideInput() + SideInputServer() if __name__ == "__main__": From 617f63adbaadabf9094c5e403d1c7f6e07a28768 Mon Sep 17 00:00:00 2001 From: Sidhant Kohli Date: Thu, 11 Jan 2024 13:44:35 -0800 Subject: [PATCH 51/78] tests Signed-off-by: Sidhant Kohli --- tests/map/test_messages.py | 27 ++++++++++++++++++++++- tests/sideinput/test_side_input_server.py | 2 ++ 2 files changed, 28 insertions(+), 1 deletion(-) diff --git a/tests/map/test_messages.py b/tests/map/test_messages.py index e3a29027..eab2f01a 100644 --- a/tests/map/test_messages.py +++ b/tests/map/test_messages.py @@ -1,6 +1,6 @@ import unittest -from pynumaflow.mapper import Messages, Message, DROP +from pynumaflow.mapper import Messages, Message, DROP, MapperClass, Datum from tests.testing_utils import mock_message @@ -90,5 +90,30 @@ def test_err(self): msgts[:1] +class ExampleMapper(MapperClass): + def handler(self, keys: list[str], datum: Datum) -> Messages: + messages = Messages() + messages.append(Message(mock_message(), keys=keys)) + return messages + + +class TestMapClass(unittest.TestCase): + def setUp(self) -> None: + # Create a map class instance + self.mapper_instance = ExampleMapper() + + def test_map_class_call(self): + """Test that the __call__ functionality for the class works, + ie the class instance can be called directly to invoke the handler function + """ + # make a call to the class directly + ret = self.mapper_instance([], None) + self.assertEqual(mock_message(), ret[0].value) + # make a call to the handler + ret_handler = self.mapper_instance.handler(keys=[], datum=None) + # + self.assertEqual(ret[0], ret_handler[0]) + + if __name__ == "__main__": unittest.main() diff --git a/tests/sideinput/test_side_input_server.py b/tests/sideinput/test_side_input_server.py index c172d27c..a9204e8e 100644 --- a/tests/sideinput/test_side_input_server.py +++ b/tests/sideinput/test_side_input_server.py @@ -146,6 +146,8 @@ def test_side_input_no_broadcast(self): def test_invalid_input(self): with self.assertRaises(TypeError): SideInputServer() + with self.assertRaises(NotImplementedError): + SideInputServer(side_input_instance=retrieve_side_input_handler, server_type="test").start() if __name__ == "__main__": From 9bb16a7463ad54cf6d1af2e047f474da2d0e842c Mon Sep 17 00:00:00 2001 From: Sidhant Kohli Date: Thu, 11 Jan 2024 13:45:41 -0800 Subject: [PATCH 52/78] tests Signed-off-by: Sidhant Kohli --- tests/sideinput/test_side_input_server.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/sideinput/test_side_input_server.py b/tests/sideinput/test_side_input_server.py index a9204e8e..d9b2b841 100644 --- a/tests/sideinput/test_side_input_server.py +++ b/tests/sideinput/test_side_input_server.py @@ -147,7 +147,9 @@ def test_invalid_input(self): with self.assertRaises(TypeError): SideInputServer() with self.assertRaises(NotImplementedError): - SideInputServer(side_input_instance=retrieve_side_input_handler, server_type="test").start() + SideInputServer( + 
side_input_instance=retrieve_side_input_handler, server_type="test" + ).start() if __name__ == "__main__": From e4656d527b06234ab4a7e766a66b46cd948684d0 Mon Sep 17 00:00:00 2001 From: Sidhant Kohli Date: Thu, 11 Jan 2024 14:41:38 -0800 Subject: [PATCH 53/78] tests Signed-off-by: Sidhant Kohli --- tests/sideinput/test_responses.py | 25 ++++++++++++++++- tests/sink/test_responses.py | 28 ++++++++++++++++++- tests/source/test_message.py | 7 ++++- tests/sourcetransform/test_messages.py | 28 ++++++++++++++++++- tests/test_shared.py | 37 ++++++++++++++++++++++++++ 5 files changed, 121 insertions(+), 4 deletions(-) create mode 100644 tests/test_shared.py diff --git a/tests/sideinput/test_responses.py b/tests/sideinput/test_responses.py index 589250e3..bf0e7edb 100644 --- a/tests/sideinput/test_responses.py +++ b/tests/sideinput/test_responses.py @@ -1,6 +1,6 @@ import unittest -from pynumaflow.sideinput import Response +from pynumaflow.sideinput import Response, SideInputClass class TestResponse(unittest.TestCase): @@ -26,5 +26,28 @@ def test_no_broadcast_message(self): self.assertTrue(succ_response.no_broadcast) +class ExampleSideInput(SideInputClass): + def retrieve_handler(self) -> Response: + return Response.broadcast_message(b"testMessage") + + +class TestSideInputClass(unittest.TestCase): + def setUp(self) -> None: + # Create a side input class instance + self.side_input_instance = ExampleSideInput() + + def test_side_input_class_call(self): + """Test that the __call__ functionality for the class works, + ie the class instance can be called directly to invoke the handler function + """ + # make a call to the class directly + ret = self.side_input_instance() + self.assertEqual(b"testMessage", ret.value) + # make a call to the handler + ret_handler = self.side_input_instance.retrieve_handler() + # Both responses should be equal + self.assertEqual(ret, ret_handler) + + if __name__ == "__main__": unittest.main() diff --git a/tests/sink/test_responses.py b/tests/sink/test_responses.py index 9fed6d97..516abb19 100644 --- a/tests/sink/test_responses.py +++ b/tests/sink/test_responses.py @@ -1,6 +1,7 @@ import unittest +from collections.abc import Iterator -from pynumaflow.sinker import Response, Responses +from pynumaflow.sinker import Response, Responses, SinkerClass, Datum class TestResponse(unittest.TestCase): @@ -39,5 +40,30 @@ def test_responses(self): ) +class ExampleSinkClass(SinkerClass): + def handler(self, datums: Iterator[Datum]) -> Responses: + results = Responses() + results.append(Response.as_success("test_message")) + return results + + +class TestSinkClass(unittest.TestCase): + def setUp(self) -> None: + # Create a map class instance + self.sinker_instance = ExampleSinkClass() + + def test_sink_class_call(self): + """Test that the __call__ functionality for the class works, + ie the class instance can be called directly to invoke the handler function + """ + # make a call to the class directly + ret = self.sinker_instance(None) + self.assertEqual("test_message", ret[0].id) + # make a call to the handler + ret_handler = self.sinker_instance.handler(None) + # Both responses should be equal + self.assertEqual(ret[0], ret_handler[0]) + + if __name__ == "__main__": unittest.main() diff --git a/tests/source/test_message.py b/tests/source/test_message.py index 43d54f87..00ca83e7 100644 --- a/tests/source/test_message.py +++ b/tests/source/test_message.py @@ -1,6 +1,11 @@ import unittest -from pynumaflow.sourcer import Message, Offset, ReadRequest, PartitionsResponse +from 
pynumaflow.sourcer import ( + Message, + Offset, + ReadRequest, + PartitionsResponse, +) from tests.source.utils import mock_offset from tests.testing_utils import mock_event_time diff --git a/tests/sourcetransform/test_messages.py b/tests/sourcetransform/test_messages.py index 8f2caf46..dd6ae39e 100644 --- a/tests/sourcetransform/test_messages.py +++ b/tests/sourcetransform/test_messages.py @@ -1,7 +1,8 @@ import unittest from datetime import datetime, timezone -from pynumaflow.sourcetransformer import Messages, Message, DROP +from pynumaflow.sourcetransformer import Messages, Message, DROP, SourceTransformerClass, Datum +from tests.testing_utils import mock_new_event_time def mock_message_t(): @@ -93,5 +94,30 @@ def test_err(self): msgts[:1] +class ExampleSourceTransformClass(SourceTransformerClass): + def handler(self, keys: list[str], datum: Datum) -> Messages: + messages = Messages() + messages.append(Message(mock_message_t(), mock_new_event_time(), keys=keys)) + return messages + + +class TestSourceTransformClass(unittest.TestCase): + def setUp(self) -> None: + # Create a map class instance + self.transform_instance = ExampleSourceTransformClass() + + def test_source_transform_class_call(self): + """Test that the __call__ functionality for the class works, + ie the class instance can be called directly to invoke the handler function + """ + # make a call to the class directly + ret = self.transform_instance([], None) + self.assertEqual(mock_message_t(), ret[0].value) + # make a call to the handler + ret_handler = self.transform_instance.handler([], None) + # Both responses should be equal + self.assertEqual(ret[0], ret_handler[0]) + + if __name__ == "__main__": unittest.main() diff --git a/tests/test_shared.py b/tests/test_shared.py new file mode 100644 index 00000000..9818d9ac --- /dev/null +++ b/tests/test_shared.py @@ -0,0 +1,37 @@ +import unittest + + +from pynumaflow.info.server import get_sdk_version +from pynumaflow.info.types import Protocol +from pynumaflow.mapper import Datum, Messages, Message + +from pynumaflow.shared.server import write_info_file +from tests.testing_utils import read_info_server + + +def map_handler(keys: list[str], datum: Datum) -> Messages: + val = datum.value + msg = "payload:{} event_time:{} watermark:{}".format( + val.decode("utf-8"), + datum.event_time, + datum.watermark, + ) + val = bytes(msg, encoding="utf-8") + messages = Messages() + messages.append(Message(val, keys=keys)) + return messages + + +class TestSharedUtils(unittest.TestCase): + def test_write_info_file(self): + """ + Test write_info_file function + Write data to the info file and read it back to verify + """ + info_file = "/tmp/test_info_server" + ret = write_info_file(info_file=info_file, protocol=Protocol.UDS) + self.assertIsNone(ret) + file_data = read_info_server(info_file=info_file) + self.assertEqual(file_data["protocol"], "uds") + self.assertEqual(file_data["language"], "python") + self.assertEqual(file_data["version"], get_sdk_version()) From 7c1b2762aab969b129e48a784e3830f72436590c Mon Sep 17 00:00:00 2001 From: Sidhant Kohli Date: Thu, 11 Jan 2024 15:01:28 -0800 Subject: [PATCH 54/78] add uvloop Signed-off-by: Sidhant Kohli --- poetry.lock | 56 ++++++++++++++++++++++++++++- pynumaflow/mapper/map.py | 2 +- pynumaflow/mapstreamer/mapstream.py | 2 +- pynumaflow/reducer/reduce.py | 2 +- pynumaflow/sinker/sink.py | 2 +- pynumaflow/sourcer/source.py | 2 +- pyproject.toml | 1 + 7 files changed, 61 insertions(+), 6 deletions(-) diff --git a/poetry.lock b/poetry.lock index 
8b310569..49aa67ad 100644 --- a/poetry.lock +++ b/poetry.lock @@ -762,6 +762,7 @@ files = [ {file = "PyYAML-6.0.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:69b023b2b4daa7548bcfbd4aa3da05b3a74b772db9e23b982788168117739938"}, {file = "PyYAML-6.0.1-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:81e0b275a9ecc9c0c0c07b4b90ba548307583c125f54d5b6946cfee6360c733d"}, {file = "PyYAML-6.0.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ba336e390cd8e4d1739f42dfe9bb83a3cc2e80f567d8805e11b46f4a943f5515"}, + {file = "PyYAML-6.0.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:326c013efe8048858a6d312ddd31d56e468118ad4cdeda36c719bf5bb6192290"}, {file = "PyYAML-6.0.1-cp310-cp310-win32.whl", hash = "sha256:bd4af7373a854424dabd882decdc5579653d7868b8fb26dc7d0e99f823aa5924"}, {file = "PyYAML-6.0.1-cp310-cp310-win_amd64.whl", hash = "sha256:fd1592b3fdf65fff2ad0004b5e363300ef59ced41c2e6b3a99d4089fa8c5435d"}, {file = "PyYAML-6.0.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:6965a7bc3cf88e5a1c3bd2e0b5c22f8d677dc88a455344035f03399034eb3007"}, @@ -769,8 +770,15 @@ files = [ {file = "PyYAML-6.0.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:42f8152b8dbc4fe7d96729ec2b99c7097d656dc1213a3229ca5383f973a5ed6d"}, {file = "PyYAML-6.0.1-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:062582fca9fabdd2c8b54a3ef1c978d786e0f6b3a1510e0ac93ef59e0ddae2bc"}, {file = "PyYAML-6.0.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d2b04aac4d386b172d5b9692e2d2da8de7bfb6c387fa4f801fbf6fb2e6ba4673"}, + {file = "PyYAML-6.0.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:e7d73685e87afe9f3b36c799222440d6cf362062f78be1013661b00c5c6f678b"}, {file = "PyYAML-6.0.1-cp311-cp311-win32.whl", hash = "sha256:1635fd110e8d85d55237ab316b5b011de701ea0f29d07611174a1b42f1444741"}, {file = "PyYAML-6.0.1-cp311-cp311-win_amd64.whl", hash = "sha256:bf07ee2fef7014951eeb99f56f39c9bb4af143d8aa3c21b1677805985307da34"}, + {file = "PyYAML-6.0.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:855fb52b0dc35af121542a76b9a84f8d1cd886ea97c84703eaa6d88e37a2ad28"}, + {file = "PyYAML-6.0.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:40df9b996c2b73138957fe23a16a4f0ba614f4c0efce1e9406a184b6d07fa3a9"}, + {file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6c22bec3fbe2524cde73d7ada88f6566758a8f7227bfbf93a408a9d86bcc12a0"}, + {file = "PyYAML-6.0.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:8d4e9c88387b0f5c7d5f281e55304de64cf7f9c0021a3525bd3b1c542da3b0e4"}, + {file = "PyYAML-6.0.1-cp312-cp312-win32.whl", hash = "sha256:d483d2cdf104e7c9fa60c544d92981f12ad66a457afae824d146093b8c294c54"}, + {file = "PyYAML-6.0.1-cp312-cp312-win_amd64.whl", hash = "sha256:0d3304d8c0adc42be59c5f8a4d9e3d7379e6955ad754aa9d6ab7a398b59dd1df"}, {file = "PyYAML-6.0.1-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:50550eb667afee136e9a77d6dc71ae76a44df8b3e51e41b77f6de2932bfe0f47"}, {file = "PyYAML-6.0.1-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1fe35611261b29bd1de0070f0b2f47cb6ff71fa6595c077e42bd0c419fa27b98"}, {file = "PyYAML-6.0.1-cp36-cp36m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:704219a11b772aea0d8ecd7058d0082713c3562b4e271b849ad7dc4a5c90c13c"}, @@ -787,6 +795,7 @@ files = [ {file = "PyYAML-6.0.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = 
"sha256:a0cd17c15d3bb3fa06978b4e8958dcdc6e0174ccea823003a106c7d4d7899ac5"}, {file = "PyYAML-6.0.1-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:28c119d996beec18c05208a8bd78cbe4007878c6dd15091efb73a30e90539696"}, {file = "PyYAML-6.0.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7e07cbde391ba96ab58e532ff4803f79c4129397514e1413a7dc761ccd755735"}, + {file = "PyYAML-6.0.1-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:49a183be227561de579b4a36efbb21b3eab9651dd81b1858589f796549873dd6"}, {file = "PyYAML-6.0.1-cp38-cp38-win32.whl", hash = "sha256:184c5108a2aca3c5b3d3bf9395d50893a7ab82a38004c8f61c258d4428e80206"}, {file = "PyYAML-6.0.1-cp38-cp38-win_amd64.whl", hash = "sha256:1e2722cc9fbb45d9b87631ac70924c11d3a401b2d7f410cc0e3bbf249f2dca62"}, {file = "PyYAML-6.0.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:9eb6caa9a297fc2c2fb8862bc5370d0303ddba53ba97e71f08023b6cd73d16a8"}, @@ -794,6 +803,7 @@ files = [ {file = "PyYAML-6.0.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5773183b6446b2c99bb77e77595dd486303b4faab2b086e7b17bc6bef28865f6"}, {file = "PyYAML-6.0.1-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:b786eecbdf8499b9ca1d697215862083bd6d2a99965554781d0d8d1ad31e13a0"}, {file = "PyYAML-6.0.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bc1bf2925a1ecd43da378f4db9e4f799775d6367bdb94671027b73b393a7c42c"}, + {file = "PyYAML-6.0.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:04ac92ad1925b2cff1db0cfebffb6ffc43457495c9b3c39d3fcae417d7125dc5"}, {file = "PyYAML-6.0.1-cp39-cp39-win32.whl", hash = "sha256:faca3bdcf85b2fc05d06ff3fbc1f83e1391b3e724afa3feba7d13eeab355484c"}, {file = "PyYAML-6.0.1-cp39-cp39-win_amd64.whl", hash = "sha256:510c9deebc5c0225e8c96813043e62b680ba2f9c50a08d3724c7f28a747d1486"}, {file = "PyYAML-6.0.1.tar.gz", hash = "sha256:bfdf460b1736c775f2ba9f6a92bca30bc2095067b8a9d77876d1fad6cc3b4a43"}, @@ -915,6 +925,50 @@ secure = ["certifi", "cryptography (>=1.9)", "idna (>=2.0.0)", "pyopenssl (>=17. 
socks = ["pysocks (>=1.5.6,!=1.5.7,<2.0)"] zstd = ["zstandard (>=0.18.0)"] +[[package]] +name = "uvloop" +version = "0.19.0" +description = "Fast implementation of asyncio event loop on top of libuv" +optional = false +python-versions = ">=3.8.0" +files = [ + {file = "uvloop-0.19.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:de4313d7f575474c8f5a12e163f6d89c0a878bc49219641d49e6f1444369a90e"}, + {file = "uvloop-0.19.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:5588bd21cf1fcf06bded085f37e43ce0e00424197e7c10e77afd4bbefffef428"}, + {file = "uvloop-0.19.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7b1fd71c3843327f3bbc3237bedcdb6504fd50368ab3e04d0410e52ec293f5b8"}, + {file = "uvloop-0.19.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5a05128d315e2912791de6088c34136bfcdd0c7cbc1cf85fd6fd1bb321b7c849"}, + {file = "uvloop-0.19.0-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:cd81bdc2b8219cb4b2556eea39d2e36bfa375a2dd021404f90a62e44efaaf957"}, + {file = "uvloop-0.19.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:5f17766fb6da94135526273080f3455a112f82570b2ee5daa64d682387fe0dcd"}, + {file = "uvloop-0.19.0-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:4ce6b0af8f2729a02a5d1575feacb2a94fc7b2e983868b009d51c9a9d2149bef"}, + {file = "uvloop-0.19.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:31e672bb38b45abc4f26e273be83b72a0d28d074d5b370fc4dcf4c4eb15417d2"}, + {file = "uvloop-0.19.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:570fc0ed613883d8d30ee40397b79207eedd2624891692471808a95069a007c1"}, + {file = "uvloop-0.19.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5138821e40b0c3e6c9478643b4660bd44372ae1e16a322b8fc07478f92684e24"}, + {file = "uvloop-0.19.0-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:91ab01c6cd00e39cde50173ba4ec68a1e578fee9279ba64f5221810a9e786533"}, + {file = "uvloop-0.19.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:47bf3e9312f63684efe283f7342afb414eea4d3011542155c7e625cd799c3b12"}, + {file = "uvloop-0.19.0-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:da8435a3bd498419ee8c13c34b89b5005130a476bda1d6ca8cfdde3de35cd650"}, + {file = "uvloop-0.19.0-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:02506dc23a5d90e04d4f65c7791e65cf44bd91b37f24cfc3ef6cf2aff05dc7ec"}, + {file = "uvloop-0.19.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2693049be9d36fef81741fddb3f441673ba12a34a704e7b4361efb75cf30befc"}, + {file = "uvloop-0.19.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7010271303961c6f0fe37731004335401eb9075a12680738731e9c92ddd96ad6"}, + {file = "uvloop-0.19.0-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:5daa304d2161d2918fa9a17d5635099a2f78ae5b5960e742b2fcfbb7aefaa593"}, + {file = "uvloop-0.19.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:7207272c9520203fea9b93843bb775d03e1cf88a80a936ce760f60bb5add92f3"}, + {file = "uvloop-0.19.0-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:78ab247f0b5671cc887c31d33f9b3abfb88d2614b84e4303f1a63b46c046c8bd"}, + {file = "uvloop-0.19.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:472d61143059c84947aa8bb74eabbace30d577a03a1805b77933d6bd13ddebbd"}, + {file = "uvloop-0.19.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:45bf4c24c19fb8a50902ae37c5de50da81de4922af65baf760f7c0c42e1088be"}, + {file = 
"uvloop-0.19.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:271718e26b3e17906b28b67314c45d19106112067205119dddbd834c2b7ce797"}, + {file = "uvloop-0.19.0-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:34175c9fd2a4bc3adc1380e1261f60306344e3407c20a4d684fd5f3be010fa3d"}, + {file = "uvloop-0.19.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:e27f100e1ff17f6feeb1f33968bc185bf8ce41ca557deee9d9bbbffeb72030b7"}, + {file = "uvloop-0.19.0-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:13dfdf492af0aa0a0edf66807d2b465607d11c4fa48f4a1fd41cbea5b18e8e8b"}, + {file = "uvloop-0.19.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:6e3d4e85ac060e2342ff85e90d0c04157acb210b9ce508e784a944f852a40e67"}, + {file = "uvloop-0.19.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8ca4956c9ab567d87d59d49fa3704cf29e37109ad348f2d5223c9bf761a332e7"}, + {file = "uvloop-0.19.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f467a5fd23b4fc43ed86342641f3936a68ded707f4627622fa3f82a120e18256"}, + {file = "uvloop-0.19.0-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:492e2c32c2af3f971473bc22f086513cedfc66a130756145a931a90c3958cb17"}, + {file = "uvloop-0.19.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:2df95fca285a9f5bfe730e51945ffe2fa71ccbfdde3b0da5772b4ee4f2e770d5"}, + {file = "uvloop-0.19.0.tar.gz", hash = "sha256:0246f4fd1bf2bf702e06b0d45ee91677ee5c31242f39aab4ea6fe0c51aedd0fd"}, +] + +[package.extras] +docs = ["Sphinx (>=4.1.2,<4.2.0)", "sphinx-rtd-theme (>=0.5.2,<0.6.0)", "sphinxcontrib-asyncio (>=0.3.0,<0.4.0)"] +test = ["Cython (>=0.29.36,<0.30.0)", "aiohttp (==3.9.0b0)", "aiohttp (>=3.8.1)", "flake8 (>=5.0,<6.0)", "mypy (>=0.800)", "psutil", "pyOpenSSL (>=23.0.0,<23.1.0)", "pycodestyle (>=2.9.0,<2.10.0)"] + [[package]] name = "virtualenv" version = "20.24.5" @@ -938,4 +992,4 @@ test = ["covdefaults (>=2.3)", "coverage (>=7.2.7)", "coverage-enable-subprocess [metadata] lock-version = "2.0" python-versions = ">=3.9, <3.12" -content-hash = "6372817a9a99177a328bfc5ec53fb44d5cf5b66205c3079bc35751561363966e" +content-hash = "e6fd5e2ffdc1b0e57b4cba288c6cb20260a66250085fe1f0b4f5982488ad81b4" diff --git a/pynumaflow/mapper/map.py b/pynumaflow/mapper/map.py index 3013103c..f0df10fe 100644 --- a/pynumaflow/mapper/map.py +++ b/pynumaflow/mapper/map.py @@ -89,7 +89,7 @@ def start(self) -> None: if self.server_type == ServerType.Sync: self.exec() elif self.server_type == ServerType.Async: - aiorun.run(self.aexec()) + aiorun.run(self.aexec(), use_uvloop=True) elif self.server_type == ServerType.Multiproc: self.exec_multiproc() else: diff --git a/pynumaflow/mapstreamer/mapstream.py b/pynumaflow/mapstreamer/mapstream.py index e553771a..c7fb47eb 100644 --- a/pynumaflow/mapstreamer/mapstream.py +++ b/pynumaflow/mapstreamer/mapstream.py @@ -64,7 +64,7 @@ def start(self): - ServerType.Async: Asynchronous server """ if self.server_type == ServerType.Async: - aiorun.run(self.aexec()) + aiorun.run(self.aexec(), use_uvloop=True) else: _LOGGER.error("Server type not supported - %s", str(self.server_type)) raise NotImplementedError diff --git a/pynumaflow/reducer/reduce.py b/pynumaflow/reducer/reduce.py index e7742486..6f03f495 100644 --- a/pynumaflow/reducer/reduce.py +++ b/pynumaflow/reducer/reduce.py @@ -62,7 +62,7 @@ def start(self): 1. 
Async
        """
        if self.server_type == ServerType.Async:
-            aiorun.run(self.aexec())
+            aiorun.run(self.aexec(), use_uvloop=True)
         else:
             _LOGGER.error("Server type not supported - %s", str(self.server_type))
             raise NotImplementedError
diff --git a/pynumaflow/sinker/sink.py b/pynumaflow/sinker/sink.py
index 32139b53..f3b21c07 100644
--- a/pynumaflow/sinker/sink.py
+++ b/pynumaflow/sinker/sink.py
@@ -72,7 +72,7 @@ def start(self):
         if self.server_type == ServerType.Sync:
             self.exec()
         elif self.server_type == ServerType.Async:
-            aiorun.run(self.aexec())
+            aiorun.run(self.aexec(), use_uvloop=True)
         else:
             _LOGGER.error("Server type not supported - %s", str(self.server_type))
             raise NotImplementedError
diff --git a/pynumaflow/sourcer/source.py b/pynumaflow/sourcer/source.py
index de77f28a..b7d066bf 100644
--- a/pynumaflow/sourcer/source.py
+++ b/pynumaflow/sourcer/source.py
@@ -67,7 +67,7 @@ def start(self):
         if self.server_type == ServerType.Sync:
             self.exec()
         elif self.server_type == ServerType.Async:
-            aiorun.run(self.aexec())
+            aiorun.run(self.aexec(), use_uvloop=True)
         else:
             _LOGGER.error("Server type not supported - %s", str(self.server_type))
             raise NotImplementedError
diff --git a/pyproject.toml b/pyproject.toml
index 7a315da3..9f198883 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -27,6 +27,7 @@ google-cloud = "^0.34.0"
 google-api-core = "^2.11.0"
 protobuf = ">=3.20,<5.0"
 aiorun = "^2023.7"
+uvloop = "^0.19.0"
 
 [tool.poetry.group.dev]
 optional = true

From 74efb87ec3dc5d92a452a9d055169519a0dd76bf Mon Sep 17 00:00:00 2001
From: Sidhant Kohli
Date: Tue, 16 Jan 2024 15:18:17 -0800
Subject: [PATCH 55/78] separate mappers

Signed-off-by: Sidhant Kohli
---
 README.md                                     |  10 +-
 examples/map/flatmap/example.py               |   7 +-
 examples/map/multiproc_map/example.py         |   7 +-
 pynumaflow/mapper/__init__.py                 |  18 +-
 pynumaflow/mapper/_dtypes.py                  |  19 +-
 pynumaflow/mapper/async_server.py             | 112 ++++++------
 pynumaflow/mapper/map.py                      | 169 ------------------
 pynumaflow/mapper/multiproc_server.py         |  77 ++++++++
 pynumaflow/mapper/servicer/__init__.py        |   0
 pynumaflow/mapper/servicer/async_servicer.py  |  72 ++++++++
 .../{server.py => servicer/sync_servicer.py}  |  12 +-
 pynumaflow/mapper/{ => servicer}/utils.py     |   4 +-
 pynumaflow/mapper/sync_server.py              |  73 ++++++++
 pynumaflow/mapstreamer/_dtypes.py             |   2 +-
 pynumaflow/mapstreamer/async_server.py        |   2 +-
 pynumaflow/reducer/async_server.py            |   2 +-
 tests/map/test_async_mapper.py                |  11 +-
 tests/map/test_messages.py                    |   4 +-
 tests/map/test_multiproc_mapper.py            |  36 ++--
 tests/map/test_sync_mapper.py                 |  21 +--
 tests/map/utils.py                            |   4 +-
 21 files changed, 359 insertions(+), 303 deletions(-)
 delete mode 100644 pynumaflow/mapper/map.py
 create mode 100644 pynumaflow/mapper/multiproc_server.py
 create mode 100644 pynumaflow/mapper/servicer/__init__.py
 create mode 100644 pynumaflow/mapper/servicer/async_servicer.py
 rename pynumaflow/mapper/{server.py => servicer/sync_servicer.py} (75%)
 rename pynumaflow/mapper/{ => servicer}/utils.py (87%)
 create mode 100644 pynumaflow/mapper/sync_server.py

diff --git a/README.md b/README.md
index f41252fd..14cacc01 100644
--- a/README.md
+++ b/README.md
@@ -153,15 +153,16 @@ Here we can pass the class instance to the server and the server will invoke the
 
 To use a class-based handler, the user needs to inherit the base class of the UDF/UDSink
 and implement the required methods in the class.
 
-For example, for a Mapper, the user needs to inherit the [MapperClass](pynumaflow/mapper/_dtypes.py#170) class and then implement the [handler](pynumaflow/mapper/_dtypes.py#170) method.
+Example For Mapper, the user needs to inherit the [Mapper](pynumaflow/mapper/_dtypes.py#170) class and then implement the [handler](pynumaflow/mapper/_dtypes.py#170) method. ### Map ```python -from pynumaflow.mapper import Messages, Message, Datum, MapServer, MapperClass +from pynumaflow.mapper import Messages, Message, Datum, MapServer, Mapper -class MyHandler(MapperClass): - def handler(self, keys: list[str], datum: Datum) -> Messages: + +class MyHandler(Mapper): + def handler(self, keys: list[str], datum: Datum) -> Messages: val = datum.value _ = datum.event_time _ = datum.watermark @@ -174,6 +175,7 @@ class MyHandler(MapperClass): messages.append(Message(str.encode(s))) return messages + if __name__ == "__main__": class_instance = MyHandler() grpc_server = MapServer(mapper_instance=class_instance) diff --git a/examples/map/flatmap/example.py b/examples/map/flatmap/example.py index 3245b92d..f4da36c0 100644 --- a/examples/map/flatmap/example.py +++ b/examples/map/flatmap/example.py @@ -1,7 +1,7 @@ -from pynumaflow.mapper import Messages, Message, Datum, MapServer, MapperClass +from pynumaflow.mapper import Messages, Message, Datum, MapServer, Mapper -class Flatmap(MapperClass): +class Flatmap(Mapper): def handler(self, keys: list[str], datum: Datum) -> Messages: val = datum.value _ = datum.event_time @@ -17,6 +17,5 @@ def handler(self, keys: list[str], datum: Datum) -> Messages: if __name__ == "__main__": - flatmap_instance = Flatmap() - grpc_server = MapServer(mapper_instance=flatmap_instance) + grpc_server = MapServer(Flatmap()) grpc_server.start() diff --git a/examples/map/multiproc_map/example.py b/examples/map/multiproc_map/example.py index 4dfb44c5..96ef19c2 100644 --- a/examples/map/multiproc_map/example.py +++ b/examples/map/multiproc_map/example.py @@ -1,8 +1,7 @@ import math -from pynumaflow._constants import ServerType -from pynumaflow.mapper import Messages, Message, Datum, MapServer, MapperClass +from pynumaflow.mapper import Messages, Message, Datum, Mapper, MapMultiprocServer def is_prime(n): @@ -13,7 +12,7 @@ def is_prime(n): return True -class PrimeMap(MapperClass): +class PrimeMap(Mapper): def handler(self, keys: list[str], datum: Datum) -> Messages: val = datum.value _ = datum.event_time @@ -34,5 +33,5 @@ def handler(self, keys: list[str], datum: Datum) -> Messages: in the pipeline config for the numa container. 
""" prime_class = PrimeMap() - grpc_server = MapServer(mapper_instance=prime_class, server_type=ServerType.Multiproc) + grpc_server = MapMultiprocServer(mapper_instance=prime_class, server_count=2) grpc_server.start() diff --git a/pynumaflow/mapper/__init__.py b/pynumaflow/mapper/__init__.py index 6df5e6ec..4ebadabd 100644 --- a/pynumaflow/mapper/__init__.py +++ b/pynumaflow/mapper/__init__.py @@ -1,6 +1,18 @@ -from pynumaflow.mapper.map import MapServer +from pynumaflow.mapper.async_server import MapAsyncServer +from pynumaflow.mapper.multiproc_server import MapMultiprocServer +from pynumaflow.mapper.sync_server import MapServer -from pynumaflow.mapper._dtypes import Message, Messages, Datum, DROP, MapperClass +from pynumaflow.mapper._dtypes import Message, Messages, Datum, DROP, Mapper from pynumaflow._constants import ServerType -__all__ = ["Message", "Messages", "Datum", "DROP", "ServerType", "MapperClass", "MapServer"] +__all__ = [ + "Message", + "Messages", + "Datum", + "DROP", + "ServerType", + "Mapper", + "MapServer", + "MapAsyncServer", + "MapMultiprocServer", +] diff --git a/pynumaflow/mapper/_dtypes.py b/pynumaflow/mapper/_dtypes.py index 63ff1088..c7fda744 100644 --- a/pynumaflow/mapper/_dtypes.py +++ b/pynumaflow/mapper/_dtypes.py @@ -163,13 +163,9 @@ def watermark(self) -> datetime: return self._watermark -MapSyncCallable = Callable[[list[str], Datum], Messages] -MapAsyncCallable = Callable[[list[str], Datum], Awaitable[Messages]] - - -class MapperClass(metaclass=ABCMeta): +class Mapper(metaclass=ABCMeta): """ - Provides an interface to write a Mapper + Provides an interface to write a SyncMapServicer which will be exposed over a Synchronous gRPC server. """ @@ -183,10 +179,15 @@ class instance is sent as a callable. @abstractmethod def handler(self, keys: list[str], datum: Datum) -> Messages: """ - Write a handler function which implements the MapCallable interface. + Write a handler function which implements the MapSyncCallable interface. 
""" pass -# MapCallable is a callable which can be used as a handler for the Map UDF -MapCallable = Union[MapperClass, MapSyncCallable, MapAsyncCallable] +# MapSyncCallable is a callable which can be used as a handler for the Synchronous Map UDF +MapSyncHandlerCallable = Callable[[list[str], Datum], Messages] +MapSyncCallable = Union[Mapper, MapSyncHandlerCallable] + +# MapAsyncCallable is a callable which can be used as a handler for the Asynchronous Map UDF +MapAsyncHandlerCallable = Callable[[list[str], Datum], Awaitable[Messages]] +MapAsyncCallable = Union[Mapper, MapAsyncHandlerCallable] diff --git a/pynumaflow/mapper/async_server.py b/pynumaflow/mapper/async_server.py index ec76384f..85b7ff2e 100644 --- a/pynumaflow/mapper/async_server.py +++ b/pynumaflow/mapper/async_server.py @@ -1,72 +1,78 @@ +import os + +import aiorun import grpc -from google.protobuf import empty_pb2 as _empty_pb2 -from pynumaflow.mapper._dtypes import Datum -from pynumaflow.mapper._dtypes import MapAsyncCallable, MapCallable -from pynumaflow.proto.mapper import map_pb2, map_pb2_grpc -from pynumaflow.types import NumaflowServicerContext -from pynumaflow._constants import _LOGGER +from pynumaflow._constants import ( + MAX_THREADS, + MAX_MESSAGE_SIZE, + MAP_SOCK_PATH, +) +from pynumaflow.mapper._dtypes import MapAsyncCallable +from pynumaflow.mapper.servicer.async_servicer import AsyncMapServicer +from pynumaflow.proto.mapper import map_pb2_grpc +from pynumaflow.shared.server import ( + NumaflowServer, + start_async_server, +) -class AsyncMapper(map_pb2_grpc.MapServicer): +class MapAsyncServer(NumaflowServer): """ - This class is used to create a new grpc Async Map Servicer instance. - It implements the MapServicer interface from the proto map.proto file. - Provides the functionality for the required rpc methods. + Create a new grpc Map Server instance. """ def __init__( self, - handler: MapAsyncCallable, + mapper_instance: MapAsyncCallable, + sock_path=MAP_SOCK_PATH, + max_message_size=MAX_MESSAGE_SIZE, + max_threads=MAX_THREADS, ): - self.__map_handler: MapCallable = handler - - async def MapFn( - self, request: map_pb2.MapRequest, context: NumaflowServicerContext - ) -> map_pb2.MapResponse: """ - Applies a function to each datum element. - The pascal case function name comes from the proto map_pb2_grpc.py file. + Create a new grpc Asynchronous Map Server instance. + A new servicer instance is created and attached to the server. + The server instance is returned. 
+ Args: + mapper_instance: The mapper instance to be used for Map UDF + sock_path: The UNIX socket path to be used for the server + max_message_size: The max message size in bytes the server can receive and send + max_threads: The max number of threads to be spawned; + defaults to number of processors x4 """ - # proto repeated field(keys) is of type google._upb._message.RepeatedScalarContainer - # we need to explicitly convert it to list - try: - res = await self.__invoke_map( - list(request.keys), - Datum( - keys=list(request.keys), - value=request.value, - event_time=request.event_time.ToDatetime(), - watermark=request.watermark.ToDatetime(), - ), - ) - except Exception as e: - context.set_code(grpc.StatusCode.UNKNOWN) - context.set_details(str(e)) - return map_pb2.MapResponse(results=[]) + self.sock_path = f"unix://{sock_path}" + self.max_threads = min(max_threads, int(os.getenv("MAX_THREADS", "4"))) + self.max_message_size = max_message_size - return map_pb2.MapResponse(results=res) + self.mapper_instance = mapper_instance - async def __invoke_map(self, keys: list[str], req: Datum): + self._server_options = [ + ("grpc.max_send_message_length", self.max_message_size), + ("grpc.max_receive_message_length", self.max_message_size), + ] + # Get the servicer instance for the async server + self.servicer = AsyncMapServicer(handler=mapper_instance) + + def start(self) -> None: """ - Invokes the user defined function. + Starter function for the Async server class, need a separate caller + so that all the async coroutines can be started from a single context """ - try: - msgs = await self.__map_handler(keys, req) - except Exception as err: - _LOGGER.critical("UDFError, re-raising the error", exc_info=True) - raise err - datums = [] - for msg in msgs: - datums.append(map_pb2.MapResponse.Result(keys=msg.keys, value=msg.value, tags=msg.tags)) - - return datums + aiorun.run(self.aexec(), use_uvloop=True) - async def IsReady( - self, request: _empty_pb2.Empty, context: NumaflowServicerContext - ) -> map_pb2.ReadyResponse: + async def aexec(self) -> None: """ - IsReady is the heartbeat endpoint for gRPC. - The pascal case function name comes from the proto map_pb2_grpc.py file. + Starts the Async gRPC server on the given UNIX socket with + given max threads. """ - return map_pb2.ReadyResponse(ready=True) + + # As the server is async, we need to create a new server instance in the + # same thread as the event loop so that all the async calls are made in the + # same context + + server_new = grpc.aio.server() + server_new.add_insecure_port(self.sock_path) + map_pb2_grpc.add_MapServicer_to_server(self.servicer, server_new) + + # Start the async server + await start_async_server(server_new, self.sock_path, self.max_threads, self._server_options) diff --git a/pynumaflow/mapper/map.py b/pynumaflow/mapper/map.py deleted file mode 100644 index f0df10fe..00000000 --- a/pynumaflow/mapper/map.py +++ /dev/null @@ -1,169 +0,0 @@ -import os - -import aiorun -import grpc -from pynumaflow.mapper.async_server import AsyncMapper - -from pynumaflow.mapper.server import Mapper - -from pynumaflow._constants import ( - MAX_THREADS, - MAX_MESSAGE_SIZE, - _LOGGER, - MAP_SOCK_PATH, - ServerType, - UDFType, -) - -from pynumaflow.mapper._dtypes import MapCallable -from pynumaflow.proto.mapper import map_pb2_grpc -from pynumaflow.shared.server import ( - NumaflowServer, - start_async_server, - start_multiproc_server, - sync_server_start, -) - - -class MapServer(NumaflowServer): - """ - Create a new grpc Map Server instance. 
- """ - - def __init__( - self, - mapper_instance: MapCallable, - sock_path=MAP_SOCK_PATH, - max_message_size=MAX_MESSAGE_SIZE, - max_threads=MAX_THREADS, - server_type=ServerType.Sync, - ): - """ - Create a new grpc Map Server instance. - A new servicer instance is created and attached to the server. - The server instance is returned. - Args: - mapper_instance: The mapper instance to be used for Map UDF - sock_path: The UNIX socket path to be used for the server - max_message_size: The max message size in bytes the server can receive and send - max_threads: The max number of threads to be spawned; - defaults to number of processors x4 - server_type: The type of server to be used, this can be one of the following: - - ServerType.Sync: Synchronous server - - ServerType.Async: Asynchronous server - - ServerType.Multiproc: Multiprocess server - """ - self.sock_path = f"unix://{sock_path}" - self.max_threads = min(max_threads, int(os.getenv("MAX_THREADS", "4"))) - self.max_message_size = max_message_size - - self.mapper_instance = mapper_instance - self.server_type = server_type - - self._server_options = [ - ("grpc.max_send_message_length", self.max_message_size), - ("grpc.max_receive_message_length", self.max_message_size), - ] - if server_type == ServerType.Multiproc: - self._server_options.append(("grpc.so_reuseport", 1)) - self._server_options.append(("grpc.so_reuseaddr", 1)) - - # Set the number of processes to be spawned to the number of CPUs or - # the value of the env var NUM_CPU_MULTIPROC defined by the user - # Setting the max value to 2 * CPU count - # Used for multiproc server - self._process_count = min( - int(os.getenv("NUM_CPU_MULTIPROC", str(os.cpu_count()))), 2 * os.cpu_count() - ) - - def start(self) -> None: - """ - Starter function for the server class, Handles the server type and - starts the server accordingly. If the server type is not supported, - raises NotImplementedError. - Currently supported server types are: - - ServerType.Sync: Synchronous server - - ServerType.Async: Asynchronous server - - ServerType.Multiproc: Multiprocess server - """ - if self.server_type == ServerType.Sync: - self.exec() - elif self.server_type == ServerType.Async: - aiorun.run(self.aexec(), use_uvloop=True) - elif self.server_type == ServerType.Multiproc: - self.exec_multiproc() - else: - _LOGGER.error("Server type not supported - %s", str(self.server_type)) - raise NotImplementedError - - def exec(self): - """ - Starts the Synchronous gRPC server on the given UNIX socket with given max threads. - """ - # Get the servicer instance based on the server type - map_servicer = self.get_servicer( - mapper_instance=self.mapper_instance, server_type=self.server_type - ) - _LOGGER.info( - "Sync GRPC Server listening on: %s with max threads: %s", - self.sock_path, - self.max_threads, - ) - # Start the server - sync_server_start( - servicer=map_servicer, - bind_address=self.sock_path, - max_threads=self.max_threads, - server_options=self._server_options, - udf_type=UDFType.Map, - ) - - def exec_multiproc(self): - """ - Starts the multirpoc gRPC server on the given UNIX socket with - given max threads. 
- """ - - # Get the servicer instance based on the server type - map_servicer = self.get_servicer( - mapper_instance=self.mapper_instance, server_type=self.server_type - ) - - # Start the multirpoc server - start_multiproc_server( - max_threads=self.max_threads, - servicer=map_servicer, - process_count=self._process_count, - server_options=self._server_options, - udf_type=UDFType.Map, - ) - - async def aexec(self) -> None: - """ - Starts the Async gRPC server on the given UNIX socket with - given max threads. - """ - - # As the server is async, we need to create a new server instance in the - # same thread as the event loop so that all the async calls are made in the - # same context - # Create a new async server instance and add the servicer to it - server_new = grpc.aio.server() - server_new.add_insecure_port(self.sock_path) - map_servicer = self.get_servicer( - mapper_instance=self.mapper_instance, server_type=self.server_type - ) - map_pb2_grpc.add_MapServicer_to_server(map_servicer, server_new) - - # Start the async server - await start_async_server(server_new, self.sock_path, self.max_threads, self._server_options) - - def get_servicer(self, mapper_instance: MapCallable, server_type: ServerType): - """Returns the servicer instance based on the server type""" - if server_type == ServerType.Sync: - map_servicer = Mapper(handler=mapper_instance) - elif server_type == ServerType.Async: - map_servicer = AsyncMapper(handler=mapper_instance) - elif server_type == ServerType.Multiproc: - map_servicer = Mapper(handler=mapper_instance) - return map_servicer diff --git a/pynumaflow/mapper/multiproc_server.py b/pynumaflow/mapper/multiproc_server.py new file mode 100644 index 00000000..8935e55b --- /dev/null +++ b/pynumaflow/mapper/multiproc_server.py @@ -0,0 +1,77 @@ +import os + +from pynumaflow._constants import ( + MAX_THREADS, + MAX_MESSAGE_SIZE, + MAP_SOCK_PATH, + UDFType, +) +from pynumaflow.mapper._dtypes import MapSyncCallable +from pynumaflow.mapper.servicer.sync_servicer import SyncMapServicer +from pynumaflow.shared.server import ( + NumaflowServer, + start_multiproc_server, +) + + +class MapMultiprocServer(NumaflowServer): + """ + Create a new grpc Multiproc Map Server instance. + """ + + def __init__( + self, + mapper_instance: MapSyncCallable, + server_count: int = os.cpu_count(), + sock_path=MAP_SOCK_PATH, + max_message_size=MAX_MESSAGE_SIZE, + max_threads=MAX_THREADS, + ): + """ + Create a new grpc Multiproc Map Server instance. + A new servicer instance is created and attached to the server. + The server instance is returned. 
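+        Example (a minimal sketch; `PrimeMap` stands in for any Mapper
+        subclass, as in examples/map/multiproc_map/example.py):
+
+            grpc_server = MapMultiprocServer(mapper_instance=PrimeMap(), server_count=2)
+            grpc_server.start()
+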
+        Args:
+            mapper_instance: The mapper instance to be used for Map UDF
+            server_count: The number of grpc server instances to be forked for multiproc
+            sock_path: The UNIX socket path to be used for the server
+            max_message_size: The max message size in bytes the server can receive and send
+            max_threads: The max number of threads to be spawned;
+                defaults to number of processors x4
+        """
+        self.sock_path = f"unix://{sock_path}"
+        self.max_threads = min(max_threads, int(os.getenv("MAX_THREADS", "4")))
+        self.max_message_size = max_message_size
+
+        self.mapper_instance = mapper_instance
+
+        self._server_options = [
+            ("grpc.max_send_message_length", self.max_message_size),
+            ("grpc.max_receive_message_length", self.max_message_size),
+            ("grpc.so_reuseport", 1),
+            ("grpc.so_reuseaddr", 1),
+        ]
+        # Set the number of processes to be spawned to the number of CPUs or
+        # the value of the env var NUM_CPU_MULTIPROC defined by the user
+        # Setting the max value to 2 * CPU count
+        # Used for multiproc server
+        self._process_count = min(server_count, 2 * os.cpu_count())
+        self.servicer = SyncMapServicer(handler=mapper_instance)
+
+    def start(self) -> None:
+        """
+        Starts N gRPC server instances on the given UNIX socket
+        with the given max threads, where N = server_count,
+        which defaults to the number of CPUs.
+        The max value of N is capped at 2 * CPU count.
+        """
+
+        # Start the multiproc server
+        start_multiproc_server(
+            max_threads=self.max_threads,
+            servicer=self.servicer,
+            process_count=self._process_count,
+            server_options=self._server_options,
+            udf_type=UDFType.Map,
+        )
diff --git a/pynumaflow/mapper/servicer/__init__.py b/pynumaflow/mapper/servicer/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/pynumaflow/mapper/servicer/async_servicer.py b/pynumaflow/mapper/servicer/async_servicer.py
new file mode 100644
index 00000000..9b076cce
--- /dev/null
+++ b/pynumaflow/mapper/servicer/async_servicer.py
@@ -0,0 +1,72 @@
+import grpc
+from google.protobuf import empty_pb2 as _empty_pb2
+
+from pynumaflow.mapper._dtypes import Datum
+from pynumaflow.mapper._dtypes import MapAsyncHandlerCallable, MapAsyncCallable
+from pynumaflow.proto.mapper import map_pb2, map_pb2_grpc
+from pynumaflow.types import NumaflowServicerContext
+from pynumaflow._constants import _LOGGER
+
+
+class AsyncMapServicer(map_pb2_grpc.MapServicer):
+    """
+    This class is used to create a new grpc Async Map Servicer instance.
+    It implements the MapServicer interface from the proto map.proto file.
+    Provides the functionality for the required rpc methods.
+    """
+
+    def __init__(
+        self,
+        handler: MapAsyncHandlerCallable,
+    ):
+        self.__map_handler: MapAsyncCallable = handler
+
+    async def MapFn(
+        self, request: map_pb2.MapRequest, context: NumaflowServicerContext
+    ) -> map_pb2.MapResponse:
+        """
+        Applies a function to each datum element.
+        The pascal case function name comes from the proto map_pb2_grpc.py file.
+        """
+        # proto repeated field(keys) is of type google._upb._message.RepeatedScalarContainer
+        # we need to explicitly convert it to list
+        try:
+            res = await self.__invoke_map(
+                list(request.keys),
+                Datum(
+                    keys=list(request.keys),
+                    value=request.value,
+                    event_time=request.event_time.ToDatetime(),
+                    watermark=request.watermark.ToDatetime(),
+                ),
+            )
+        except Exception as e:
+            context.set_code(grpc.StatusCode.UNKNOWN)
+            context.set_details(str(e))
+            return map_pb2.MapResponse(results=[])
+
+        return map_pb2.MapResponse(results=res)
+
+    async def __invoke_map(self, keys: list[str], req: Datum):
+        """
+        Invokes the user defined function.
+        """
+        try:
+            msgs = await self.__map_handler(keys, req)
+        except Exception as err:
+            _LOGGER.critical("UDFError, re-raising the error", exc_info=True)
+            raise err
+        datums = []
+        for msg in msgs:
+            datums.append(map_pb2.MapResponse.Result(keys=msg.keys, value=msg.value, tags=msg.tags))
+
+        return datums
+
+    async def IsReady(
+        self, request: _empty_pb2.Empty, context: NumaflowServicerContext
+    ) -> map_pb2.ReadyResponse:
+        """
+        IsReady is the heartbeat endpoint for gRPC.
+        The pascal case function name comes from the proto map_pb2_grpc.py file.
+        """
+        return map_pb2.ReadyResponse(ready=True)
diff --git a/pynumaflow/mapper/server.py b/pynumaflow/mapper/servicer/sync_servicer.py
similarity index 75%
rename from pynumaflow/mapper/server.py
rename to pynumaflow/mapper/servicer/sync_servicer.py
index 88647dbb..b01690d0 100644
--- a/pynumaflow/mapper/server.py
+++ b/pynumaflow/mapper/servicer/sync_servicer.py
@@ -1,23 +1,23 @@
 from google.protobuf import empty_pb2 as _empty_pb2
 
-from pynumaflow.mapper._dtypes import MapCallable
+from pynumaflow.mapper._dtypes import MapSyncCallable
 from pynumaflow.proto.mapper import map_pb2, map_pb2_grpc
-from pynumaflow.mapper.utils import _map_fn_util
+from pynumaflow.mapper.servicer.utils import _map_fn_util
 from pynumaflow.types import NumaflowServicerContext
 
 
-class Mapper(map_pb2_grpc.MapServicer):
+class SyncMapServicer(map_pb2_grpc.MapServicer):
     """
     This class is used to create a new grpc Map Servicer instance.
-    It implements the MapServicer interface from the proto map.proto file.
+    It implements the MapServicer interface defined in the proto map.proto file.
     Provides the functionality for the required rpc methods.
""" def __init__( self, - handler: MapCallable, + handler: MapSyncCallable, ): - self.__map_handler: MapCallable = handler + self.__map_handler: MapSyncCallable = handler def MapFn( self, request: map_pb2.MapRequest, context: NumaflowServicerContext diff --git a/pynumaflow/mapper/utils.py b/pynumaflow/mapper/servicer/utils.py similarity index 87% rename from pynumaflow/mapper/utils.py rename to pynumaflow/mapper/servicer/utils.py index 8e764cb9..c0c26185 100644 --- a/pynumaflow/mapper/utils.py +++ b/pynumaflow/mapper/servicer/utils.py @@ -1,5 +1,5 @@ import grpc -from pynumaflow.mapper._dtypes import MapCallable +from pynumaflow.mapper._dtypes import MapSyncCallable from pynumaflow.mapper._dtypes import Datum from pynumaflow.proto.mapper import map_pb2 @@ -8,7 +8,7 @@ def _map_fn_util( - __map_handler: MapCallable, request: map_pb2.MapRequest, context: NumaflowServicerContext + __map_handler: MapSyncCallable, request: map_pb2.MapRequest, context: NumaflowServicerContext ) -> map_pb2.MapResponse: # proto repeated field(keys) is of type google._upb._message.RepeatedScalarContainer # we need to explicitly convert it to list diff --git a/pynumaflow/mapper/sync_server.py b/pynumaflow/mapper/sync_server.py new file mode 100644 index 00000000..957b9784 --- /dev/null +++ b/pynumaflow/mapper/sync_server.py @@ -0,0 +1,73 @@ +import os + + +from pynumaflow.mapper.servicer.sync_servicer import SyncMapServicer + +from pynumaflow._constants import ( + MAX_THREADS, + MAX_MESSAGE_SIZE, + _LOGGER, + MAP_SOCK_PATH, + UDFType, +) + +from pynumaflow.mapper._dtypes import MapSyncCallable +from pynumaflow.shared.server import ( + NumaflowServer, + sync_server_start, +) + + +class MapServer(NumaflowServer): + """ + Create a new grpc Map Server instance. + """ + + def __init__( + self, + mapper_instance: MapSyncCallable, + sock_path=MAP_SOCK_PATH, + max_message_size=MAX_MESSAGE_SIZE, + max_threads=MAX_THREADS, + ): + """ + Create a new grpc Synchronous Map Server instance. + A new servicer instance is created and attached to the server. + The server instance is returned. + Args: + mapper_instance: The mapper instance to be used for Map UDF + sock_path: The UNIX socket path to be used for the server + max_message_size: The max message size in bytes the server can receive and send + max_threads: The max number of threads to be spawned; + defaults to number of processors x4 + """ + self.sock_path = f"unix://{sock_path}" + self.max_threads = min(max_threads, int(os.getenv("MAX_THREADS", "4"))) + self.max_message_size = max_message_size + + self.mapper_instance = mapper_instance + + self._server_options = [ + ("grpc.max_send_message_length", self.max_message_size), + ("grpc.max_receive_message_length", self.max_message_size), + ] + # Get the servicer instance for the sync server + self.servicer = SyncMapServicer(handler=mapper_instance) + + def start(self) -> None: + """ + Starts the Synchronous gRPC server on the given UNIX socket with given max threads. 
+        """
+        _LOGGER.info(
+            "Sync GRPC Server listening on: %s with max threads: %s",
+            self.sock_path,
+            self.max_threads,
+        )
+        # Start the server
+        sync_server_start(
+            servicer=self.servicer,
+            bind_address=self.sock_path,
+            max_threads=self.max_threads,
+            server_options=self._server_options,
+            udf_type=UDFType.Map,
+        )
diff --git a/pynumaflow/mapstreamer/_dtypes.py b/pynumaflow/mapstreamer/_dtypes.py
index 25b7e444..03e6a7f6 100644
--- a/pynumaflow/mapstreamer/_dtypes.py
+++ b/pynumaflow/mapstreamer/_dtypes.py
@@ -182,7 +182,7 @@ def __call__(self, *args, **kwargs):
     @abstractmethod
     async def handler(self, keys: list[str], datum: Datum) -> AsyncIterable[Message]:
         """
-        Write a handler function which implements the MapCallable interface.
+        Write a handler function which implements the MapStreamCallable interface.
         """
         pass
 
diff --git a/pynumaflow/mapstreamer/async_server.py b/pynumaflow/mapstreamer/async_server.py
index 7f9978bc..f67f37cb 100644
--- a/pynumaflow/mapstreamer/async_server.py
+++ b/pynumaflow/mapstreamer/async_server.py
@@ -18,7 +18,7 @@
 class AsyncMapStreamer(mapstream_pb2_grpc.MapStreamServicer):
     """
     This class is used to create a new grpc Map Stream Servicer instance.
-    It implements the MapServicer interface from the proto
+    It implements the MapStreamServicer interface from the proto
     mapstream_pb2_grpc.py file.
     Provides the functionality for the required rpc methods.
     """
diff --git a/pynumaflow/reducer/async_server.py b/pynumaflow/reducer/async_server.py
index e83a8106..f6210e38 100644
--- a/pynumaflow/reducer/async_server.py
+++ b/pynumaflow/reducer/async_server.py
@@ -36,7 +36,7 @@ async def datum_generator(
 class AsyncReducer(reduce_pb2_grpc.ReduceServicer):
     """
     This class is used to create a new grpc Reduce servicer instance.
-    It implements the MapServicer interface from the proto reduce.proto file.
+    It implements the ReduceServicer interface from the proto reduce.proto file.
     Provides the functionality for the required rpc methods.
""" diff --git a/tests/map/test_async_mapper.py b/tests/map/test_async_mapper.py index d50da394..1d3fa7f2 100644 --- a/tests/map/test_async_mapper.py +++ b/tests/map/test_async_mapper.py @@ -7,15 +7,14 @@ from google.protobuf import empty_pb2 as _empty_pb2 from google.protobuf import timestamp_pb2 as _timestamp_pb2 from grpc.aio._server import Server -from pynumaflow._constants import ServerType from pynumaflow import setup_logging from pynumaflow.mapper import ( Datum, Messages, Message, - MapServer, ) +from pynumaflow.mapper.async_server import MapAsyncServer from pynumaflow.proto.mapper import map_pb2, map_pb2_grpc from tests.testing_utils import ( mock_event_time, @@ -63,8 +62,8 @@ def startup_callable(loop): def new_async_mapper(): - server = MapServer(mapper_instance=async_map_handler, server_type=ServerType.Async) - udfs = server.get_servicer(mapper_instance=async_map_handler, server_type=ServerType.Async) + server = MapAsyncServer(mapper_instance=async_map_handler) + udfs = server.servicer return udfs @@ -212,6 +211,10 @@ def test_is_ready(self) -> None: self.assertTrue(response.ready) + def test_invalid_input(self): + with self.assertRaises(TypeError): + MapAsyncServer() + def __stub(self): return map_pb2_grpc.MapStub(_channel) diff --git a/tests/map/test_messages.py b/tests/map/test_messages.py index eab2f01a..b2edbad7 100644 --- a/tests/map/test_messages.py +++ b/tests/map/test_messages.py @@ -1,6 +1,6 @@ import unittest -from pynumaflow.mapper import Messages, Message, DROP, MapperClass, Datum +from pynumaflow.mapper import Messages, Message, DROP, Mapper, Datum from tests.testing_utils import mock_message @@ -90,7 +90,7 @@ def test_err(self): msgts[:1] -class ExampleMapper(MapperClass): +class ExampleMapper(Mapper): def handler(self, keys: list[str], datum: Datum) -> Messages: messages = Messages() messages.append(Message(mock_message(), keys=keys)) diff --git a/tests/map/test_multiproc_mapper.py b/tests/map/test_multiproc_mapper.py index 62cbdfdb..71e090dc 100644 --- a/tests/map/test_multiproc_mapper.py +++ b/tests/map/test_multiproc_mapper.py @@ -1,16 +1,14 @@ import os import unittest from unittest import mock -from unittest.mock import patch, Mock import grpc from google.protobuf import empty_pb2 as _empty_pb2 from google.protobuf import timestamp_pb2 as _timestamp_pb2 from grpc import StatusCode from grpc_testing import server_from_dictionary, strict_real_time -from pynumaflow._constants import ServerType -from pynumaflow.mapper import MapServer +from pynumaflow.mapper import MapMultiprocServer from pynumaflow.proto.mapper import map_pb2 from tests.map.utils import map_handler, err_map_handler from tests.testing_utils import ( @@ -26,35 +24,27 @@ def mockenv(**envvars): class TestMultiProcMethods(unittest.TestCase): def setUp(self) -> None: - my_server = MapServer(mapper_instance=map_handler) - my_servicer = my_server.get_servicer( - mapper_instance=map_handler, server_type=ServerType.Multiproc - ) - services = {map_pb2.DESCRIPTOR.services_by_name["Map"]: my_servicer} + my_server = MapMultiprocServer(mapper_instance=map_handler) + services = {map_pb2.DESCRIPTOR.services_by_name["Map"]: my_server.servicer} self.test_server = server_from_dictionary(services, strict_real_time()) - @mockenv(NUM_CPU_MULTIPROC="3") def test_multiproc_init(self) -> None: - my_server = MapServer(mapper_instance=map_handler, server_type=ServerType.Multiproc) + my_server = MapMultiprocServer(mapper_instance=map_handler, server_count=3) self.assertEqual(my_server._process_count, 3) - 
@patch("os.cpu_count", Mock(return_value=4)) def test_multiproc_process_count(self) -> None: - my_server = MapServer(mapper_instance=map_handler, server_type=ServerType.Multiproc) - self.assertEqual(my_server._process_count, 4) + my_server = MapMultiprocServer(mapper_instance=map_handler) + self.assertEqual(my_server._process_count, 8) - @patch("os.cpu_count", Mock(return_value=4)) - @mockenv(NUM_CPU_MULTIPROC="10") def test_max_process_count(self) -> None: - server = MapServer(mapper_instance=map_handler, server_type=ServerType.Multiproc) - self.assertEqual(server._process_count, 8) + """Max process count is capped at 2 * os.cpu_count, irrespective of what the user + provides as input""" + server = MapMultiprocServer(mapper_instance=map_handler, server_count=20) + self.assertEqual(server._process_count, 16) def test_udf_map_err(self): - my_server = MapServer(mapper_instance=err_map_handler) - my_servicer = my_server.get_servicer( - mapper_instance=my_server.mapper_instance, server_type=ServerType.Multiproc - ) - services = {map_pb2.DESCRIPTOR.services_by_name["Map"]: my_servicer} + my_server = MapMultiprocServer(mapper_instance=err_map_handler) + services = {map_pb2.DESCRIPTOR.services_by_name["Map"]: my_server.servicer} self.test_server = server_from_dictionary(services, strict_real_time()) event_time_timestamp = _timestamp_pb2.Timestamp() @@ -131,7 +121,7 @@ def test_map_forward_message(self): def test_invalid_input(self): with self.assertRaises(TypeError): - MapServer(server_type=ServerType.Multiproc) + MapMultiprocServer() if __name__ == "__main__": diff --git a/tests/map/test_sync_mapper.py b/tests/map/test_sync_mapper.py index afe0b6a5..1489ae47 100644 --- a/tests/map/test_sync_mapper.py +++ b/tests/map/test_sync_mapper.py @@ -5,7 +5,6 @@ from google.protobuf import timestamp_pb2 as _timestamp_pb2 from grpc import StatusCode from grpc_testing import server_from_dictionary, strict_real_time -from pynumaflow._constants import ServerType from pynumaflow.mapper import MapServer from pynumaflow.proto.mapper import map_pb2 @@ -21,11 +20,11 @@ class TestSyncMapper(unittest.TestCase): def setUp(self) -> None: class_instance = ExampleMap() my_server = MapServer(mapper_instance=class_instance) - my_servicer = my_server.get_servicer( - mapper_instance=map_handler, server_type=ServerType.Sync - ) + # my_servicer = my_server.get_servicer( + # mapper_instance=map_handler, server_type=ServerType.Sync + # ) - services = {map_pb2.DESCRIPTOR.services_by_name["Map"]: my_servicer} + services = {map_pb2.DESCRIPTOR.services_by_name["Map"]: my_server.servicer} self.test_server = server_from_dictionary(services, strict_real_time()) def test_init_with_args(self) -> None: @@ -39,10 +38,7 @@ def test_init_with_args(self) -> None: def test_udf_map_err(self): my_server = MapServer(mapper_instance=err_map_handler) - my_servicer = my_server.get_servicer( - mapper_instance=my_server.mapper_instance, server_type=ServerType.Sync - ) - services = {map_pb2.DESCRIPTOR.services_by_name["Map"]: my_servicer} + services = {map_pb2.DESCRIPTOR.services_by_name["Map"]: my_server.servicer} self.test_server = server_from_dictionary(services, strict_real_time()) event_time_timestamp = _timestamp_pb2.Timestamp() @@ -69,10 +65,7 @@ def test_udf_map_err(self): def test_udf_map_error_response(self): my_server = MapServer(mapper_instance=err_map_handler) - my_servicer = my_server.get_servicer( - mapper_instance=my_server.mapper_instance, server_type=my_server.server_type - ) - services = {map_pb2.DESCRIPTOR.services_by_name["Map"]: 
my_servicer}
+        services = {map_pb2.DESCRIPTOR.services_by_name["Map"]: my_server.servicer}
         self.test_server = server_from_dictionary(services, strict_real_time())
 
         event_time_timestamp = _timestamp_pb2.Timestamp()
@@ -150,8 +143,6 @@ def test_map_forward_message(self):
     def test_invalid_input(self):
         with self.assertRaises(TypeError):
             MapServer()
-        with self.assertRaises(NotImplementedError):
-            MapServer(mapper_instance=map_handler, server_type="ERORR").start()
 
 
 if __name__ == "__main__":
diff --git a/tests/map/utils.py b/tests/map/utils.py
index fb7a36be..6cecd503 100644
--- a/tests/map/utils.py
+++ b/tests/map/utils.py
@@ -1,11 +1,11 @@
-from pynumaflow.mapper import Datum, Messages, Message, MapperClass
+from pynumaflow.mapper import Datum, Messages, Message, Mapper
 
 
 async def async_map_error_fn(keys: list[str], datum: Datum) -> Messages:
     raise ValueError("error invoking map")
 
 
-class ExampleMap(MapperClass):
+class ExampleMap(Mapper):
     def handler(self, keys: list[str], datum: Datum) -> Messages:
         val = datum.value
         msg = "payload:{} event_time:{} watermark:{}".format(

From fe6ec20cfa6c0120596db91b4f9a862c0ef913df Mon Sep 17 00:00:00 2001
From: Sidhant Kohli
Date: Tue, 16 Jan 2024 15:30:55 -0800
Subject: [PATCH 56/78] separate mappers

Signed-off-by: Sidhant Kohli
---
 pynumaflow/_constants.py              | 3 +--
 pynumaflow/mapper/multiproc_server.py | 5 +++--
 tests/map/test_multiproc_mapper.py    | 6 ++++--
 3 files changed, 8 insertions(+), 6 deletions(-)

diff --git a/pynumaflow/_constants.py b/pynumaflow/_constants.py
index 883e8731..137c738d 100644
--- a/pynumaflow/_constants.py
+++ b/pynumaflow/_constants.py
@@ -1,5 +1,4 @@
 import logging
-import multiprocessing
 import os
 from enum import Enum
 
@@ -26,7 +25,7 @@
 DELIMITER = ":"
 DROP = "U+005C__DROP__"
 
-_PROCESS_COUNT = multiprocessing.cpu_count()
+_PROCESS_COUNT = os.cpu_count()
 MAX_THREADS = int(os.getenv("MAX_THREADS", "4"))
 
 _LOGGER = setup_logging(__name__)
diff --git a/pynumaflow/mapper/multiproc_server.py b/pynumaflow/mapper/multiproc_server.py
index 8935e55b..b76dbd3d 100644
--- a/pynumaflow/mapper/multiproc_server.py
+++ b/pynumaflow/mapper/multiproc_server.py
@@ -5,6 +5,7 @@
     MAX_MESSAGE_SIZE,
     MAP_SOCK_PATH,
     UDFType,
+    _PROCESS_COUNT,
 )
 from pynumaflow.mapper._dtypes import MapSyncCallable
 from pynumaflow.mapper.servicer.sync_servicer import SyncMapServicer
@@ -22,7 +23,7 @@ class MapMultiprocServer(NumaflowServer):
     def __init__(
         self,
         mapper_instance: MapSyncCallable,
-        server_count: int = os.cpu_count(),
+        server_count: int = _PROCESS_COUNT,
         sock_path=MAP_SOCK_PATH,
         max_message_size=MAX_MESSAGE_SIZE,
         max_threads=MAX_THREADS,
@@ -55,7 +56,7 @@ def __init__(
         # the value of the env var NUM_CPU_MULTIPROC defined by the user
         # Setting the max value to 2 * CPU count
         # Used for multiproc server
-        self._process_count = min(server_count, 2 * os.cpu_count())
+        self._process_count = min(server_count, 2 * _PROCESS_COUNT)
         self.servicer = SyncMapServicer(handler=mapper_instance)
 
     def start(self) -> None:
diff --git a/tests/map/test_multiproc_mapper.py b/tests/map/test_multiproc_mapper.py
index 71e090dc..ccb10b5a 100644
--- a/tests/map/test_multiproc_mapper.py
+++ b/tests/map/test_multiproc_mapper.py
@@ -33,14 +33,16 @@ def test_multiproc_init(self) -> None:
         self.assertEqual(my_server._process_count, 3)
 
     def test_multiproc_process_count(self) -> None:
+        default_val = os.cpu_count()
         my_server = MapMultiprocServer(mapper_instance=map_handler)
-        self.assertEqual(my_server._process_count, 8)
+        self.assertEqual(my_server._process_count, default_val)
 
     def test_max_process_count(self) -> None:
         """Max process count is capped at 2 * os.cpu_count, irrespective of what the user
         provides as input"""
+        default_val = os.cpu_count()
         server = MapMultiprocServer(mapper_instance=map_handler, server_count=20)
-        self.assertEqual(server._process_count, 16)
+        self.assertEqual(server._process_count, default_val * 2)

From 0575d2d51e7c6a1587d7f16a613fdd1b51541549 Mon Sep 17 00:00:00 2001
From: Sidhant Kohli
Date: Tue, 16 Jan 2024 15:55:30 -0800
Subject: [PATCH 57/78] separate mapstream

Signed-off-by: Sidhant Kohli
---
 pynumaflow/mapstreamer/__init__.py           |   8 +-
 pynumaflow/mapstreamer/_dtypes.py            |   4 +-
 pynumaflow/mapstreamer/async_server.py       | 108 ++++++++++--------
 pynumaflow/mapstreamer/mapstream.py          | 103 -----------------
 pynumaflow/mapstreamer/servicer/__init__.py  |   0
 .../mapstreamer/servicer/async_servicer.py   |  65 +++++++++++
 tests/mapstream/test_async_map_stream.py     |   8 +-
 tests/mapstream/test_async_map_stream_err.py |  15 +--
 8 files changed, 137 insertions(+), 174 deletions(-)
 delete mode 100644 pynumaflow/mapstreamer/mapstream.py
 create mode 100644 pynumaflow/mapstreamer/servicer/__init__.py
 create mode 100644 pynumaflow/mapstreamer/servicer/async_servicer.py

diff --git a/pynumaflow/mapstreamer/__init__.py b/pynumaflow/mapstreamer/__init__.py
index be47226b..f26f4bd4 100644
--- a/pynumaflow/mapstreamer/__init__.py
+++ b/pynumaflow/mapstreamer/__init__.py
@@ -1,13 +1,13 @@
 from pynumaflow._constants import DROP
 
-from pynumaflow.mapstreamer._dtypes import Message, Messages, Datum, MapStreamerClass
-from pynumaflow.mapstreamer.mapstream import MapStreamServer
+from pynumaflow.mapstreamer._dtypes import Message, Messages, Datum, MapStreamer
+from pynumaflow.mapstreamer.async_server import MapStreamAsyncServer
 
 __all__ = [
     "Message",
     "Messages",
     "Datum",
     "DROP",
-    "MapStreamServer",
-    "MapStreamerClass",
+    "MapStreamAsyncServer",
+    "MapStreamer",
 ]
diff --git a/pynumaflow/mapstreamer/_dtypes.py b/pynumaflow/mapstreamer/_dtypes.py
index 03e6a7f6..a5363370 100644
--- a/pynumaflow/mapstreamer/_dtypes.py
+++ b/pynumaflow/mapstreamer/_dtypes.py
@@ -164,7 +164,7 @@ def watermark(self) -> datetime:
         return self._watermark
 
 
-class MapStreamerClass(metaclass=ABCMeta):
+class MapStreamer(metaclass=ABCMeta):
     """
     Provides an interface to write a Map Streamer
     which will be exposed over a gRPC server.
@@ -188,4 +188,4 @@ async def handler(self, keys: list[str], datum: Datum) -> AsyncIterable[Message]
 
 MapStreamAsyncCallable = Callable[[list[str], Datum], AsyncIterable[Message]]
 
-MapStreamCallable = Union[MapStreamerClass, MapStreamAsyncCallable]
+MapStreamCallable = Union[MapStreamer, MapStreamAsyncCallable]
diff --git a/pynumaflow/mapstreamer/async_server.py b/pynumaflow/mapstreamer/async_server.py
index f67f37cb..415b8399 100644
--- a/pynumaflow/mapstreamer/async_server.py
+++ b/pynumaflow/mapstreamer/async_server.py
@@ -1,70 +1,80 @@
-import multiprocessing
 import os
-from collections.abc import AsyncIterable
 
+import aiorun
+import grpc
 
-from google.protobuf import empty_pb2 as _empty_pb2
+from pynumaflow.mapstreamer.servicer.async_servicer import AsyncMapStreamServicer
+from pynumaflow.proto.mapstreamer import mapstream_pb2_grpc
+
+from pynumaflow._constants import (
+    MAP_STREAM_SOCK_PATH,
+    MAX_MESSAGE_SIZE,
+    MAX_THREADS,
+    _LOGGER,
+)
 
-from pynumaflow.mapstreamer import Datum
 from pynumaflow.mapstreamer._dtypes import MapStreamCallable
-from pynumaflow.proto.mapstreamer import mapstream_pb2_grpc, mapstream_pb2
-from pynumaflow.types import NumaflowServicerContext
-from pynumaflow._constants import _LOGGER
 
-_PROCESS_COUNT = multiprocessing.cpu_count()
-MAX_THREADS = int(os.getenv("MAX_THREADS", 0)) or (_PROCESS_COUNT * 4)
+from pynumaflow.shared.server import NumaflowServer, start_async_server
 
 
-class AsyncMapStreamer(mapstream_pb2_grpc.MapStreamServicer):
+class MapStreamAsyncServer(NumaflowServer):
     """
-    This class is used to create a new grpc Map Stream Servicer instance.
-    It implements the MapStreamServicer interface from the proto
-    mapstream_pb2_grpc.py file.
-    Provides the functionality for the required rpc methods.
+    Class for a new Map Stream Server instance.
     """
 
     def __init__(
         self,
-        handler: MapStreamCallable,
+        map_stream_instance: MapStreamCallable,
+        sock_path=MAP_STREAM_SOCK_PATH,
+        max_message_size=MAX_MESSAGE_SIZE,
+        max_threads=MAX_THREADS,
    ):
-        self.__map_stream_handler: MapStreamCallable = handler
-
-    async def MapStreamFn(
-        self,
-        request: mapstream_pb2.MapStreamRequest,
-        context: NumaflowServicerContext,
-    ) -> AsyncIterable[mapstream_pb2.MapStreamResponse]:
        """
-        Applies a map function to a datum stream in streaming mode.
-        The pascal case function name comes from the proto mapstream_pb2_grpc.py file.
+        Create a new grpc Async Map Stream Server instance.
+        A new servicer instance is created and attached to the server.
+        The server instance is returned.
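+        Example (a minimal sketch; `my_stream_handler` stands in for any
+        async generator handler implementing MapStreamCallable):
+
+            grpc_server = MapStreamAsyncServer(map_stream_instance=my_stream_handler)
+            grpc_server.start()
+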
+ Args: + map_stream_instance: The map stream instance to be used for Map Stream UDF + sock_path: The UNIX socket path to be used for the server + max_message_size: The max message size in bytes the server can receive and send + max_threads: The max number of threads to be spawned; + defaults to number of processors x4 + server_type: The type of server to be used """ + self.map_stream_instance: MapStreamCallable = map_stream_instance + self.sock_path = f"unix://{sock_path}" + self.max_threads = min(max_threads, int(os.getenv("MAX_THREADS", "4"))) + self.max_message_size = max_message_size - async for res in self.__invoke_map_stream( - list(request.keys), - Datum( - keys=list(request.keys), - value=request.value, - event_time=request.event_time.ToDatetime(), - watermark=request.watermark.ToDatetime(), - ), - ): - yield mapstream_pb2.MapStreamResponse(result=res) + self._server_options = [ + ("grpc.max_send_message_length", self.max_message_size), + ("grpc.max_receive_message_length", self.max_message_size), + ] - async def __invoke_map_stream(self, keys: list[str], req: Datum): - try: - async for msg in self.__map_stream_handler(keys, req): - yield mapstream_pb2.MapStreamResponse.Result( - keys=msg.keys, value=msg.value, tags=msg.tags - ) - except Exception as err: - _LOGGER.critical("UDFError, re-raising the error", exc_info=True) - raise err + self.servicer = AsyncMapStreamServicer(handler=self.map_stream_instance) + + def start(self): + """ + Starter function for the Async Map Stream server, we need a separate caller + to the aexec so that all the async coroutines can be started from a single context + """ + aiorun.run(self.aexec(), use_uvloop=True) - async def IsReady( - self, request: _empty_pb2.Empty, context: NumaflowServicerContext - ) -> mapstream_pb2.ReadyResponse: + async def aexec(self): """ - IsReady is the heartbeat endpoint for gRPC. - The pascal case function name comes from the proto mapstream_pb2_grpc.py file. + Starts the Async gRPC server on the given UNIX socket with + given max threads. """ - return mapstream_pb2.ReadyResponse(ready=True) + # As the server is async, we need to create a new server instance in the + # same thread as the event loop so that all the async calls are made in the + # same context + # Create a new async server instance and add the servicer to it + server = grpc.aio.server() + server.add_insecure_port(self.sock_path) + mapstream_pb2_grpc.add_MapStreamServicer_to_server( + self.servicer, + server, + ) + _LOGGER.info("Starting Map Stream Server") + await start_async_server(server, self.sock_path, self.max_threads, self._server_options) diff --git a/pynumaflow/mapstreamer/mapstream.py b/pynumaflow/mapstreamer/mapstream.py deleted file mode 100644 index c7fb47eb..00000000 --- a/pynumaflow/mapstreamer/mapstream.py +++ /dev/null @@ -1,103 +0,0 @@ -import os - -import aiorun -import grpc - -from pynumaflow.mapstreamer.async_server import AsyncMapStreamer -from pynumaflow.proto.mapstreamer import mapstream_pb2_grpc - -from pynumaflow._constants import ( - MAP_STREAM_SOCK_PATH, - MAX_MESSAGE_SIZE, - MAX_THREADS, - ServerType, - _LOGGER, -) - -from pynumaflow.mapstreamer._dtypes import MapStreamCallable - -from pynumaflow.shared.server import NumaflowServer, start_async_server - - -class MapStreamServer(NumaflowServer): - """ - Class for a new Map Stream Server instance. 
- """ - - def __init__( - self, - map_stream_instance: MapStreamCallable, - sock_path=MAP_STREAM_SOCK_PATH, - max_message_size=MAX_MESSAGE_SIZE, - max_threads=MAX_THREADS, - server_type=ServerType.Async, - ): - """ - Create a new grpc Map Stream Server instance. - A new servicer instance is created and attached to the server. - The server instance is returned. - Args: - map_stream_instance: The map stream instance to be used for Map Stream UDF - sock_path: The UNIX socket path to be used for the server - max_message_size: The max message size in bytes the server can receive and send - max_threads: The max number of threads to be spawned; - defaults to number of processors x4 - server_type: The type of server to be used - """ - self.map_stream_instance: MapStreamCallable = map_stream_instance - self.sock_path = f"unix://{sock_path}" - self.max_message_size = max_message_size - self.max_threads = min(max_threads, int(os.getenv("MAX_THREADS", "4"))) - self.server_type = server_type - - self._server_options = [ - ("grpc.max_send_message_length", self.max_message_size), - ("grpc.max_receive_message_length", self.max_message_size), - ] - - def start(self): - """ - Starter function for the Map Stream server, Handles the server type and - starts the server accordingly. If the server type is not supported, - raises NotImplementedError. - Currently supported server types are: - - ServerType.Async: Asynchronous server - """ - if self.server_type == ServerType.Async: - aiorun.run(self.aexec(), use_uvloop=True) - else: - _LOGGER.error("Server type not supported - %s", str(self.server_type)) - raise NotImplementedError - - async def aexec(self): - """ - Starts the Async gRPC server on the given UNIX socket with - given max threads. - """ - # As the server is async, we need to create a new server instance in the - # same thread as the event loop so that all the async calls are made in the - # same context - # Create a new async server instance and add the servicer to it - server = grpc.aio.server() - server.add_insecure_port(self.sock_path) - map_servicer = self.get_servicer( - map_stream_instance=self.map_stream_instance, server_type=self.server_type - ) - mapstream_pb2_grpc.add_MapStreamServicer_to_server( - map_servicer, - server, - ) - _LOGGER.info("Starting Map Stream Server") - await start_async_server(server, self.sock_path, self.max_threads, self._server_options) - - def get_servicer(self, map_stream_instance: MapStreamCallable, server_type: ServerType): - """ - Returns the servicer instance based on the server type. 
-        Currently supported server types are:
-            - ServerType.Async: Asynchronous server
-        """
-        if server_type == ServerType.Async:
-            map_servicer = AsyncMapStreamer(handler=map_stream_instance)
-        else:
-            raise NotImplementedError
-        return map_servicer
diff --git a/pynumaflow/mapstreamer/servicer/__init__.py b/pynumaflow/mapstreamer/servicer/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/pynumaflow/mapstreamer/servicer/async_servicer.py b/pynumaflow/mapstreamer/servicer/async_servicer.py
new file mode 100644
index 00000000..dc9b3ece
--- /dev/null
+++ b/pynumaflow/mapstreamer/servicer/async_servicer.py
@@ -0,0 +1,65 @@
+
+from collections.abc import AsyncIterable
+
+from google.protobuf import empty_pb2 as _empty_pb2
+
+from pynumaflow.mapstreamer import Datum
+from pynumaflow.mapstreamer._dtypes import MapStreamCallable
+from pynumaflow.proto.mapstreamer import mapstream_pb2_grpc, mapstream_pb2
+from pynumaflow.types import NumaflowServicerContext
+from pynumaflow._constants import _LOGGER
+
+
+class AsyncMapStreamServicer(mapstream_pb2_grpc.MapStreamServicer):
+    """
+    This class is used to create a new grpc Map Stream Servicer instance.
+    It implements the MapStreamServicer interface from the proto
+    mapstream_pb2_grpc.py file.
+    Provides the functionality for the required rpc methods.
+    """
+
+    def __init__(
+        self,
+        handler: MapStreamCallable,
+    ):
+        self.__map_stream_handler: MapStreamCallable = handler
+
+    async def MapStreamFn(
+        self,
+        request: mapstream_pb2.MapStreamRequest,
+        context: NumaflowServicerContext,
+    ) -> AsyncIterable[mapstream_pb2.MapStreamResponse]:
+        """
+        Applies a map function to a datum stream in streaming mode.
+        The pascal case function name comes from the proto mapstream_pb2_grpc.py file.
+        """
+
+        async for res in self.__invoke_map_stream(
+            list(request.keys),
+            Datum(
+                keys=list(request.keys),
+                value=request.value,
+                event_time=request.event_time.ToDatetime(),
+                watermark=request.watermark.ToDatetime(),
+            ),
+        ):
+            yield mapstream_pb2.MapStreamResponse(result=res)
+
+    async def __invoke_map_stream(self, keys: list[str], req: Datum):
+        try:
+            async for msg in self.__map_stream_handler(keys, req):
+                yield mapstream_pb2.MapStreamResponse.Result(
+                    keys=msg.keys, value=msg.value, tags=msg.tags
+                )
+        except Exception as err:
+            _LOGGER.critical("UDFError, re-raising the error", exc_info=True)
+            raise err
+
+    async def IsReady(
+        self, request: _empty_pb2.Empty, context: NumaflowServicerContext
+    ) -> mapstream_pb2.ReadyResponse:
+        """
+        IsReady is the heartbeat endpoint for gRPC.
+        The pascal case function name comes from the proto mapstream_pb2_grpc.py file.
+        """
+        return mapstream_pb2.ReadyResponse(ready=True)
diff --git a/tests/mapstream/test_async_map_stream.py b/tests/mapstream/test_async_map_stream.py
index dec4eb77..107289a6 100644
--- a/tests/mapstream/test_async_map_stream.py
+++ b/tests/mapstream/test_async_map_stream.py
@@ -12,7 +12,7 @@
 from pynumaflow.mapstreamer import (
     Message,
     Datum,
-    MapStreamServer,
+    MapStreamAsyncServer,
 )
 from pynumaflow.proto.mapstreamer import mapstream_pb2_grpc
 from tests.mapstream.utils import start_request_map_stream
@@ -47,10 +47,8 @@ def startup_callable(loop):
 def NewAsyncMapStreamer(
     map_stream_handler=async_map_stream_handler,
 ):
-    server = MapStreamServer(map_stream_instance=async_map_stream_handler)
-    udfs = server.get_servicer(
-        map_stream_instance=async_map_stream_handler, server_type=server.server_type
-    )
+    server = MapStreamAsyncServer(map_stream_instance=async_map_stream_handler)
+    udfs = server.servicer
     return udfs
 
diff --git a/tests/mapstream/test_async_map_stream_err.py b/tests/mapstream/test_async_map_stream_err.py
index 7e7d5890..feed2459 100644
--- a/tests/mapstream/test_async_map_stream_err.py
+++ b/tests/mapstream/test_async_map_stream_err.py
@@ -7,10 +7,9 @@
 import grpc
 from grpc.aio._server import Server
-from pynumaflow._constants import ServerType
 
 from pynumaflow import setup_logging
-from pynumaflow.mapstreamer import Message, Datum, MapStreamServer
+from pynumaflow.mapstreamer import Message, Datum, MapStreamAsyncServer
 from pynumaflow.proto.mapstreamer import mapstream_pb2_grpc
 from tests.mapstream.utils import start_request_map_stream
@@ -44,10 +43,8 @@ def startup_callable(loop):
 async def start_server():
     server = grpc.aio.server()
-    server_instance = MapStreamServer(map_stream_instance=err_async_map_stream_handler)
-    udfs = server_instance.get_servicer(
-        map_stream_instance=err_async_map_stream_handler, server_type=server_instance.server_type
-    )
+    server_instance = MapStreamAsyncServer(map_stream_instance=err_async_map_stream_handler)
+    udfs = server_instance.servicer
     mapstream_pb2_grpc.add_MapStreamServicer_to_server(udfs, server)
     listen_addr = "unix:///tmp/async_map_stream_err.sock"
     server.add_insecure_port(listen_addr)
@@ -104,11 +101,7 @@ def __stub(self):
 
     def test_invalid_input(self):
         with self.assertRaises(TypeError):
-            MapStreamServer(server_type=ServerType.Async)
-        with self.assertRaises(NotImplementedError):
-            MapStreamServer(
-                map_stream_instance=err_async_map_stream_handler, server_type="ERORR"
-            ).start()
+            MapStreamAsyncServer()
 
 
 if __name__ == "__main__":

From c1256f680dd17efabdb6e682fe74982825efc3d1 Mon Sep 17 00:00:00 2001
From: Sidhant Kohli
Date: Tue, 16 Jan 2024 16:07:45 -0800
Subject: [PATCH 58/78] separate reducer

Signed-off-by: Sidhant Kohli
---
 .../mapstreamer/servicer/async_servicer.py  |   1 -
 pynumaflow/reducer/__init__.py              |   9 +-
 pynumaflow/reducer/_dtypes.py               |   6 +-
 pynumaflow/reducer/async_server.py          | 200 +++++-------------
 pynumaflow/reducer/reduce.py                |  90 --------
 pynumaflow/reducer/servicer/__init__.py     |   0
 pynumaflow/reducer/servicer/async_server.py | 165 +++++++++++++++
 .../reducer/{ => servicer}/asynciter.py     |   0
 tests/reduce/test_async_reduce.py           |  16 +-
 9 files changed, 232 insertions(+), 255 deletions(-)
 delete mode 100644 pynumaflow/reducer/reduce.py
 create mode 100644 pynumaflow/reducer/servicer/__init__.py
 create mode 100644 pynumaflow/reducer/servicer/async_server.py

diff --git a/pynumaflow/mapstreamer/servicer/async_servicer.py b/pynumaflow/mapstreamer/servicer/async_servicer.py
index dc9b3ece..33c8bd7c 100644
--- a/pynumaflow/mapstreamer/servicer/async_servicer.py
+++ b/pynumaflow/mapstreamer/servicer/async_servicer.py
@@ -1,4 +1,3 @@
-
 from collections.abc import AsyncIterable
 
 from google.protobuf import empty_pb2 as _empty_pb2
diff --git a/pynumaflow/reducer/__init__.py b/pynumaflow/reducer/__init__.py
index 2c7df2fd..7a1c878b 100644
--- a/pynumaflow/reducer/__init__.py
+++ b/pynumaflow/reducer/__init__.py
@@ -5,8 +5,9 @@
     IntervalWindow,
     Metadata,
     DROP,
-    ReducerClass,
+    Reducer,
 )
+from pynumaflow.reducer.async_server import ReduceAsyncServer
 
 __all__ = [
     "Message",
@@ -15,8 +16,6 @@
     "IntervalWindow",
     "Metadata",
     "DROP",
-    "ReduceServer",
-    "ReducerClass",
+    "ReduceAsyncServer",
+    "Reducer",
 ]
-
-from pynumaflow.reducer.reduce import ReduceServer
diff --git a/pynumaflow/reducer/_dtypes.py b/pynumaflow/reducer/_dtypes.py
index 7cc1595b..31722608 100644
--- a/pynumaflow/reducer/_dtypes.py
+++ b/pynumaflow/reducer/_dtypes.py
@@ -7,7 +7,7 @@
 from collections.abc import AsyncIterable
 from warnings import warn
 
-from pynumaflow.reducer.asynciter import NonBlockingIterator
+from pynumaflow.reducer.servicer.asynciter import NonBlockingIterator
 from pynumaflow._constants import DROP
 
 M = TypeVar("M", bound="Message")
@@ -233,7 +233,7 @@ def keys(self) -> list[str]:
         return self._key
 
 
-class ReducerClass(metaclass=ABCMeta):
+class Reducer(metaclass=ABCMeta):
     """
     Provides an interface to write a Reducer
     which will be exposed over a gRPC server.
@@ -258,4 +258,4 @@ async def handler(
 ReduceAsyncCallable = Callable[[list[str], AsyncIterable[Datum], Metadata], Awaitable[Messages]]
 
 # ReduceCallable is a callable which can be used as a handler for the reduce UDF.
-ReduceCallable = Union[ReduceAsyncCallable, ReducerClass]
+ReduceCallable = Union[ReduceAsyncCallable, Reducer]
diff --git a/pynumaflow/reducer/async_server.py b/pynumaflow/reducer/async_server.py
index f6210e38..af6241d1 100644
--- a/pynumaflow/reducer/async_server.py
+++ b/pynumaflow/reducer/async_server.py
@@ -1,165 +1,75 @@
-import asyncio
+import aiorun
+import grpc
 
-from datetime import datetime, timezone
-from collections.abc import AsyncIterable
+from pynumaflow.proto.reducer import reduce_pb2_grpc
 
-import grpc
-from google.protobuf import empty_pb2 as _empty_pb2
+from pynumaflow.reducer.servicer.async_server import AsyncReduceServicer
 
 from pynumaflow._constants import (
-    WIN_START_TIME,
-    WIN_END_TIME,
-    STREAM_EOF,
-    DELIMITER,
+    REDUCE_SOCK_PATH,
+    MAX_MESSAGE_SIZE,
+    MAX_THREADS,
 )
-from pynumaflow.reducer._dtypes import Datum, IntervalWindow, Metadata
-from pynumaflow.reducer._dtypes import ReduceResult, ReduceCallable
-from pynumaflow.reducer.asynciter import NonBlockingIterator
-from pynumaflow.proto.reducer import reduce_pb2, reduce_pb2_grpc
-from pynumaflow.types import NumaflowServicerContext
-from pynumaflow._constants import _LOGGER
+from pynumaflow.reducer._dtypes import ReduceCallable
 
 
-async def datum_generator(
-    request_iterator: AsyncIterable[reduce_pb2.ReduceRequest],
-) -> AsyncIterable[Datum]:
-    async for d in request_iterator:
-        datum = Datum(
-            keys=list(d.keys),
-            value=d.value,
-            event_time=d.event_time.ToDatetime(),
-            watermark=d.watermark.ToDatetime(),
-        )
-        yield datum
+from pynumaflow.shared.server import NumaflowServer, start_async_server
 
 
-class AsyncReducer(reduce_pb2_grpc.ReduceServicer):
+class ReduceAsyncServer(NumaflowServer):
     """
-    This class is used to create a new grpc Reduce servicer instance.
- It implements the SyncMapServicer interface from the proto reduce.proto file. - Provides the functionality for the required rpc methods. + Class for a new Reduce Server instance. """ def __init__( self, - handler: ReduceCallable, + reducer_instance: ReduceCallable, + sock_path=REDUCE_SOCK_PATH, + max_message_size=MAX_MESSAGE_SIZE, + max_threads=MAX_THREADS, ): - # Collection for storing strong references to all running tasks. - # Event loop only keeps a weak reference, which can cause it to - # get lost during execution. - self.background_tasks = set() - self.__reduce_handler: ReduceCallable = handler - - async def ReduceFn( - self, - request_iterator: AsyncIterable[reduce_pb2.ReduceRequest], - context: NumaflowServicerContext, - ) -> reduce_pb2.ReduceResponse: """ - Applies a reduce function to a datum stream. - The pascal case function name comes from the proto reduce_pb2_grpc.py file. + Create a new grpc Reduce Server instance. + A new servicer instance is created and attached to the server. + The server instance is returned. + Args: + reducer_instance: The reducer instance to be used for Reduce UDF + sock_path: The UNIX socket path to be used for the server + max_message_size: The max message size in bytes the server can receive and send + max_threads: The max number of threads to be spawned; + defaults to number of processors x4 + server_type: The type of server to be used """ + self.reducer_instance: ReduceCallable = reducer_instance + self.sock_path = f"unix://{sock_path}" + self.max_message_size = max_message_size + self.max_threads = max_threads + + self._server_options = [ + ("grpc.max_send_message_length", self.max_message_size), + ("grpc.max_receive_message_length", self.max_message_size), + ] + # Get the servicer instance for the async server + self.servicer = AsyncReduceServicer(reducer_instance) + + def start(self): + """ + Starter function for the Async server class, need a separate caller + so that all the async coroutines can be started from a single context + """ + aiorun.run(self.aexec(), use_uvloop=True) - start, end = None, None - for metadata_key, metadata_value in context.invocation_metadata(): - if metadata_key == WIN_START_TIME: - start = metadata_value - elif metadata_key == WIN_END_TIME: - end = metadata_value - if not (start or end): - context.set_code(grpc.StatusCode.INVALID_ARGUMENT) - context.set_details( - f"Expected to have all key/window_start_time/window_end_time; " - f"got start: {start}, end: {end}." - ) - yield reduce_pb2.ReduceResponse(results=[]) - return - - start_dt = datetime.fromtimestamp(int(start) / 1e3, timezone.utc) - end_dt = datetime.fromtimestamp(int(end) / 1e3, timezone.utc) - interval_window = IntervalWindow(start=start_dt, end=end_dt) - - datum_iterator = datum_generator(request_iterator=request_iterator) - - response_task = asyncio.create_task( - self.__async_reduce_handler(interval_window, datum_iterator) - ) - - # Save a reference to the result of this function, to avoid a - # task disappearing mid-execution. 
- self.background_tasks.add(response_task) - response_task.add_done_callback(lambda t: self.background_tasks.remove(t)) - - await response_task - results_futures = response_task.result() - - try: - for fut in results_futures: - await fut - yield reduce_pb2.ReduceResponse(results=fut.result()) - except Exception as e: - context.set_code(grpc.StatusCode.UNKNOWN) - context.set_details(e.__str__()) - yield reduce_pb2.ReduceResponse(results=[]) - - async def __async_reduce_handler(self, interval_window, datum_iterator: AsyncIterable[Datum]): - callable_dict = {} - # iterate through all the values - async for d in datum_iterator: - keys = d.keys() - unified_key = DELIMITER.join(keys) - result = callable_dict.get(unified_key, None) - - if not result: - niter = NonBlockingIterator() - riter = niter.read_iterator() - # schedule an async task for consumer - # returns a future that will give the results later. - task = asyncio.create_task( - self.__invoke_reduce(keys, riter, Metadata(interval_window=interval_window)) - ) - # Save a reference to the result of this function, to avoid a - # task disappearing mid-execution. - self.background_tasks.add(task) - task.add_done_callback(lambda t: self.background_tasks.remove(t)) - result = ReduceResult(task, niter, keys) - - callable_dict[unified_key] = result - - await result.iterator.put(d) - - for unified_key in callable_dict: - await callable_dict[unified_key].iterator.put(STREAM_EOF) - - tasks = [] - for unified_key in callable_dict: - fut = callable_dict[unified_key].future - tasks.append(fut) - - return tasks - - async def __invoke_reduce( - self, keys: list[str], request_iterator: AsyncIterable[Datum], md: Metadata - ): - try: - msgs = await self.__reduce_handler(keys, request_iterator, md) - except Exception as err: - _LOGGER.critical("UDFError, re-raising the error", exc_info=True) - raise err - - datum_responses = [] - for msg in msgs: - datum_responses.append( - reduce_pb2.ReduceResponse.Result(keys=msg.keys, value=msg.value, tags=msg.tags) - ) - - return datum_responses - - async def IsReady( - self, request: _empty_pb2.Empty, context: NumaflowServicerContext - ) -> reduce_pb2.ReadyResponse: + async def aexec(self): """ - IsReady is the heartbeat endpoint for gRPC. - The pascal case function name comes from the proto reduce_pb2_grpc.py file. + Starts the Async gRPC server on the given UNIX socket with + given max threads. 
""" - return reduce_pb2.ReadyResponse(ready=True) + # As the server is async, we need to create a new server instance in the + # same thread as the event loop so that all the async calls are made in the + # same context + # Create a new async server instance and add the servicer to it + server = grpc.aio.server() + server.add_insecure_port(self.sock_path) + reduce_servicer = self.servicer + reduce_pb2_grpc.add_ReduceServicer_to_server(reduce_servicer, server) + await start_async_server(server, self.sock_path, self.max_threads, self._server_options) diff --git a/pynumaflow/reducer/reduce.py b/pynumaflow/reducer/reduce.py deleted file mode 100644 index 6f03f495..00000000 --- a/pynumaflow/reducer/reduce.py +++ /dev/null @@ -1,90 +0,0 @@ -import aiorun -import grpc - -from pynumaflow.proto.reducer import reduce_pb2_grpc - -from pynumaflow.reducer.async_server import AsyncReducer - -from pynumaflow._constants import ( - REDUCE_SOCK_PATH, - MAX_MESSAGE_SIZE, - MAX_THREADS, - ServerType, - _LOGGER, -) - -from pynumaflow.reducer._dtypes import ReduceCallable - -from pynumaflow.shared.server import NumaflowServer, start_async_server - - -class ReduceServer(NumaflowServer): - """ - Class for a new Reduce Server instance. - """ - - def __init__( - self, - reducer_instance: ReduceCallable, - sock_path=REDUCE_SOCK_PATH, - max_message_size=MAX_MESSAGE_SIZE, - max_threads=MAX_THREADS, - server_type=ServerType.Async, - ): - """ - Create a new grpc Reduce Server instance. - A new servicer instance is created and attached to the server. - The server instance is returned. - Args: - reducer_instance: The reducer instance to be used for Reduce UDF - sock_path: The UNIX socket path to be used for the server - max_message_size: The max message size in bytes the server can receive and send - max_threads: The max number of threads to be spawned; - defaults to number of processors x4 - server_type: The type of server to be used - """ - self.reducer_instance: ReduceCallable = reducer_instance - self.sock_path = f"unix://{sock_path}" - self.max_message_size = max_message_size - self.max_threads = max_threads - self.server_type = server_type - - self._server_options = [ - ("grpc.max_send_message_length", self.max_message_size), - ("grpc.max_receive_message_length", self.max_message_size), - ] - - def start(self): - """ - Starter function for the Reduce server, Handles the server type and - starts the server. - Currently supported server types are: - 1. Async - """ - if self.server_type == ServerType.Async: - aiorun.run(self.aexec(), use_uvloop=True) - else: - _LOGGER.error("Server type not supported - %s", str(self.server_type)) - raise NotImplementedError - - async def aexec(self): - """ - Starts the Async gRPC server on the given UNIX socket with - given max threads. 
- """ - # As the server is async, we need to create a new server instance in the - # same thread as the event loop so that all the async calls are made in the - # same context - # Create a new async server instance and add the servicer to it - server = grpc.aio.server() - server.add_insecure_port(self.sock_path) - reduce_servicer = self.get_servicer( - reducer_instance=self.reducer_instance, server_type=self.server_type - ) - reduce_pb2_grpc.add_ReduceServicer_to_server(reduce_servicer, server) - await start_async_server(server, self.sock_path, self.max_threads, self._server_options) - - def get_servicer(self, reducer_instance: ReduceCallable, server_type: ServerType): - """Get the servicer instance for the given server type""" - if server_type == ServerType.Async: - return AsyncReducer(reducer_instance) diff --git a/pynumaflow/reducer/servicer/__init__.py b/pynumaflow/reducer/servicer/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/pynumaflow/reducer/servicer/async_server.py b/pynumaflow/reducer/servicer/async_server.py new file mode 100644 index 00000000..82988f6e --- /dev/null +++ b/pynumaflow/reducer/servicer/async_server.py @@ -0,0 +1,165 @@ +import asyncio + +from datetime import datetime, timezone +from collections.abc import AsyncIterable + +import grpc +from google.protobuf import empty_pb2 as _empty_pb2 + +from pynumaflow._constants import ( + WIN_START_TIME, + WIN_END_TIME, + STREAM_EOF, + DELIMITER, +) +from pynumaflow.reducer._dtypes import Datum, IntervalWindow, Metadata +from pynumaflow.reducer._dtypes import ReduceResult, ReduceCallable +from pynumaflow.reducer.servicer.asynciter import NonBlockingIterator +from pynumaflow.proto.reducer import reduce_pb2, reduce_pb2_grpc +from pynumaflow.types import NumaflowServicerContext +from pynumaflow._constants import _LOGGER + + +async def datum_generator( + request_iterator: AsyncIterable[reduce_pb2.ReduceRequest], +) -> AsyncIterable[Datum]: + async for d in request_iterator: + datum = Datum( + keys=list(d.keys), + value=d.value, + event_time=d.event_time.ToDatetime(), + watermark=d.watermark.ToDatetime(), + ) + yield datum + + +class AsyncReduceServicer(reduce_pb2_grpc.ReduceServicer): + """ + This class is used to create a new grpc Reduce servicer instance. + It implements the SyncMapServicer interface from the proto reduce.proto file. + Provides the functionality for the required rpc methods. + """ + + def __init__( + self, + handler: ReduceCallable, + ): + # Collection for storing strong references to all running tasks. + # Event loop only keeps a weak reference, which can cause it to + # get lost during execution. + self.background_tasks = set() + self.__reduce_handler: ReduceCallable = handler + + async def ReduceFn( + self, + request_iterator: AsyncIterable[reduce_pb2.ReduceRequest], + context: NumaflowServicerContext, + ) -> reduce_pb2.ReduceResponse: + """ + Applies a reduce function to a datum stream. + The pascal case function name comes from the proto reduce_pb2_grpc.py file. + """ + + start, end = None, None + for metadata_key, metadata_value in context.invocation_metadata(): + if metadata_key == WIN_START_TIME: + start = metadata_value + elif metadata_key == WIN_END_TIME: + end = metadata_value + if not (start or end): + context.set_code(grpc.StatusCode.INVALID_ARGUMENT) + context.set_details( + f"Expected to have all key/window_start_time/window_end_time; " + f"got start: {start}, end: {end}." 
+ ) + yield reduce_pb2.ReduceResponse(results=[]) + return + + start_dt = datetime.fromtimestamp(int(start) / 1e3, timezone.utc) + end_dt = datetime.fromtimestamp(int(end) / 1e3, timezone.utc) + interval_window = IntervalWindow(start=start_dt, end=end_dt) + + datum_iterator = datum_generator(request_iterator=request_iterator) + + response_task = asyncio.create_task( + self.__async_reduce_handler(interval_window, datum_iterator) + ) + + # Save a reference to the result of this function, to avoid a + # task disappearing mid-execution. + self.background_tasks.add(response_task) + response_task.add_done_callback(lambda t: self.background_tasks.remove(t)) + + await response_task + results_futures = response_task.result() + + try: + for fut in results_futures: + await fut + yield reduce_pb2.ReduceResponse(results=fut.result()) + except Exception as e: + context.set_code(grpc.StatusCode.UNKNOWN) + context.set_details(e.__str__()) + yield reduce_pb2.ReduceResponse(results=[]) + + async def __async_reduce_handler(self, interval_window, datum_iterator: AsyncIterable[Datum]): + callable_dict = {} + # iterate through all the values + async for d in datum_iterator: + keys = d.keys() + unified_key = DELIMITER.join(keys) + result = callable_dict.get(unified_key, None) + + if not result: + niter = NonBlockingIterator() + riter = niter.read_iterator() + # schedule an async task for consumer + # returns a future that will give the results later. + task = asyncio.create_task( + self.__invoke_reduce(keys, riter, Metadata(interval_window=interval_window)) + ) + # Save a reference to the result of this function, to avoid a + # task disappearing mid-execution. + self.background_tasks.add(task) + task.add_done_callback(lambda t: self.background_tasks.remove(t)) + result = ReduceResult(task, niter, keys) + + callable_dict[unified_key] = result + + await result.iterator.put(d) + + for unified_key in callable_dict: + await callable_dict[unified_key].iterator.put(STREAM_EOF) + + tasks = [] + for unified_key in callable_dict: + fut = callable_dict[unified_key].future + tasks.append(fut) + + return tasks + + async def __invoke_reduce( + self, keys: list[str], request_iterator: AsyncIterable[Datum], md: Metadata + ): + try: + msgs = await self.__reduce_handler(keys, request_iterator, md) + except Exception as err: + _LOGGER.critical("UDFError, re-raising the error", exc_info=True) + raise err + + datum_responses = [] + for msg in msgs: + datum_responses.append( + reduce_pb2.ReduceResponse.Result(keys=msg.keys, value=msg.value, tags=msg.tags) + ) + + return datum_responses + + async def IsReady( + self, request: _empty_pb2.Empty, context: NumaflowServicerContext + ) -> reduce_pb2.ReadyResponse: + """ + IsReady is the heartbeat endpoint for gRPC. + The pascal case function name comes from the proto reduce_pb2_grpc.py file. 
+        """
+        return reduce_pb2.ReadyResponse(ready=True)
diff --git a/pynumaflow/reducer/asynciter.py b/pynumaflow/reducer/servicer/asynciter.py
similarity index 100%
rename from pynumaflow/reducer/asynciter.py
rename to pynumaflow/reducer/servicer/asynciter.py
diff --git a/tests/reduce/test_async_reduce.py b/tests/reduce/test_async_reduce.py
index 98ba6083..4f0d288f 100644
--- a/tests/reduce/test_async_reduce.py
+++ b/tests/reduce/test_async_reduce.py
@@ -10,13 +10,13 @@
 from grpc.aio._server import Server

 from pynumaflow import setup_logging
-from pynumaflow._constants import WIN_START_TIME, WIN_END_TIME, ServerType
+from pynumaflow._constants import WIN_START_TIME, WIN_END_TIME
 from pynumaflow.reducer import (
     Messages,
     Message,
     Datum,
     Metadata,
-    ReduceServer,
+    ReduceAsyncServer,
 )
 from pynumaflow.proto.reducer import reduce_pb2, reduce_pb2_grpc
 from tests.testing_utils import (
@@ -94,12 +94,8 @@ async def reduce_handler(keys: list[str], datums: Iterator[Datum], md: Metadata)
 def NewAsyncReducer(
     reduce_handler=async_reduce_handler,
 ):
-    server_instance = ReduceServer(
-        reducer_instance=async_reduce_handler, server_type=ServerType.Async
-    )
-    udfs = server_instance.get_servicer(
-        reducer_instance=server_instance.reducer_instance, server_type=server_instance.server_type
-    )
+    server_instance = ReduceAsyncServer(reducer_instance=async_reduce_handler)
+    udfs = server_instance.servicer

     return udfs

@@ -239,9 +235,7 @@ def __stub(self):

     def test_error_init(self):
         with self.assertRaises(TypeError):
-            ReduceServer()
-        with self.assertRaises(NotImplementedError):
-            ReduceServer(reducer_instance=async_reduce_handler, server_type="ERORR").start()
+            ReduceAsyncServer()


 if __name__ == "__main__":

From c7b8a84a40f69c028f73043c00002bd251c095cf Mon Sep 17 00:00:00 2001
From: Sidhant Kohli
Date: Tue, 16 Jan 2024 23:54:00 -0800
Subject: [PATCH 59/78] separate servers

Signed-off-by: Sidhant Kohli
---
 pynumaflow/_constants.py                      |  10 -
 pynumaflow/mapper/__init__.py                 |   2 -
 pynumaflow/reducer/async_server.py            |   2 +-
 .../{async_server.py => async_servicer.py}    |   0
 pynumaflow/sideinput/__init__.py              |   6 +-
 pynumaflow/sideinput/_dtypes.py               |   4 +-
 pynumaflow/sideinput/server.py                |  78 ++++----
 pynumaflow/sideinput/servicer/__init__.py     |   0
 pynumaflow/sideinput/servicer/servicer.py     |  45 +++++
 pynumaflow/sideinput/sideinput.py             |  83 --------
 pynumaflow/sinker/__init__.py                 |   9 +-
 pynumaflow/sinker/_dtypes.py                  |  11 +-
 pynumaflow/sinker/async_server.py             |  73 ++++++++
 pynumaflow/sinker/server.py                   | 112 +++++------
 pynumaflow/sinker/servicer/__init__.py        |   0
 .../async_servicer.py}                        |  20 +-
 pynumaflow/sinker/servicer/sync_servicer.py   |  69 +++++++
 pynumaflow/sinker/sink.py                     | 126 -------------
 pynumaflow/sourcer/__init__.py                |  12 +-
 pynumaflow/sourcer/_dtypes.py                 |   4 +-
 pynumaflow/sourcer/async_server.py            | 161 ++++++----------
 pynumaflow/sourcer/server.py                  | 177 +++++------------
 pynumaflow/sourcer/servicer/__init__.py       |   0
 pynumaflow/sourcer/servicer/async_servicer.py | 129 +++++++++++++
 pynumaflow/sourcer/servicer/sync_servicer.py  | 143 ++++++++++++++
 pynumaflow/sourcer/source.py                  | 121 ------------
 pynumaflow/sourcetransformer/__init__.py      |  11 +-
 pynumaflow/sourcetransformer/_dtypes.py       |   4 +-
 .../sourcetransformer/multiproc_server.py     |  75 ++++++++
 pynumaflow/sourcetransformer/server.py        | 107 +++++------
 .../sourcetransformer/servicer/__init__.py    |   0
 .../sourcetransformer/servicer/server.py      |  73 ++++++++
 .../sourcetransformer/sourcetransform.py      | 133 -------------
 tests/map/test_sync_mapper.py                 |   4 -
tests/mapstream/test_async_map_stream_err.py | 2 +- tests/sideinput/test_responses.py | 4 +- tests/sideinput/test_side_input_server.py | 20 +- tests/sink/test_async_sink.py | 12 +- tests/sink/test_responses.py | 4 +- tests/sink/test_server.py | 10 +- tests/source/test_async_source.py | 12 +- tests/source/test_async_source_err.py | 13 +- tests/source/test_sync_source.py | 4 +- tests/source/test_sync_source_err.py | 9 +- tests/source/utils.py | 10 +- tests/sourcetransform/test_messages.py | 4 +- tests/sourcetransform/test_multiproc.py | 46 ++--- tests/sourcetransform/test_sync_server.py | 24 +-- 48 files changed, 953 insertions(+), 1025 deletions(-) rename pynumaflow/reducer/servicer/{async_server.py => async_servicer.py} (100%) create mode 100644 pynumaflow/sideinput/servicer/__init__.py create mode 100644 pynumaflow/sideinput/servicer/servicer.py delete mode 100644 pynumaflow/sideinput/sideinput.py create mode 100644 pynumaflow/sinker/async_server.py create mode 100644 pynumaflow/sinker/servicer/__init__.py rename pynumaflow/sinker/{async_sink.py => servicer/async_servicer.py} (84%) create mode 100644 pynumaflow/sinker/servicer/sync_servicer.py delete mode 100644 pynumaflow/sinker/sink.py create mode 100644 pynumaflow/sourcer/servicer/__init__.py create mode 100644 pynumaflow/sourcer/servicer/async_servicer.py create mode 100644 pynumaflow/sourcer/servicer/sync_servicer.py delete mode 100644 pynumaflow/sourcer/source.py create mode 100644 pynumaflow/sourcetransformer/multiproc_server.py create mode 100644 pynumaflow/sourcetransformer/servicer/__init__.py create mode 100644 pynumaflow/sourcetransformer/servicer/server.py delete mode 100644 pynumaflow/sourcetransformer/sourcetransform.py diff --git a/pynumaflow/_constants.py b/pynumaflow/_constants.py index 137c738d..ca9766c4 100644 --- a/pynumaflow/_constants.py +++ b/pynumaflow/_constants.py @@ -33,16 +33,6 @@ _LOGGER.setLevel(logging.DEBUG) -class ServerType(str, Enum): - """ - Enumerate grpc server connection protocol. - """ - - Sync = "sync" - Async = "async" - Multiproc = "multiproc" - - class UDFType(str, Enum): """ Enumerate the type of UDF. 
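With the ServerType enum removed above, callers pick the concurrency flavor by instantiating a dedicated server class rather than passing an enum flag. A minimal sketch of the new style, using the reducer split out in the previous patch; the class names, constructor argument, and exports are taken from the diffs in this series, while the counting logic itself is only illustrative:

    from collections.abc import AsyncIterable

    from pynumaflow.reducer import Messages, Message, Datum, Metadata, ReduceAsyncServer


    async def count_handler(keys: list[str], datums: AsyncIterable[Datum], md: Metadata) -> Messages:
        # Count the datums that arrived in this window and emit a single message.
        total = 0
        async for _ in datums:
            total += 1
        return Messages(Message(str(total).encode(), keys=keys))


    if __name__ == "__main__":
        # Old style: ReduceServer(reducer_instance=count_handler, server_type=ServerType.Async).start()
        ReduceAsyncServer(reducer_instance=count_handler).start()

The same construction failure mode becomes a plain TypeError (missing reducer_instance), which is exactly what the updated tests above assert.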
diff --git a/pynumaflow/mapper/__init__.py b/pynumaflow/mapper/__init__.py index 4ebadabd..a713d039 100644 --- a/pynumaflow/mapper/__init__.py +++ b/pynumaflow/mapper/__init__.py @@ -3,14 +3,12 @@ from pynumaflow.mapper.sync_server import MapServer from pynumaflow.mapper._dtypes import Message, Messages, Datum, DROP, Mapper -from pynumaflow._constants import ServerType __all__ = [ "Message", "Messages", "Datum", "DROP", - "ServerType", "Mapper", "MapServer", "MapAsyncServer", diff --git a/pynumaflow/reducer/async_server.py b/pynumaflow/reducer/async_server.py index af6241d1..f7442238 100644 --- a/pynumaflow/reducer/async_server.py +++ b/pynumaflow/reducer/async_server.py @@ -3,7 +3,7 @@ from pynumaflow.proto.reducer import reduce_pb2_grpc -from pynumaflow.reducer.servicer.async_server import AsyncReduceServicer +from pynumaflow.reducer.servicer.async_servicer import AsyncReduceServicer from pynumaflow._constants import ( REDUCE_SOCK_PATH, diff --git a/pynumaflow/reducer/servicer/async_server.py b/pynumaflow/reducer/servicer/async_servicer.py similarity index 100% rename from pynumaflow/reducer/servicer/async_server.py rename to pynumaflow/reducer/servicer/async_servicer.py diff --git a/pynumaflow/sideinput/__init__.py b/pynumaflow/sideinput/__init__.py index c8713710..2058fe97 100644 --- a/pynumaflow/sideinput/__init__.py +++ b/pynumaflow/sideinput/__init__.py @@ -1,5 +1,5 @@ from pynumaflow._constants import SIDE_INPUT_DIR_PATH -from pynumaflow.sideinput._dtypes import Response, SideInputClass -from pynumaflow.sideinput.sideinput import SideInputServer +from pynumaflow.sideinput._dtypes import Response, SideInput +from pynumaflow.sideinput.server import SideInputServer -__all__ = ["Response", "SideInputClass", "SideInputServer", "SIDE_INPUT_DIR_PATH"] +__all__ = ["Response", "SideInput", "SideInputServer", "SIDE_INPUT_DIR_PATH"] diff --git a/pynumaflow/sideinput/_dtypes.py b/pynumaflow/sideinput/_dtypes.py index 2225af14..6a68f420 100644 --- a/pynumaflow/sideinput/_dtypes.py +++ b/pynumaflow/sideinput/_dtypes.py @@ -39,7 +39,7 @@ def no_broadcast_message(cls: type[R]) -> R: return Response(value=b"", no_broadcast=True) -class SideInputClass(metaclass=ABCMeta): +class SideInput(metaclass=ABCMeta): """ Provides an interface to write a SideInput Class which will be exposed over gRPC. 
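The SideInputClass to SideInput rename above pairs with the server/servicer split later in this patch. A minimal sketch of a side input under the renamed API; Response, SideInput, SideInputServer, the retrieve_handler method, and the side_input_instance argument all come from the diffs in this patch, while the timestamp payload is purely illustrative:

    from datetime import datetime, timezone

    from pynumaflow.sideinput import Response, SideInput, SideInputServer


    class ClockSideInput(SideInput):
        def retrieve_handler(self) -> Response:
            # Broadcast the current time to the vertices consuming this side input.
            now = datetime.now(timezone.utc).isoformat().encode()
            return Response(value=now, no_broadcast=False)


    if __name__ == "__main__":
        SideInputServer(side_input_instance=ClockSideInput()).start()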
@@ -61,4 +61,4 @@ def retrieve_handler(self) -> Response: RetrieverHandlerCallable = Callable[[], Response] -RetrieverCallable = Union[SideInputClass, RetrieverHandlerCallable] +RetrieverCallable = Union[SideInput, RetrieverHandlerCallable] diff --git a/pynumaflow/sideinput/server.py b/pynumaflow/sideinput/server.py index 458365b7..121bf496 100644 --- a/pynumaflow/sideinput/server.py +++ b/pynumaflow/sideinput/server.py @@ -1,45 +1,59 @@ -import grpc -from google.protobuf import empty_pb2 as _empty_pb2 - +import os +from pynumaflow.shared import NumaflowServer +from pynumaflow.shared.server import sync_server_start +from pynumaflow.sideinput._dtypes import RetrieverCallable +from pynumaflow.sideinput.servicer.servicer import SideInputServicer from pynumaflow._constants import ( + MAX_THREADS, + MAX_MESSAGE_SIZE, + SIDE_INPUT_SOCK_PATH, _LOGGER, + UDFType, + SIDE_INPUT_DIR_PATH, ) -from pynumaflow.proto.sideinput import sideinput_pb2_grpc, sideinput_pb2 -from pynumaflow.sideinput._dtypes import RetrieverCallable -from pynumaflow.types import NumaflowServicerContext -class SideInput(sideinput_pb2_grpc.SideInputServicer): +class SideInputServer(NumaflowServer): + """Server for side input""" + def __init__( self, - handler: RetrieverCallable, + side_input_instance: RetrieverCallable, + sock_path=SIDE_INPUT_SOCK_PATH, + max_message_size=MAX_MESSAGE_SIZE, + max_threads=MAX_THREADS, + side_input_dir_path=SIDE_INPUT_DIR_PATH, ): - self.__retrieve_handler: RetrieverCallable = handler + self.sock_path = f"unix://{sock_path}" + self.max_threads = min(max_threads, int(os.getenv("MAX_THREADS", "4"))) + self.max_message_size = max_message_size - def RetrieveSideInput( - self, request: _empty_pb2.Empty, context: NumaflowServicerContext - ) -> sideinput_pb2.SideInputResponse: - """ - Applies a sideinput function for a retrieval request. - The pascal case function name comes from the proto sideinput_pb2_grpc.py file. - """ - # if there is an exception, we will mark all the responses as a failure - try: - rspn = self.__retrieve_handler() - except Exception as err: - err_msg = "RetrieveSideInputErr: %r" % err - _LOGGER.critical(err_msg, exc_info=True) - context.set_code(grpc.StatusCode.UNKNOWN) - context.set_details(str(err)) - return sideinput_pb2.SideInputResponse(value=None, no_broadcast=True) + self._server_options = [ + ("grpc.max_send_message_length", self.max_message_size), + ("grpc.max_receive_message_length", self.max_message_size), + ] - return sideinput_pb2.SideInputResponse(value=rspn.value, no_broadcast=rspn.no_broadcast) + self.side_input_instance = side_input_instance + self.side_input_dir_path = side_input_dir_path + self.servicer = SideInputServicer(side_input_instance) - def IsReady( - self, request: _empty_pb2.Empty, context: NumaflowServicerContext - ) -> sideinput_pb2.ReadyResponse: + def start(self): """ - IsReady is the heartbeat endpoint for gRPC. - The pascal case function name comes from the proto sideinput_pb2_grpc.py file. + Starts the Synchronous gRPC server on the given UNIX socket with given max threads. 
""" - return sideinput_pb2.ReadyResponse(ready=True) + # Get the servicer instance based on the server type + side_input_servicer = self.servicer + _LOGGER.info( + "Side Input GRPC Server listening on: %s with max threads: %s", + self.sock_path, + self.max_threads, + ) + # Start the server + sync_server_start( + servicer=side_input_servicer, + bind_address=self.sock_path, + max_threads=self.max_threads, + server_options=self._server_options, + udf_type=UDFType.SideInput, + add_info_server=False, + ) diff --git a/pynumaflow/sideinput/servicer/__init__.py b/pynumaflow/sideinput/servicer/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/pynumaflow/sideinput/servicer/servicer.py b/pynumaflow/sideinput/servicer/servicer.py new file mode 100644 index 00000000..2f050149 --- /dev/null +++ b/pynumaflow/sideinput/servicer/servicer.py @@ -0,0 +1,45 @@ +import grpc +from google.protobuf import empty_pb2 as _empty_pb2 + +from pynumaflow._constants import ( + _LOGGER, +) +from pynumaflow.proto.sideinput import sideinput_pb2_grpc, sideinput_pb2 +from pynumaflow.sideinput._dtypes import RetrieverCallable +from pynumaflow.types import NumaflowServicerContext + + +class SideInputServicer(sideinput_pb2_grpc.SideInputServicer): + def __init__( + self, + handler: RetrieverCallable, + ): + self.__retrieve_handler: RetrieverCallable = handler + + def RetrieveSideInput( + self, request: _empty_pb2.Empty, context: NumaflowServicerContext + ) -> sideinput_pb2.SideInputResponse: + """ + Applies a sideinput function for a retrieval request. + The pascal case function name comes from the proto sideinput_pb2_grpc.py file. + """ + # if there is an exception, we will mark all the responses as a failure + try: + rspn = self.__retrieve_handler() + except Exception as err: + err_msg = "RetrieveSideInputErr: %r" % err + _LOGGER.critical(err_msg, exc_info=True) + context.set_code(grpc.StatusCode.UNKNOWN) + context.set_details(str(err)) + return sideinput_pb2.SideInputResponse(value=None, no_broadcast=True) + + return sideinput_pb2.SideInputResponse(value=rspn.value, no_broadcast=rspn.no_broadcast) + + def IsReady( + self, request: _empty_pb2.Empty, context: NumaflowServicerContext + ) -> sideinput_pb2.ReadyResponse: + """ + IsReady is the heartbeat endpoint for gRPC. + The pascal case function name comes from the proto sideinput_pb2_grpc.py file. 
+ """ + return sideinput_pb2.ReadyResponse(ready=True) diff --git a/pynumaflow/sideinput/sideinput.py b/pynumaflow/sideinput/sideinput.py deleted file mode 100644 index 3076e2d7..00000000 --- a/pynumaflow/sideinput/sideinput.py +++ /dev/null @@ -1,83 +0,0 @@ -import os -from pynumaflow.shared import NumaflowServer -from pynumaflow.shared.server import sync_server_start -from pynumaflow.sideinput._dtypes import RetrieverCallable -from pynumaflow.sideinput.server import SideInput -from pynumaflow._constants import ( - MAX_THREADS, - MAX_MESSAGE_SIZE, - SIDE_INPUT_SOCK_PATH, - ServerType, - _LOGGER, - UDFType, - SIDE_INPUT_DIR_PATH, -) - - -class SideInputServer(NumaflowServer): - """Server for side input""" - - def __init__( - self, - side_input_instance: RetrieverCallable, - sock_path=SIDE_INPUT_SOCK_PATH, - max_message_size=MAX_MESSAGE_SIZE, - max_threads=MAX_THREADS, - server_type=ServerType.Sync, - side_input_dir_path=SIDE_INPUT_DIR_PATH, - ): - self.sock_path = f"unix://{sock_path}" - self.max_threads = min(max_threads, int(os.getenv("MAX_THREADS", "4"))) - self.max_message_size = max_message_size - - self._server_options = [ - ("grpc.max_send_message_length", self.max_message_size), - ("grpc.max_receive_message_length", self.max_message_size), - ] - - self.side_input_instance = side_input_instance - self.server_type = server_type - self.side_input_dir_path = side_input_dir_path - - def start(self): - """Starter function for the server class, Handles the server type and - starts the server accordingly. If the server type is not supported, - raises NotImplementedError. - Currently supported server types: - 1) ServerType.Sync - """ - if self.server_type == ServerType.Sync: - return self.exec() - else: - _LOGGER.error("Server type not supported - %s", str(self.server_type)) - raise NotImplementedError - - def exec(self): - """ - Starts the Synchronous gRPC server on the given UNIX socket with given max threads. 
- """ - # Get the servicer instance based on the server type - side_input_servicer = self.get_servicer( - side_input_instance=self.side_input_instance, server_type=self.server_type - ) - _LOGGER.info( - "Side Input GRPC Server listening on: %s with max threads: %s", - self.sock_path, - self.max_threads, - ) - # Start the server - sync_server_start( - servicer=side_input_servicer, - bind_address=self.sock_path, - max_threads=self.max_threads, - server_options=self._server_options, - udf_type=UDFType.SideInput, - add_info_server=False, - ) - - def get_servicer(self, side_input_instance, server_type): - """ - Returns the servicer instance based on the server type - """ - if server_type == ServerType.Sync: - return SideInput(side_input_instance) diff --git a/pynumaflow/sinker/__init__.py b/pynumaflow/sinker/__init__.py index 9b89b5d6..4df6f270 100644 --- a/pynumaflow/sinker/__init__.py +++ b/pynumaflow/sinker/__init__.py @@ -1,7 +1,6 @@ -from pynumaflow._constants import ServerType +from pynumaflow.sinker.async_server import SinkAsyncServer +from pynumaflow.sinker.server import SinkServer -from pynumaflow.sinker.sink import SinkServer +from pynumaflow.sinker._dtypes import Response, Responses, Datum, Sinker -from pynumaflow.sinker._dtypes import Response, Responses, Datum, SinkerClass - -__all__ = ["Response", "Responses", "Datum", "SinkerClass", "SinkServer", "ServerType"] +__all__ = ["Response", "Responses", "Datum", "Sinker", "SinkServer", "SinkAsyncServer"] diff --git a/pynumaflow/sinker/_dtypes.py b/pynumaflow/sinker/_dtypes.py index f0914469..a767e053 100644 --- a/pynumaflow/sinker/_dtypes.py +++ b/pynumaflow/sinker/_dtypes.py @@ -163,7 +163,7 @@ def watermark(self) -> datetime: return self._watermark -class SinkerClass(metaclass=ABCMeta): +class Sinker(metaclass=ABCMeta): """ Provides an interface to write a Sinker which will be exposed over a gRPC server. @@ -185,7 +185,10 @@ def handler(self, datums: Iterator[Datum]) -> Responses: pass +# SyncSinkCallable is a callable which can be used as a handler for the Synchronous UDSink. SinkHandlerCallable = Callable[[Iterator[Datum]], Responses] -AsyncSinkCallable = Callable[[AsyncIterable[Datum]], Awaitable[Responses]] -# SinkCallable is a callable which can be used as a handler for the UDSink. -SinkCallable = Union[SinkerClass, SinkHandlerCallable, AsyncSinkCallable] +SyncSinkCallable = Union[Sinker, SinkHandlerCallable] + +# AsyncSinkCallable is a callable which can be used as a handler for the Asynchronous UDSink. +AsyncSinkHandlerCallable = Callable[[AsyncIterable[Datum]], Awaitable[Responses]] +AsyncSinkCallable = Union[Sinker, AsyncSinkHandlerCallable] diff --git a/pynumaflow/sinker/async_server.py b/pynumaflow/sinker/async_server.py new file mode 100644 index 00000000..6442db16 --- /dev/null +++ b/pynumaflow/sinker/async_server.py @@ -0,0 +1,73 @@ +import os + +import aiorun +import grpc + +from pynumaflow.sinker.servicer.async_servicer import AsyncSinkServicer +from pynumaflow.proto.sinker import sink_pb2_grpc + + +from pynumaflow._constants import ( + SINK_SOCK_PATH, + MAX_MESSAGE_SIZE, + MAX_THREADS, +) + +from pynumaflow.shared.server import NumaflowServer, start_async_server +from pynumaflow.sinker._dtypes import AsyncSinkCallable + + +class SinkAsyncServer(NumaflowServer): + """ + SinkServer is the main class to start a gRPC server for a sinker. 
+ """ + + def __init__( + self, + sinker_instance: AsyncSinkCallable, + sock_path=SINK_SOCK_PATH, + max_message_size=MAX_MESSAGE_SIZE, + max_threads=MAX_THREADS, + ): + """ + Create a new grpc Sink Server instance. + A new servicer instance is created and attached to the server. + The server instance is returned. + Args: + sinker_instance: The sinker instance to be used for Sink UDF + sock_path: The UNIX socket path to be used for the server + max_message_size: The max message size in bytes the server can receive and send + max_threads: The max number of threads to be spawned; + defaults to number of processors x4 + """ + self.sock_path = f"unix://{sock_path}" + self.max_threads = min(max_threads, int(os.getenv("MAX_THREADS", "4"))) + self.max_message_size = max_message_size + + self.sinker_instance = sinker_instance + + self._server_options = [ + ("grpc.max_send_message_length", self.max_message_size), + ("grpc.max_receive_message_length", self.max_message_size), + ] + self.servicer = AsyncSinkServicer(sinker_instance) + + def start(self): + """ + Starter function for the Async server class, need a separate caller + so that all the async coroutines can be started from a single context + """ + aiorun.run(self.aexec(), use_uvloop=True) + + async def aexec(self): + """ + Starts the Asynchronous gRPC server on the given UNIX socket with given max threads. + """ + # As the server is async, we need to create a new server instance in the + # same thread as the event loop so that all the async calls are made in the + # same context + # Create a new server instance, add the servicer to it and start the server + server = grpc.aio.server() + server.add_insecure_port(self.sock_path) + sink_pb2_grpc.add_SinkServicer_to_server(self.servicer, server) + await start_async_server(server, self.sock_path, self.max_threads, self._server_options) diff --git a/pynumaflow/sinker/server.py b/pynumaflow/sinker/server.py index b166936e..0cd4a5e8 100644 --- a/pynumaflow/sinker/server.py +++ b/pynumaflow/sinker/server.py @@ -1,81 +1,71 @@ -import logging -import multiprocessing import os -from collections.abc import Iterator, Iterable -from google.protobuf import empty_pb2 as _empty_pb2 -from pynumaflow import setup_logging -from pynumaflow.sinker._dtypes import Responses, Datum, Response -from pynumaflow.sinker._dtypes import SinkCallable -from pynumaflow.proto.sinker import sink_pb2_grpc, sink_pb2 -from pynumaflow.types import NumaflowServicerContext +from pynumaflow.sinker.servicer.sync_servicer import SyncSinkServicer -_LOGGER = setup_logging(__name__) -if os.getenv("PYTHONDEBUG"): - _LOGGER.setLevel(logging.DEBUG) +from pynumaflow._constants import ( + SINK_SOCK_PATH, + MAX_MESSAGE_SIZE, + MAX_THREADS, + _LOGGER, + UDFType, +) +from pynumaflow.shared.server import NumaflowServer, sync_server_start +from pynumaflow.sinker._dtypes import SyncSinkCallable -_PROCESS_COUNT = multiprocessing.cpu_count() -MAX_THREADS = int(os.getenv("MAX_THREADS", 0)) or (_PROCESS_COUNT * 4) - -def datum_generator(request_iterator: Iterable[sink_pb2.SinkRequest]) -> Iterable[Datum]: - for d in request_iterator: - datum = Datum( - keys=list(d.keys), - sink_msg_id=d.id, - value=d.value, - event_time=d.event_time.ToDatetime(), - watermark=d.watermark.ToDatetime(), - ) - yield datum - - -class Sinker(sink_pb2_grpc.SinkServicer): +class SinkServer(NumaflowServer): """ - This class is used to create a new grpc Sink servicer instance. - It implements the SinkServicer interface from the proto sink.proto file. 
- Provides the functionality for the required rpc methods. + SinkServer is the main class to start a gRPC server for a sinker. """ def __init__( self, - handler: SinkCallable, + sinker_instance: SyncSinkCallable, + sock_path=SINK_SOCK_PATH, + max_message_size=MAX_MESSAGE_SIZE, + max_threads=MAX_THREADS, ): - self.__sink_handler: SinkCallable = handler - - def SinkFn( - self, request_iterator: Iterator[sink_pb2.SinkRequest], context: NumaflowServicerContext - ) -> sink_pb2.SinkResponse: """ - Applies a sink function to a list of datum elements. - The pascal case function name comes from the proto sink_pb2_grpc.py file. + Create a new grpc Sink Server instance. + A new servicer instance is created and attached to the server. + The server instance is returned. + Args: + sinker_instance: The sinker instance to be used for Sink UDF + sock_path: The UNIX socket path to be used for the server + max_message_size: The max message size in bytes the server can receive and send + max_threads: The max number of threads to be spawned; + defaults to number of processors x4 """ - # if there is an exception, we will mark all the responses as a failure - datum_iterator = datum_generator(request_iterator) - try: - rspns = self.__sink_handler(datum_iterator) - except Exception as err: - err_msg = "UDSinkError: %r" % err - _LOGGER.critical(err_msg, exc_info=True) - rspns = Responses() - for _datum in datum_iterator: - rspns.append(Response.as_failure(_datum.id, err_msg)) + self.sock_path = f"unix://{sock_path}" + self.max_threads = min(max_threads, int(os.getenv("MAX_THREADS", "4"))) + self.max_message_size = max_message_size + + self.sinker_instance = sinker_instance - responses = [] - for rspn in rspns: - responses.append( - sink_pb2.SinkResponse.Result(id=rspn.id, success=rspn.success, err_msg=rspn.err) - ) + self._server_options = [ + ("grpc.max_send_message_length", self.max_message_size), + ("grpc.max_receive_message_length", self.max_message_size), + ] - return sink_pb2.SinkResponse(results=responses) + self.servicer = SyncSinkServicer(sinker_instance) - def IsReady( - self, request: _empty_pb2.Empty, context: NumaflowServicerContext - ) -> sink_pb2.ReadyResponse: + def start(self): """ - IsReady is the heartbeat endpoint for gRPC. - The pascal case function name comes from the proto sink_pb2_grpc.py file. + Starts the Synchronous gRPC server on the + given UNIX socket with given max threads. 
""" - return sink_pb2.ReadyResponse(ready=True) + _LOGGER.info( + "Sync GRPC Sink listening on: %s with max threads: %s", + self.sock_path, + self.max_threads, + ) + # Start the server + sync_server_start( + servicer=self.servicer, + bind_address=self.sock_path, + max_threads=self.max_threads, + server_options=self._server_options, + udf_type=UDFType.Sink, + ) diff --git a/pynumaflow/sinker/servicer/__init__.py b/pynumaflow/sinker/servicer/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/pynumaflow/sinker/async_sink.py b/pynumaflow/sinker/servicer/async_servicer.py similarity index 84% rename from pynumaflow/sinker/async_sink.py rename to pynumaflow/sinker/servicer/async_servicer.py index e836bce2..59d2364e 100644 --- a/pynumaflow/sinker/async_sink.py +++ b/pynumaflow/sinker/servicer/async_servicer.py @@ -1,22 +1,12 @@ -import logging -import multiprocessing -import os from collections.abc import AsyncIterable from google.protobuf import empty_pb2 as _empty_pb2 -from pynumaflow import setup_logging from pynumaflow.sinker._dtypes import Responses, Datum, Response -from pynumaflow.sinker._dtypes import SinkCallable +from pynumaflow.sinker._dtypes import SyncSinkCallable from pynumaflow.proto.sinker import sink_pb2_grpc, sink_pb2 from pynumaflow.types import NumaflowServicerContext - -_LOGGER = setup_logging(__name__) -if os.getenv("PYTHONDEBUG"): - _LOGGER.setLevel(logging.DEBUG) - -_PROCESS_COUNT = multiprocessing.cpu_count() -MAX_THREADS = int(os.getenv("MAX_THREADS", 0)) or (_PROCESS_COUNT * 4) +from pynumaflow._constants import _LOGGER async def datum_generator( @@ -33,7 +23,7 @@ async def datum_generator( yield datum -class AsyncSinker(sink_pb2_grpc.SinkServicer): +class AsyncSinkServicer(sink_pb2_grpc.SinkServicer): """ This class is used to create a new grpc Sink servicer instance. It implements the SinkServicer interface from the proto sink.proto file. @@ -42,9 +32,9 @@ class AsyncSinker(sink_pb2_grpc.SinkServicer): def __init__( self, - handler: SinkCallable, + handler: SyncSinkCallable, ): - self.__sink_handler: SinkCallable = handler + self.__sink_handler: SyncSinkCallable = handler self.cleanup_coroutines = [] async def SinkFn( diff --git a/pynumaflow/sinker/servicer/sync_servicer.py b/pynumaflow/sinker/servicer/sync_servicer.py new file mode 100644 index 00000000..652a56c9 --- /dev/null +++ b/pynumaflow/sinker/servicer/sync_servicer.py @@ -0,0 +1,69 @@ +from collections.abc import Iterator, Iterable + +from google.protobuf import empty_pb2 as _empty_pb2 +from pynumaflow._constants import _LOGGER +from pynumaflow.sinker._dtypes import Responses, Datum, Response +from pynumaflow.sinker._dtypes import SyncSinkCallable +from pynumaflow.proto.sinker import sink_pb2_grpc, sink_pb2 +from pynumaflow.types import NumaflowServicerContext + + +def datum_generator(request_iterator: Iterable[sink_pb2.SinkRequest]) -> Iterable[Datum]: + for d in request_iterator: + datum = Datum( + keys=list(d.keys), + sink_msg_id=d.id, + value=d.value, + event_time=d.event_time.ToDatetime(), + watermark=d.watermark.ToDatetime(), + ) + yield datum + + +class SyncSinkServicer(sink_pb2_grpc.SinkServicer): + """ + This class is used to create a new grpc Sink servicer instance. + It implements the SinkServicer interface from the proto sink.proto file. + Provides the functionality for the required rpc methods. 
+ """ + + def __init__( + self, + handler: SyncSinkCallable, + ): + self.__sink_handler: SyncSinkCallable = handler + + def SinkFn( + self, request_iterator: Iterator[sink_pb2.SinkRequest], context: NumaflowServicerContext + ) -> sink_pb2.SinkResponse: + """ + Applies a sink function to a list of datum elements. + The pascal case function name comes from the proto sink_pb2_grpc.py file. + """ + # if there is an exception, we will mark all the responses as a failure + datum_iterator = datum_generator(request_iterator) + try: + rspns = self.__sink_handler(datum_iterator) + except Exception as err: + err_msg = "UDSinkError: %r" % err + _LOGGER.critical(err_msg, exc_info=True) + rspns = Responses() + for _datum in datum_iterator: + rspns.append(Response.as_failure(_datum.id, err_msg)) + + responses = [] + for rspn in rspns: + responses.append( + sink_pb2.SinkResponse.Result(id=rspn.id, success=rspn.success, err_msg=rspn.err) + ) + + return sink_pb2.SinkResponse(results=responses) + + def IsReady( + self, request: _empty_pb2.Empty, context: NumaflowServicerContext + ) -> sink_pb2.ReadyResponse: + """ + IsReady is the heartbeat endpoint for gRPC. + The pascal case function name comes from the proto sink_pb2_grpc.py file. + """ + return sink_pb2.ReadyResponse(ready=True) diff --git a/pynumaflow/sinker/sink.py b/pynumaflow/sinker/sink.py deleted file mode 100644 index f3b21c07..00000000 --- a/pynumaflow/sinker/sink.py +++ /dev/null @@ -1,126 +0,0 @@ -import os - -import aiorun -import grpc - -from pynumaflow.sinker.async_sink import AsyncSinker -from pynumaflow.proto.sinker import sink_pb2_grpc - -from pynumaflow.sinker.server import Sinker - -from pynumaflow._constants import ( - SINK_SOCK_PATH, - MAX_MESSAGE_SIZE, - MAX_THREADS, - ServerType, - _LOGGER, - UDFType, -) - -from pynumaflow.shared.server import NumaflowServer, sync_server_start, start_async_server -from pynumaflow.sinker._dtypes import SinkCallable - - -class SinkServer(NumaflowServer): - """ - SinkServer is the main class to start a gRPC server for a sinker. - """ - - def __init__( - self, - sinker_instance: SinkCallable, - sock_path=SINK_SOCK_PATH, - max_message_size=MAX_MESSAGE_SIZE, - max_threads=MAX_THREADS, - server_type=ServerType.Sync, - ): - """ - Create a new grpc Sink Server instance. - A new servicer instance is created and attached to the server. - The server instance is returned. - Args: - sinker_instance: The sinker instance to be used for Sink UDF - sock_path: The UNIX socket path to be used for the server - max_message_size: The max message size in bytes the server can receive and send - max_threads: The max number of threads to be spawned; - defaults to number of processors x4 - server_type: The type of server to be used, this can be one of the following: - - ServerType.Sync: Synchronous server - - ServerType.Async: Asynchronous server - """ - self.sock_path = f"unix://{sock_path}" - self.max_threads = min(max_threads, int(os.getenv("MAX_THREADS", "4"))) - self.max_message_size = max_message_size - - self.sinker_instance = sinker_instance - self.server_type = server_type - - self._server_options = [ - ("grpc.max_send_message_length", self.max_message_size), - ("grpc.max_receive_message_length", self.max_message_size), - ] - - def start(self): - """ - Starter function for the server class, Handles the server type and - starts the server accordingly. If the server type is not supported, - raises NotImplementedError. 
- Currently supported server types are: - - ServerType.Sync: Synchronous server - - ServerType.Async: Asynchronous server - """ - if self.server_type == ServerType.Sync: - self.exec() - elif self.server_type == ServerType.Async: - aiorun.run(self.aexec(), use_uvloop=True) - else: - _LOGGER.error("Server type not supported - %s", str(self.server_type)) - raise NotImplementedError - - def exec(self): - """ - Starts the Synchronous gRPC server on the - given UNIX socket with given max threads. - """ - # Get the servicer instance - sink_servicer = self.get_servicer( - sinker_instance=self.sinker_instance, server_type=self.server_type - ) - _LOGGER.info( - "Sync GRPC Sink listening on: %s with max threads: %s", - self.sock_path, - self.max_threads, - ) - # Start the server - sync_server_start( - servicer=sink_servicer, - bind_address=self.sock_path, - max_threads=self.max_threads, - server_options=self._server_options, - udf_type=UDFType.Sink, - ) - - async def aexec(self): - """ - Starts the Asynchronous gRPC server on the given UNIX socket with given max threads. - """ - # As the server is async, we need to create a new server instance in the - # same thread as the event loop so that all the async calls are made in the - # same context - # Create a new server instance, add the servicer to it and start the server - server = grpc.aio.server() - server.add_insecure_port(self.sock_path) - sink_servicer = self.get_servicer( - sinker_instance=self.sinker_instance, server_type=self.server_type - ) - sink_pb2_grpc.add_SinkServicer_to_server(sink_servicer, server) - await start_async_server(server, self.sock_path, self.max_threads, self._server_options) - - def get_servicer(self, sinker_instance: SinkCallable, server_type: ServerType): - """ - Returns the servicer instance based on the server type. - """ - if server_type == ServerType.Sync: - return Sinker(sinker_instance) - elif server_type == ServerType.Async: - return AsyncSinker(sinker_instance) diff --git a/pynumaflow/sourcer/__init__.py b/pynumaflow/sourcer/__init__.py index 5d558433..a90d2e4d 100644 --- a/pynumaflow/sourcer/__init__.py +++ b/pynumaflow/sourcer/__init__.py @@ -1,7 +1,3 @@ -from pynumaflow._constants import ServerType - -from pynumaflow.sourcer.source import SourceServer - from pynumaflow.sourcer._dtypes import ( Message, ReadRequest, @@ -10,8 +6,10 @@ Offset, PartitionsResponse, get_default_partitions, - SourcerClass, + Sourcer, ) +from pynumaflow.sourcer.async_server import SourceAsyncServer +from pynumaflow.sourcer.server import SourceServer __all__ = [ "Message", @@ -22,6 +20,6 @@ "PartitionsResponse", "get_default_partitions", "SourceServer", - "SourcerClass", - "ServerType", + "Sourcer", + "SourceAsyncServer", ] diff --git a/pynumaflow/sourcer/_dtypes.py b/pynumaflow/sourcer/_dtypes.py index 27246c43..f26e9a40 100644 --- a/pynumaflow/sourcer/_dtypes.py +++ b/pynumaflow/sourcer/_dtypes.py @@ -204,7 +204,7 @@ def partitions(self) -> list[int]: return self._partitions -class SourcerClass(metaclass=ABCMeta): +class Sourcer(metaclass=ABCMeta): """ Provides an interface to write a Sourcer which will be exposed over an gRPC server. 
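As with the reducer, the Sourcer interface above now plugs into either SourceServer (sync) or SourceAsyncServer (async) instead of a ServerType flag. A minimal sync sketch; the handler names, Offset/PendingResponse/PartitionsResponse shapes, and the sourcer_instance argument follow the diffs in this patch, PendingResponse and AckRequest are assumed to be exported alongside the other _dtypes, and the in-memory counter is only illustrative:

    from collections.abc import Iterable
    from datetime import datetime, timezone

    from pynumaflow.sourcer import (
        AckRequest,
        Message,
        Offset,
        PartitionsResponse,
        PendingResponse,
        ReadRequest,
        SourceServer,
        Sourcer,
        get_default_partitions,
    )


    class CounterSource(Sourcer):
        def __init__(self):
            self.next_id = 0

        def read_handler(self, datum: ReadRequest) -> Iterable[Message]:
            # Emit up to num_records monotonically increasing integers.
            for _ in range(datum.num_records):
                yield Message(
                    payload=str(self.next_id).encode(),
                    offset=Offset(str(self.next_id).encode(), 0),
                    event_time=datetime.now(timezone.utc),
                )
                self.next_id += 1

        def ack_handler(self, ack_request: AckRequest) -> None:
            # Nothing to clean up for an in-memory source.
            pass

        def pending_handler(self) -> PendingResponse:
            # An in-memory counter never has a backlog.
            return PendingResponse(count=0)

        def partitions_handler(self) -> PartitionsResponse:
            return PartitionsResponse(partitions=get_default_partitions())


    if __name__ == "__main__":
        SourceServer(sourcer_instance=CounterSource()).start()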
@@ -257,7 +257,7 @@ def partitions_handler(self) -> PartitionsResponse: SourceReadCallable = Callable[[ReadRequest], Iterable[Message]] AsyncSourceReadCallable = Callable[[ReadRequest], AsyncIterable[Message]] SourceAckCallable = Callable[[AckRequest], None] -SourceCallable = SourcerClass +SourceCallable = Sourcer def get_default_partitions() -> list[int]: diff --git a/pynumaflow/sourcer/async_server.py b/pynumaflow/sourcer/async_server.py index 58598068..8e8f2dd1 100644 --- a/pynumaflow/sourcer/async_server.py +++ b/pynumaflow/sourcer/async_server.py @@ -1,129 +1,74 @@ -from collections.abc import AsyncIterable -from google.protobuf import timestamp_pb2 as _timestamp_pb2 +import os + +import aiorun import grpc -from google.protobuf import empty_pb2 as _empty_pb2 +from pynumaflow.sourcer.servicer.async_servicer import AsyncSourceServicer -from pynumaflow.sourcer._dtypes import ReadRequest -from pynumaflow.sourcer._dtypes import Offset, AckRequest, SourceCallable -from pynumaflow.proto.sourcer import source_pb2 +from pynumaflow._constants import ( + SOURCE_SOCK_PATH, + MAX_MESSAGE_SIZE, + MAX_THREADS, +) from pynumaflow.proto.sourcer import source_pb2_grpc -from pynumaflow.types import NumaflowServicerContext -from pynumaflow._constants import _LOGGER + +from pynumaflow.shared.server import NumaflowServer, start_async_server +from pynumaflow.sourcer._dtypes import SourceCallable -class AsyncSourcer(source_pb2_grpc.SourceServicer): +class SourceAsyncServer(NumaflowServer): """ - This class is used to create a new grpc Source servicer instance. - It implements the SourceServicer interface from the proto source.proto file. - Provides the functionality for the required rpc methods. + Class for a new Async Source Server instance. """ - def __init__(self, source_handler: SourceCallable): - self.source_handler = source_handler - self.__source_read_handler = source_handler.read_handler - self.__source_ack_handler = source_handler.ack_handler - self.__source_pending_handler = source_handler.pending_handler - self.__source_partitions_handler = source_handler.partitions_handler - self.cleanup_coroutines = [] - - async def ReadFn( + def __init__( self, - request: source_pb2.ReadRequest, - context: NumaflowServicerContext, - ) -> AsyncIterable[source_pb2.ReadResponse]: + sourcer_instance: SourceCallable, + sock_path=SOURCE_SOCK_PATH, + max_message_size=MAX_MESSAGE_SIZE, + max_threads=MAX_THREADS, + ): """ - Applies a Read function and returns a stream of datum responses. - The pascal case function name comes from the proto source_pb2_grpc.py file. + Create a new grpc Source Server instance. + A new servicer instance is created and attached to the server. + The server instance is returned. 
+ Args: + sourcer_instance: The sourcer instance to be used for Source UDF + sock_path: The UNIX socket path to be used for the server + max_message_size: The max message size in bytes the server can receive and send + max_threads: The max number of threads to be spawned; + defaults to number of processors x4 """ + self.sock_path = f"unix://{sock_path}" + self.max_threads = min(max_threads, int(os.getenv("MAX_THREADS", "4"))) + self.max_message_size = max_message_size - async for res in self.__invoke_source_read_stream( - ReadRequest( - num_records=request.request.num_records, - timeout_in_ms=request.request.timeout_in_ms, - ) - ): - yield source_pb2.ReadResponse(result=res) + self.sourcer_instance = sourcer_instance - async def __invoke_source_read_stream(self, req: ReadRequest): - try: - async for msg in self.__source_read_handler(req): - event_time_timestamp = _timestamp_pb2.Timestamp() - event_time_timestamp.FromDatetime(dt=msg.event_time) - yield source_pb2.ReadResponse.Result( - payload=msg.payload, - keys=msg.keys, - offset=msg.offset.as_dict, - event_time=event_time_timestamp, - ) - except Exception as err: - _LOGGER.critical("User-Defined Source ReadError ", exc_info=True) - raise err - - async def AckFn( - self, request: source_pb2.AckRequest, context: NumaflowServicerContext - ) -> source_pb2.AckResponse: - """ - Applies an Ack function in User Defined Source - """ - # proto repeated field(offsets) is of type google._upb._message.RepeatedScalarContainer - # we need to explicitly convert it to list - offsets = [] - for offset in request.request.offsets: - offsets.append(Offset(offset.offset, offset.partition_id)) - try: - await self.__invoke_ack(ack_req=offsets) - except Exception as e: - context.set_code(grpc.StatusCode.UNKNOWN) - context.set_details(str(e)) - raise e + self._server_options = [ + ("grpc.max_send_message_length", self.max_message_size), + ("grpc.max_receive_message_length", self.max_message_size), + ] - return source_pb2.AckResponse() + self.servicer = AsyncSourceServicer(source_handler=sourcer_instance) - async def __invoke_ack(self, ack_req: list[Offset]): + def start(self): """ - Invokes the Source Ack Function. + Starter function for the Async server class, need a separate caller + so that all the async coroutines can be started from a single context """ - try: - await self.__source_ack_handler(AckRequest(offsets=ack_req)) - except Exception as err: - _LOGGER.critical("AckFn Error", exc_info=True) - raise err - return source_pb2.AckResponse.Result() + aiorun.run(self.aexec(), use_uvloop=True) - async def IsReady( - self, request: _empty_pb2.Empty, context: NumaflowServicerContext - ) -> source_pb2.ReadyResponse: + async def aexec(self): """ - IsReady is the heartbeat endpoint for gRPC. - The pascal case function name comes from the proto source_pb2_grpc.py file. + Starts the Async gRPC server on the given UNIX socket with given max threads """ - return source_pb2.ReadyResponse(ready=True) - async def PendingFn( - self, request: _empty_pb2.Empty, context: NumaflowServicerContext - ) -> source_pb2.PendingResponse: - """ - PendingFn returns the number of pending records - at the user defined source. 
- """ - try: - count = await self.__source_pending_handler() - except Exception as err: - _LOGGER.critical("PendingFn Error", exc_info=True) - raise err - resp = source_pb2.PendingResponse.Result(count=count.count) - return source_pb2.PendingResponse(result=resp) - - async def PartitionsFn( - self, request: _empty_pb2.Empty, context: NumaflowServicerContext - ) -> source_pb2.PartitionsResponse: - """ - PartitionsFn returns the partitions of the user defined source. - """ - try: - partitions = await self.__source_partitions_handler() - except Exception as err: - _LOGGER.critical("PartitionsFn Error", exc_info=True) - raise err - resp = source_pb2.PartitionsResponse.Result(partitions=partitions.partitions) - return source_pb2.PartitionsResponse(result=resp) + # As the server is async, we need to create a new server instance in the + # same thread as the event loop so that all the async calls are made in the + # same context + # Create a new async server instance and add the servicer to it + server = grpc.aio.server() + server.add_insecure_port(self.sock_path) + source_servicer = self.servicer + source_pb2_grpc.add_SourceServicer_to_server(source_servicer, server) + await start_async_server(server, self.sock_path, self.max_threads, self._server_options) diff --git a/pynumaflow/sourcer/server.py b/pynumaflow/sourcer/server.py index e5149d01..2a1bbc8e 100644 --- a/pynumaflow/sourcer/server.py +++ b/pynumaflow/sourcer/server.py @@ -1,143 +1,70 @@ -from collections.abc import Iterable +import os -from google.protobuf import timestamp_pb2 as _timestamp_pb2 -import grpc -from google.protobuf import empty_pb2 as _empty_pb2 - -from pynumaflow.sourcer._dtypes import ReadRequest -from pynumaflow.sourcer._dtypes import ( - SourceReadCallable, - Offset, - AckRequest, - SourceAckCallable, - SourceCallable, +from pynumaflow._constants import ( + SOURCE_SOCK_PATH, + MAX_MESSAGE_SIZE, + MAX_THREADS, + _LOGGER, + UDFType, ) -from pynumaflow.proto.sourcer import source_pb2 -from pynumaflow.proto.sourcer import source_pb2_grpc -from pynumaflow.types import NumaflowServicerContext -from pynumaflow._constants import _LOGGER +from pynumaflow.shared.server import NumaflowServer, sync_server_start +from pynumaflow.sourcer._dtypes import SourceCallable +from pynumaflow.sourcer.servicer.sync_servicer import SyncSourceServicer -class Sourcer(source_pb2_grpc.SourceServicer): +class SourceServer(NumaflowServer): """ - This class is used to create a new grpc Source servicer instance. - It implements the SourceServicer interface from the proto source.proto file. - Provides the functionality for the required rpc methods. + Class for a new Source Server instance. """ def __init__( self, - source_handler: SourceCallable, + sourcer_instance: SourceCallable, + sock_path=SOURCE_SOCK_PATH, + max_message_size=MAX_MESSAGE_SIZE, + max_threads=MAX_THREADS, ): - self.source_handler = source_handler - self.__source_read_handler: SourceReadCallable = source_handler.read_handler - self.__source_ack_handler: SourceAckCallable = source_handler.ack_handler - self.__source_pending_handler = source_handler.pending_handler - self.__source_partitions_handler = source_handler.partitions_handler - - def ReadFn( - self, - request: source_pb2.ReadRequest, - context: NumaflowServicerContext, - ) -> Iterable[source_pb2.ReadResponse]: """ - Applies a Read function to a datum stream in streaming mode. - The pascal case function name comes from the proto source_pb2_grpc.py file. + Create a new grpc Source Server instance. 
+ A new servicer instance is created and attached to the server. + The server instance is returned. + Args: + sourcer_instance: The sourcer instance to be used for Source UDF + sock_path: The UNIX socket path to be used for the server + max_message_size: The max message size in bytes the server can receive and send + max_threads: The max number of threads to be spawned; + defaults to number of processors x4 """ + self.sock_path = f"unix://{sock_path}" + self.max_threads = min(max_threads, int(os.getenv("MAX_THREADS", "4"))) + self.max_message_size = max_message_size - for res in self.__invoke_source_read_stream( - ReadRequest( - num_records=request.request.num_records, - timeout_in_ms=request.request.timeout_in_ms, - ) - ): - yield source_pb2.ReadResponse(result=res) - - def __invoke_source_read_stream(self, req: ReadRequest): - try: - for msg in self.__source_read_handler(req): - event_time_timestamp = _timestamp_pb2.Timestamp() - event_time_timestamp.FromDatetime(dt=msg.event_time) - yield source_pb2.ReadResponse.Result( - payload=msg.payload, - keys=msg.keys, - offset=msg.offset.as_dict, - event_time=event_time_timestamp, - ) - except Exception as err: - _LOGGER.critical("User-Defined Source ReadError ", exc_info=True) - raise err + self.sourcer_instance = sourcer_instance - def AckFn( - self, request: source_pb2.AckRequest, context: NumaflowServicerContext - ) -> source_pb2.AckResponse: - """ - Applies an Ack function in User Defined Source - """ - # proto repeated field(offsets) is of type google._upb._message.RepeatedScalarContainer - # we need to explicitly convert it to list - offsets = [] - for offset in request.request.offsets: - offsets.append(Offset(offset.offset, offset.partition_id)) - try: - self.__invoke_ack(ack_req=offsets) - except Exception as e: - context.set_code(grpc.StatusCode.UNKNOWN) - context.set_details(str(e)) - raise e + self._server_options = [ + ("grpc.max_send_message_length", self.max_message_size), + ("grpc.max_receive_message_length", self.max_message_size), + ] - return source_pb2.AckResponse() - - def __invoke_ack(self, ack_req: list[Offset]): - """ - Invokes the Source Ack Function. - """ - try: - self.__source_ack_handler(AckRequest(offsets=ack_req)) - except Exception as err: - _LOGGER.critical("AckFn Error", exc_info=True) - raise err - return source_pb2.AckResponse.Result() - - def IsReady( - self, request: _empty_pb2.Empty, context: NumaflowServicerContext - ) -> source_pb2.ReadyResponse: - """ - IsReady is the heartbeat endpoint for gRPC. - The pascal case function name comes from the proto source_pb2_grpc.py file. - """ - return source_pb2.ReadyResponse(ready=True) - - def PendingFn( - self, request: _empty_pb2.Empty, context: NumaflowServicerContext - ) -> source_pb2.PendingResponse: - """ - PendingFn returns the number of pending records - at the user defined source. - """ - try: - count = self.__source_pending_handler() - except Exception as err: - _LOGGER.critical("PendingFn error", exc_info=True) - raise err - resp = source_pb2.PendingResponse.Result(count=count.count) - return source_pb2.PendingResponse(result=resp) + self.servicer = SyncSourceServicer(source_handler=sourcer_instance) - def PartitionsFn( - self, request: _empty_pb2.Empty, context: NumaflowServicerContext - ) -> source_pb2.PartitionsResponse: + def start(self): """ - Partitions returns the partitions associated with the source, will be used by - the platform to determine the partitions to which the watermark should be published. 
- If the source doesn't have partitions, get_default_partitions() can be used to - return the default partitions. In most cases, the get_default_partitions() - should be enough; the cases where we need to implement custom partitions_handler() - is in a case like Kafka, where a reader can read from multiple Kafka partitions. + Starts the Synchronous Source gRPC server on the given + UNIX socket with given max threads. """ - try: - partitions = self.__source_partitions_handler() - except Exception as err: - _LOGGER.critical("PartitionFn error", exc_info=True) - raise err - resp = source_pb2.PartitionsResponse.Result(partitions=partitions.partitions) - return source_pb2.PartitionsResponse(result=resp) + # Get the servicer instance + source_servicer = self.servicer + _LOGGER.info( + "Sync Source GRPC Server listening on: %s with max threads: %s", + self.sock_path, + self.max_threads, + ) + # Start the sync server + sync_server_start( + servicer=source_servicer, + bind_address=self.sock_path, + max_threads=self.max_threads, + server_options=self._server_options, + udf_type=UDFType.Source, + ) diff --git a/pynumaflow/sourcer/servicer/__init__.py b/pynumaflow/sourcer/servicer/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/pynumaflow/sourcer/servicer/async_servicer.py b/pynumaflow/sourcer/servicer/async_servicer.py new file mode 100644 index 00000000..cdffae92 --- /dev/null +++ b/pynumaflow/sourcer/servicer/async_servicer.py @@ -0,0 +1,129 @@ +from collections.abc import AsyncIterable +from google.protobuf import timestamp_pb2 as _timestamp_pb2 +import grpc +from google.protobuf import empty_pb2 as _empty_pb2 + +from pynumaflow.sourcer._dtypes import ReadRequest +from pynumaflow.sourcer._dtypes import Offset, AckRequest, SourceCallable +from pynumaflow.proto.sourcer import source_pb2 +from pynumaflow.proto.sourcer import source_pb2_grpc +from pynumaflow.types import NumaflowServicerContext +from pynumaflow._constants import _LOGGER + + +class AsyncSourceServicer(source_pb2_grpc.SourceServicer): + """ + This class is used to create a new grpc Source servicer instance. + It implements the SourceServicer interface from the proto source.proto file. + Provides the functionality for the required rpc methods. + """ + + def __init__(self, source_handler: SourceCallable): + self.source_handler = source_handler + self.__source_read_handler = source_handler.read_handler + self.__source_ack_handler = source_handler.ack_handler + self.__source_pending_handler = source_handler.pending_handler + self.__source_partitions_handler = source_handler.partitions_handler + self.cleanup_coroutines = [] + + async def ReadFn( + self, + request: source_pb2.ReadRequest, + context: NumaflowServicerContext, + ) -> AsyncIterable[source_pb2.ReadResponse]: + """ + Applies a Read function and returns a stream of datum responses. + The pascal case function name comes from the proto source_pb2_grpc.py file. 
+ """ + + async for res in self.__invoke_source_read_stream( + ReadRequest( + num_records=request.request.num_records, + timeout_in_ms=request.request.timeout_in_ms, + ) + ): + yield source_pb2.ReadResponse(result=res) + + async def __invoke_source_read_stream(self, req: ReadRequest): + try: + async for msg in self.__source_read_handler(req): + event_time_timestamp = _timestamp_pb2.Timestamp() + event_time_timestamp.FromDatetime(dt=msg.event_time) + yield source_pb2.ReadResponse.Result( + payload=msg.payload, + keys=msg.keys, + offset=msg.offset.as_dict, + event_time=event_time_timestamp, + ) + except Exception as err: + _LOGGER.critical("User-Defined Source ReadError ", exc_info=True) + raise err + + async def AckFn( + self, request: source_pb2.AckRequest, context: NumaflowServicerContext + ) -> source_pb2.AckResponse: + """ + Applies an Ack function in User Defined Source + """ + # proto repeated field(offsets) is of type google._upb._message.RepeatedScalarContainer + # we need to explicitly convert it to list + offsets = [] + for offset in request.request.offsets: + offsets.append(Offset(offset.offset, offset.partition_id)) + try: + await self.__invoke_ack(ack_req=offsets) + except Exception as e: + context.set_code(grpc.StatusCode.UNKNOWN) + context.set_details(str(e)) + raise e + + return source_pb2.AckResponse() + + async def __invoke_ack(self, ack_req: list[Offset]): + """ + Invokes the Source Ack Function. + """ + try: + await self.__source_ack_handler(AckRequest(offsets=ack_req)) + except Exception as err: + _LOGGER.critical("AckFn Error", exc_info=True) + raise err + return source_pb2.AckResponse.Result() + + async def IsReady( + self, request: _empty_pb2.Empty, context: NumaflowServicerContext + ) -> source_pb2.ReadyResponse: + """ + IsReady is the heartbeat endpoint for gRPC. + The pascal case function name comes from the proto source_pb2_grpc.py file. + """ + return source_pb2.ReadyResponse(ready=True) + + async def PendingFn( + self, request: _empty_pb2.Empty, context: NumaflowServicerContext + ) -> source_pb2.PendingResponse: + """ + PendingFn returns the number of pending records + at the user defined source. + """ + try: + count = await self.__source_pending_handler() + except Exception as err: + _LOGGER.critical("PendingFn Error", exc_info=True) + raise err + resp = source_pb2.PendingResponse.Result(count=count.count) + return source_pb2.PendingResponse(result=resp) + + async def PartitionsFn( + self, request: _empty_pb2.Empty, context: NumaflowServicerContext + ) -> source_pb2.PartitionsResponse: + """ + PartitionsFn returns the partitions of the user defined source. 
+ """ + try: + partitions = await self.__source_partitions_handler() + except Exception as err: + _LOGGER.critical("PartitionsFn Error", exc_info=True) + raise err + resp = source_pb2.PartitionsResponse.Result(partitions=partitions.partitions) + return source_pb2.PartitionsResponse(result=resp) diff --git a/pynumaflow/sourcer/servicer/sync_servicer.py b/pynumaflow/sourcer/servicer/sync_servicer.py new file mode 100644 index 00000000..824508c5 --- /dev/null +++ b/pynumaflow/sourcer/servicer/sync_servicer.py @@ -0,0 +1,143 @@ +from collections.abc import Iterable + +from google.protobuf import timestamp_pb2 as _timestamp_pb2 +import grpc +from google.protobuf import empty_pb2 as _empty_pb2 + +from pynumaflow.sourcer._dtypes import ReadRequest +from pynumaflow.sourcer._dtypes import ( + SourceReadCallable, + Offset, + AckRequest, + SourceAckCallable, + SourceCallable, +) +from pynumaflow.proto.sourcer import source_pb2 +from pynumaflow.proto.sourcer import source_pb2_grpc +from pynumaflow.types import NumaflowServicerContext +from pynumaflow._constants import _LOGGER + + +class SyncSourceServicer(source_pb2_grpc.SourceServicer): + """ + This class is used to create a new grpc Source servicer instance. + It implements the SourceServicer interface from the proto source.proto file. + Provides the functionality for the required rpc methods. + """ + + def __init__( + self, + source_handler: SourceCallable, + ): + self.source_handler = source_handler + self.__source_read_handler: SourceReadCallable = source_handler.read_handler + self.__source_ack_handler: SourceAckCallable = source_handler.ack_handler + self.__source_pending_handler = source_handler.pending_handler + self.__source_partitions_handler = source_handler.partitions_handler + + def ReadFn( + self, + request: source_pb2.ReadRequest, + context: NumaflowServicerContext, + ) -> Iterable[source_pb2.ReadResponse]: + """ + Applies a Read function to a datum stream in streaming mode. + The pascal case function name comes from the proto source_pb2_grpc.py file. + """ + + for res in self.__invoke_source_read_stream( + ReadRequest( + num_records=request.request.num_records, + timeout_in_ms=request.request.timeout_in_ms, + ) + ): + yield source_pb2.ReadResponse(result=res) + + def __invoke_source_read_stream(self, req: ReadRequest): + try: + for msg in self.__source_read_handler(req): + event_time_timestamp = _timestamp_pb2.Timestamp() + event_time_timestamp.FromDatetime(dt=msg.event_time) + yield source_pb2.ReadResponse.Result( + payload=msg.payload, + keys=msg.keys, + offset=msg.offset.as_dict, + event_time=event_time_timestamp, + ) + except Exception as err: + _LOGGER.critical("User-Defined Source ReadError ", exc_info=True) + raise err + + def AckFn( + self, request: source_pb2.AckRequest, context: NumaflowServicerContext + ) -> source_pb2.AckResponse: + """ + Applies an Ack function in User Defined Source + """ + # proto repeated field(offsets) is of type google._upb._message.RepeatedScalarContainer + # we need to explicitly convert it to list + offsets = [] + for offset in request.request.offsets: + offsets.append(Offset(offset.offset, offset.partition_id)) + try: + self.__invoke_ack(ack_req=offsets) + except Exception as e: + context.set_code(grpc.StatusCode.UNKNOWN) + context.set_details(str(e)) + raise e + + return source_pb2.AckResponse() + + def __invoke_ack(self, ack_req: list[Offset]): + """ + Invokes the Source Ack Function. 
+ """ + try: + self.__source_ack_handler(AckRequest(offsets=ack_req)) + except Exception as err: + _LOGGER.critical("AckFn Error", exc_info=True) + raise err + return source_pb2.AckResponse.Result() + + def IsReady( + self, request: _empty_pb2.Empty, context: NumaflowServicerContext + ) -> source_pb2.ReadyResponse: + """ + IsReady is the heartbeat endpoint for gRPC. + The pascal case function name comes from the proto source_pb2_grpc.py file. + """ + return source_pb2.ReadyResponse(ready=True) + + def PendingFn( + self, request: _empty_pb2.Empty, context: NumaflowServicerContext + ) -> source_pb2.PendingResponse: + """ + PendingFn returns the number of pending records + at the user defined source. + """ + try: + count = self.__source_pending_handler() + except Exception as err: + _LOGGER.critical("PendingFn error", exc_info=True) + raise err + resp = source_pb2.PendingResponse.Result(count=count.count) + return source_pb2.PendingResponse(result=resp) + + def PartitionsFn( + self, request: _empty_pb2.Empty, context: NumaflowServicerContext + ) -> source_pb2.PartitionsResponse: + """ + Partitions returns the partitions associated with the source, will be used by + the platform to determine the partitions to which the watermark should be published. + If the source doesn't have partitions, get_default_partitions() can be used to + return the default partitions. In most cases, the get_default_partitions() + should be enough; the cases where we need to implement custom partitions_handler() + is in a case like Kafka, where a reader can read from multiple Kafka partitions. + """ + try: + partitions = self.__source_partitions_handler() + except Exception as err: + _LOGGER.critical("PartitionFn error", exc_info=True) + raise err + resp = source_pb2.PartitionsResponse.Result(partitions=partitions.partitions) + return source_pb2.PartitionsResponse(result=resp) diff --git a/pynumaflow/sourcer/source.py b/pynumaflow/sourcer/source.py deleted file mode 100644 index b7d066bf..00000000 --- a/pynumaflow/sourcer/source.py +++ /dev/null @@ -1,121 +0,0 @@ -import os - -import aiorun -import grpc -from pynumaflow.sourcer.async_server import AsyncSourcer -from pynumaflow.sourcer.server import Sourcer - -from pynumaflow._constants import ( - SOURCE_SOCK_PATH, - MAX_MESSAGE_SIZE, - MAX_THREADS, - ServerType, - _LOGGER, - UDFType, -) -from pynumaflow.proto.sourcer import source_pb2_grpc - -from pynumaflow.shared.server import NumaflowServer, sync_server_start, start_async_server -from pynumaflow.sourcer._dtypes import SourceCallable - - -class SourceServer(NumaflowServer): - """ - Class for a new Source Server instance. - """ - - def __init__( - self, - sourcer_instance: SourceCallable, - sock_path=SOURCE_SOCK_PATH, - max_message_size=MAX_MESSAGE_SIZE, - max_threads=MAX_THREADS, - server_type=ServerType.Sync, - ): - """ - Create a new grpc Source Server instance. - A new servicer instance is created and attached to the server. - The server instance is returned. 
- Args: - sourcer_instance: The sourcer instance to be used for Source UDF - sock_path: The UNIX socket path to be used for the server - max_message_size: The max message size in bytes the server can receive and send - max_threads: The max number of threads to be spawned; - defaults to number of processors x4 - server_type: The type of server to be used - """ - self.sock_path = f"unix://{sock_path}" - self.max_threads = min(max_threads, int(os.getenv("MAX_THREADS", "4"))) - self.max_message_size = max_message_size - - self.sourcer_instance = sourcer_instance - self.server_type = server_type - - self._server_options = [ - ("grpc.max_send_message_length", self.max_message_size), - ("grpc.max_receive_message_length", self.max_message_size), - ] - - def start(self): - """ - Starter function for the Source server, Handles the server type and - starts the server. - Currrently supported server types: - 1. ServerType.Sync - 2. ServerType.Async - """ - if self.server_type == ServerType.Sync: - self.exec() - elif self.server_type == ServerType.Async: - aiorun.run(self.aexec(), use_uvloop=True) - else: - _LOGGER.error("Server type not supported - %s", str(self.server_type)) - raise NotImplementedError - - def exec(self): - """ - Starts the Synchronous gRPC server on the given UNIX socket with given max threads. - """ - # Get the servicer instance - source_servicer = self.get_servicer( - sourcer_instance=self.sourcer_instance, server_type=self.server_type - ) - _LOGGER.info( - "Sync GRPC Server listening on: %s with max threads: %s", - self.sock_path, - self.max_threads, - ) - # Start the sync server - sync_server_start( - servicer=source_servicer, - bind_address=self.sock_path, - max_threads=self.max_threads, - server_options=self._server_options, - udf_type=UDFType.Source, - ) - - async def aexec(self): - """ - Starts the Async gRPC server on the given UNIX socket with given max threads - """ - - # As the server is async, we need to create a new server instance in the - # same thread as the event loop so that all the async calls are made in the - # same context - # Create a new async server instance and add the servicer to it - server = grpc.aio.server() - server.add_insecure_port(self.sock_path) - source_servicer = self.get_servicer( - sourcer_instance=self.sourcer_instance, server_type=self.server_type - ) - source_pb2_grpc.add_SourceServicer_to_server(source_servicer, server) - await start_async_server(server, self.sock_path, self.max_threads, self._server_options) - - def get_servicer(self, sourcer_instance: SourceCallable, server_type: ServerType): - if server_type == ServerType.Sync: - source_servicer = Sourcer(source_handler=sourcer_instance) - elif server_type == ServerType.Async: - source_servicer = AsyncSourcer(source_handler=sourcer_instance) - else: - raise NotImplementedError - return source_servicer diff --git a/pynumaflow/sourcetransformer/__init__.py b/pynumaflow/sourcetransformer/__init__.py index f636e458..69f8018c 100644 --- a/pynumaflow/sourcetransformer/__init__.py +++ b/pynumaflow/sourcetransformer/__init__.py @@ -1,13 +1,12 @@ -from pynumaflow._constants import ServerType - from pynumaflow.sourcetransformer._dtypes import ( Message, Messages, Datum, DROP, - SourceTransformerClass, + SourceTransformer, ) -from pynumaflow.sourcetransformer.sourcetransform import SourceTransformServer +from pynumaflow.sourcetransformer.multiproc_server import SourceTransformMultiProcServer +from pynumaflow.sourcetransformer.server import SourceTransformServer __all__ = [ "Message", @@ -15,6 +14,6 
@@
     "Datum",
     "DROP",
     "SourceTransformServer",
-    "SourceTransformerClass",
-    "ServerType",
+    "SourceTransformer",
+    "SourceTransformMultiProcServer",
 ]
diff --git a/pynumaflow/sourcetransformer/_dtypes.py b/pynumaflow/sourcetransformer/_dtypes.py
index b4526c17..ad0d5426 100644
--- a/pynumaflow/sourcetransformer/_dtypes.py
+++ b/pynumaflow/sourcetransformer/_dtypes.py
@@ -173,7 +173,7 @@ def watermark(self) -> datetime:
         return self._watermark
 
 
-class SourceTransformerClass(metaclass=ABCMeta):
+class SourceTransformer(metaclass=ABCMeta):
     """
     Provides an interface to write a Source Transformer
     which will be exposed over a GRPC server.
@@ -198,4 +198,4 @@ def handler(self, keys: list[str], datum: Datum) -> Messages:
 SourceTransformHandler = Callable[[list[str], Datum], Messages]
 # SourceTransformCallable is the type of the handler function for the
 # Source Transformer UDFunction.
-SourceTransformCallable = Union[SourceTransformHandler, SourceTransformerClass]
+SourceTransformCallable = Union[SourceTransformHandler, SourceTransformer]
diff --git a/pynumaflow/sourcetransformer/multiproc_server.py b/pynumaflow/sourcetransformer/multiproc_server.py
new file mode 100644
index 00000000..23d7101b
--- /dev/null
+++ b/pynumaflow/sourcetransformer/multiproc_server.py
@@ -0,0 +1,75 @@
+import os
+
+from pynumaflow.sourcetransformer.servicer.server import SourceTransformServicer
+
+from pynumaflow.shared.server import start_multiproc_server
+
+from pynumaflow._constants import (
+    MAX_MESSAGE_SIZE,
+    SOURCE_TRANSFORMER_SOCK_PATH,
+    MAX_THREADS,
+    UDFType,
+    _PROCESS_COUNT,
+)
+
+from pynumaflow.sourcetransformer._dtypes import SourceTransformCallable
+
+from pynumaflow.shared import NumaflowServer
+
+
+class SourceTransformMultiProcServer(NumaflowServer):
+    """
+    Class for a new Multiproc Source Transformer Server instance.
+    """
+
+    def __init__(
+        self,
+        source_transform_instance: SourceTransformCallable,
+        server_count: int = _PROCESS_COUNT,
+        sock_path=SOURCE_TRANSFORMER_SOCK_PATH,
+        max_message_size=MAX_MESSAGE_SIZE,
+        max_threads=MAX_THREADS,
+    ):
+        """
+        Create a new grpc Source Transformer Server instance.
+        A new servicer instance is created and attached to the server.
+        The server instance is returned.
+        Args:
+            source_transform_instance: The source transformer instance to be used for
+                Source Transformer UDF
+            sock_path: The UNIX socket path to be used for the server
+            max_message_size: The max message size in bytes the server can receive and send
+            max_threads: The max number of threads to be spawned;
+                defaults to number of processors x4
+        """
+        self.sock_path = f"unix://{sock_path}"
+        self.max_threads = min(max_threads, int(os.getenv("MAX_THREADS", "4")))
+        self.max_message_size = max_message_size
+
+        self.source_transform_instance = source_transform_instance
+
+        self._server_options = [
+            ("grpc.max_send_message_length", self.max_message_size),
+            ("grpc.max_receive_message_length", self.max_message_size),
+            ("grpc.so_reuseport", 1),
+            ("grpc.so_reuseaddr", 1),
+        ]
+        # Set the number of processes to be spawned to the server_count value
+        # provided by the caller, defaulting to the number of CPUs
+        # Setting the max value to 2 * CPU count
+        # Used for multiproc server
+        self._process_count = min(server_count, 2 * _PROCESS_COUNT)
+        self.servicer = SourceTransformServicer(handler=source_transform_instance)
+
+    def start(self):
+        """
+        Starts the Multiproc gRPC server on the given TCP sockets
+        with given max threads.
+ """ + start_multiproc_server( + max_threads=self.max_threads, + servicer=self.servicer, + process_count=self._process_count, + server_options=self._server_options, + udf_type=UDFType.Map, + ) diff --git a/pynumaflow/sourcetransformer/server.py b/pynumaflow/sourcetransformer/server.py index b63fa13c..0f248eb2 100644 --- a/pynumaflow/sourcetransformer/server.py +++ b/pynumaflow/sourcetransformer/server.py @@ -1,73 +1,68 @@ -import grpc -from google.protobuf import empty_pb2 as _empty_pb2 -from google.protobuf import timestamp_pb2 as _timestamp_pb2 +import os -from pynumaflow.sourcetransformer import Datum +from pynumaflow._constants import ( + MAX_MESSAGE_SIZE, + SOURCE_TRANSFORMER_SOCK_PATH, + MAX_THREADS, + _LOGGER, + UDFType, +) +from pynumaflow.shared import NumaflowServer +from pynumaflow.shared.server import sync_server_start from pynumaflow.sourcetransformer._dtypes import SourceTransformCallable -from pynumaflow.proto.sourcetransformer import transform_pb2 -from pynumaflow.proto.sourcetransformer import transform_pb2_grpc -from pynumaflow.types import NumaflowServicerContext -from pynumaflow._constants import _LOGGER +from pynumaflow.sourcetransformer.servicer.server import SourceTransformServicer -class SourceTransformer(transform_pb2_grpc.SourceTransformServicer): +class SourceTransformServer(NumaflowServer): """ - This class is used to create a new grpc SourceTransform servicer instance. - It implements the SourceTransformServicer interface from the proto transform.proto file. - Provides the functionality for the required rpc methods. + Class for a new Source Transformer Server instance. """ def __init__( self, - handler: SourceTransformCallable, + source_transform_instance: SourceTransformCallable, + sock_path=SOURCE_TRANSFORMER_SOCK_PATH, + max_message_size=MAX_MESSAGE_SIZE, + max_threads=MAX_THREADS, ): - self.__transform_handler: SourceTransformCallable = handler - - def SourceTransformFn( - self, request: transform_pb2.SourceTransformRequest, context: NumaflowServicerContext - ) -> transform_pb2.SourceTransformResponse: """ - Applies a function to each datum element. - The pascal case function name comes from the generated transform_pb2_grpc.py file. + Create a new grpc Source Transformer Server instance. + A new servicer instance is created and attached to the server. + The server instance is returned. 
+ Args: + source_transform_instance: The source transformer instance to be used for + Source Transformer UDF + sock_path: The UNIX socket path to be used for the server + max_message_size: The max message size in bytes the server can receive and send + max_threads: The max number of threads to be spawned; + defaults to number of processors x4 """ + self.sock_path = f"unix://{sock_path}" + self.max_threads = min(max_threads, int(os.getenv("MAX_THREADS", "4"))) + self.max_message_size = max_message_size - # proto repeated field(keys) is of type google._upb._message.RepeatedScalarContainer - # we need to explicitly convert it to list - try: - msgts = self.__transform_handler( - list(request.keys), - Datum( - keys=list(request.keys), - value=request.value, - event_time=request.event_time.ToDatetime(), - watermark=request.watermark.ToDatetime(), - ), - ) - except Exception as err: - _LOGGER.critical("UDFError, re-raising the error", exc_info=True) - context.set_code(grpc.StatusCode.UNKNOWN) - context.set_details(str(err)) - return transform_pb2.SourceTransformResponse(results=[]) + self.source_transform_instance = source_transform_instance - datums = [] - for msgt in msgts: - event_time_timestamp = _timestamp_pb2.Timestamp() - event_time_timestamp.FromDatetime(dt=msgt.event_time) - datums.append( - transform_pb2.SourceTransformResponse.Result( - keys=list(msgt.keys), - value=msgt.value, - tags=msgt.tags, - event_time=event_time_timestamp, - ) - ) - return transform_pb2.SourceTransformResponse(results=datums) + self._server_options = [ + ("grpc.max_send_message_length", self.max_message_size), + ("grpc.max_receive_message_length", self.max_message_size), + ] + self.servicer = SourceTransformServicer(handler=source_transform_instance) - def IsReady( - self, request: _empty_pb2.Empty, context: NumaflowServicerContext - ) -> transform_pb2.ReadyResponse: + def start(self): """ - IsReady is the heartbeat endpoint for gRPC. - The pascal case function name comes from the proto transform_pb2_grpc.py file. + Starts the Synchronous gRPC server on the given UNIX socket with given max threads. """ - return transform_pb2.ReadyResponse(ready=True) + _LOGGER.info( + "Sync GRPC Server listening on: %s with max threads: %s", + self.sock_path, + self.max_threads, + ) + # Start the sync server + sync_server_start( + servicer=self.servicer, + bind_address=self.sock_path, + max_threads=self.max_threads, + server_options=self._server_options, + udf_type=UDFType.SourceTransformer, + ) diff --git a/pynumaflow/sourcetransformer/servicer/__init__.py b/pynumaflow/sourcetransformer/servicer/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/pynumaflow/sourcetransformer/servicer/server.py b/pynumaflow/sourcetransformer/servicer/server.py new file mode 100644 index 00000000..6f803fe2 --- /dev/null +++ b/pynumaflow/sourcetransformer/servicer/server.py @@ -0,0 +1,73 @@ +import grpc +from google.protobuf import empty_pb2 as _empty_pb2 +from google.protobuf import timestamp_pb2 as _timestamp_pb2 + +from pynumaflow.sourcetransformer import Datum +from pynumaflow.sourcetransformer._dtypes import SourceTransformCallable +from pynumaflow.proto.sourcetransformer import transform_pb2 +from pynumaflow.proto.sourcetransformer import transform_pb2_grpc +from pynumaflow.types import NumaflowServicerContext +from pynumaflow._constants import _LOGGER + + +class SourceTransformServicer(transform_pb2_grpc.SourceTransformServicer): + """ + This class is used to create a new grpc SourceTransform servicer instance. 
+ It implements the SourceTransformServicer interface from the proto transform.proto file. + Provides the functionality for the required rpc methods. + """ + + def __init__( + self, + handler: SourceTransformCallable, + ): + self.__transform_handler: SourceTransformCallable = handler + + def SourceTransformFn( + self, request: transform_pb2.SourceTransformRequest, context: NumaflowServicerContext + ) -> transform_pb2.SourceTransformResponse: + """ + Applies a function to each datum element. + The pascal case function name comes from the generated transform_pb2_grpc.py file. + """ + + # proto repeated field(keys) is of type google._upb._message.RepeatedScalarContainer + # we need to explicitly convert it to list + try: + msgts = self.__transform_handler( + list(request.keys), + Datum( + keys=list(request.keys), + value=request.value, + event_time=request.event_time.ToDatetime(), + watermark=request.watermark.ToDatetime(), + ), + ) + except Exception as err: + _LOGGER.critical("UDFError, re-raising the error", exc_info=True) + context.set_code(grpc.StatusCode.UNKNOWN) + context.set_details(str(err)) + return transform_pb2.SourceTransformResponse(results=[]) + + datums = [] + for msgt in msgts: + event_time_timestamp = _timestamp_pb2.Timestamp() + event_time_timestamp.FromDatetime(dt=msgt.event_time) + datums.append( + transform_pb2.SourceTransformResponse.Result( + keys=list(msgt.keys), + value=msgt.value, + tags=msgt.tags, + event_time=event_time_timestamp, + ) + ) + return transform_pb2.SourceTransformResponse(results=datums) + + def IsReady( + self, request: _empty_pb2.Empty, context: NumaflowServicerContext + ) -> transform_pb2.ReadyResponse: + """ + IsReady is the heartbeat endpoint for gRPC. + The pascal case function name comes from the proto transform_pb2_grpc.py file. + """ + return transform_pb2.ReadyResponse(ready=True) diff --git a/pynumaflow/sourcetransformer/sourcetransform.py b/pynumaflow/sourcetransformer/sourcetransform.py deleted file mode 100644 index 591a0119..00000000 --- a/pynumaflow/sourcetransformer/sourcetransform.py +++ /dev/null @@ -1,133 +0,0 @@ -import os - -from pynumaflow.sourcetransformer.server import SourceTransformer - -from pynumaflow.shared.server import sync_server_start, start_multiproc_server - -from pynumaflow._constants import ( - MAX_MESSAGE_SIZE, - SOURCE_TRANSFORMER_SOCK_PATH, - MAX_THREADS, - ServerType, - _LOGGER, - UDFType, -) - -from pynumaflow.sourcetransformer._dtypes import SourceTransformCallable - -from pynumaflow.shared import NumaflowServer - - -class SourceTransformServer(NumaflowServer): - """ - Class for a new Source Transformer Server instance. - """ - - def __init__( - self, - source_transform_instance: SourceTransformCallable, - sock_path=SOURCE_TRANSFORMER_SOCK_PATH, - max_message_size=MAX_MESSAGE_SIZE, - max_threads=MAX_THREADS, - server_type=ServerType.Sync, - ): - """ - Create a new grpc Source Transformer Server instance. - A new servicer instance is created and attached to the server. - The server instance is returned. 
- Args: - source_transform_instance: The source transformer instance to be used for - Source Transformer UDF - sock_path: The UNIX socket path to be used for the server - max_message_size: The max message size in bytes the server can receive and send - max_threads: The max number of threads to be spawned; - defaults to number of processors x4 - server_type: The type of server to be used - """ - self.sock_path = f"unix://{sock_path}" - self.max_threads = min(max_threads, int(os.getenv("MAX_THREADS", "4"))) - self.max_message_size = max_message_size - - self.source_transform_instance = source_transform_instance - self.server_type = server_type - - self._server_options = [ - ("grpc.max_send_message_length", self.max_message_size), - ("grpc.max_receive_message_length", self.max_message_size), - ] - if server_type == ServerType.Multiproc: - self._server_options.append(("grpc.so_reuseport", 1)) - self._server_options.append(("grpc.so_reuseaddr", 1)) - - # Set the number of processes to be spawned to the number of CPUs or - # the value of the env var NUM_CPU_MULTIPROC defined by the user - # Setting the max value to 2 * CPU count - # Used for multiproc server - self._process_count = min( - int(os.getenv("NUM_CPU_MULTIPROC", str(os.cpu_count()))), 2 * os.cpu_count() - ) - - def start(self): - """ - Starter function for the Source Transformer server, - Handles the server type and starts the server. - Currrently supported server types: - 1. ServerType.Sync - 2. ServerType.Multiproc - """ - if self.server_type == ServerType.Sync: - self.exec() - elif self.server_type == ServerType.Multiproc: - self.exec_multiproc() - else: - _LOGGER.error("Server type not supported - %s", str(self.server_type)) - raise NotImplementedError - - def exec(self): - """ - Starts the Synchronous gRPC server on the given UNIX socket with given max threads. - """ - transform_servicer = self.get_servicer( - source_transform_instance=self.source_transform_instance, server_type=self.server_type - ) - _LOGGER.info( - "Sync GRPC Server listening on: %s with max threads: %s", - self.sock_path, - self.max_threads, - ) - # Start the sync server - sync_server_start( - servicer=transform_servicer, - bind_address=self.sock_path, - max_threads=self.max_threads, - server_options=self._server_options, - udf_type=UDFType.SourceTransformer, - ) - - def exec_multiproc(self): - """ - Starts the Multiproc gRPC server on the given TCP sockets - with given max threads. - """ - transform_servicer = self.get_servicer( - source_transform_instance=self.source_transform_instance, server_type=self.server_type - ) - start_multiproc_server( - max_threads=self.max_threads, - servicer=transform_servicer, - process_count=self._process_count, - server_options=self._server_options, - udf_type=UDFType.Map, - ) - - def get_servicer( - self, source_transform_instance: SourceTransformCallable, server_type: ServerType - ): - """ - Returns the servicer instance for the given server type. 
- """ - if server_type == ServerType.Sync: - transform_servicer = SourceTransformer(handler=source_transform_instance) - elif server_type == ServerType.Multiproc: - transform_servicer = SourceTransformer(handler=source_transform_instance) - return transform_servicer diff --git a/tests/map/test_sync_mapper.py b/tests/map/test_sync_mapper.py index 1489ae47..839ec2da 100644 --- a/tests/map/test_sync_mapper.py +++ b/tests/map/test_sync_mapper.py @@ -20,10 +20,6 @@ class TestSyncMapper(unittest.TestCase): def setUp(self) -> None: class_instance = ExampleMap() my_server = MapServer(mapper_instance=class_instance) - # my_servicer = my_server.get_servicer( - # mapper_instance=map_handler, server_type=ServerType.Sync - # ) - services = {map_pb2.DESCRIPTOR.services_by_name["Map"]: my_server.servicer} self.test_server = server_from_dictionary(services, strict_real_time()) diff --git a/tests/mapstream/test_async_map_stream_err.py b/tests/mapstream/test_async_map_stream_err.py index feed2459..a1bf137e 100644 --- a/tests/mapstream/test_async_map_stream_err.py +++ b/tests/mapstream/test_async_map_stream_err.py @@ -43,7 +43,7 @@ def startup_callable(loop): async def start_server(): server = grpc.aio.server() - server_instance = MapStreamAsyncServer(map_stream_instance=err_async_map_stream_handler) + server_instance = MapStreamAsyncServer(err_async_map_stream_handler) udfs = server_instance.servicer mapstream_pb2_grpc.add_MapStreamServicer_to_server(udfs, server) listen_addr = "unix:///tmp/async_map_stream_err.sock" diff --git a/tests/sideinput/test_responses.py b/tests/sideinput/test_responses.py index bf0e7edb..859f4bb1 100644 --- a/tests/sideinput/test_responses.py +++ b/tests/sideinput/test_responses.py @@ -1,6 +1,6 @@ import unittest -from pynumaflow.sideinput import Response, SideInputClass +from pynumaflow.sideinput import Response, SideInput class TestResponse(unittest.TestCase): @@ -26,7 +26,7 @@ def test_no_broadcast_message(self): self.assertTrue(succ_response.no_broadcast) -class ExampleSideInput(SideInputClass): +class ExampleSideInput(SideInput): def retrieve_handler(self) -> Response: return Response.broadcast_message(b"testMessage") diff --git a/tests/sideinput/test_side_input_server.py b/tests/sideinput/test_side_input_server.py index d9b2b841..501c378f 100644 --- a/tests/sideinput/test_side_input_server.py +++ b/tests/sideinput/test_side_input_server.py @@ -33,10 +33,8 @@ class TestServer(unittest.TestCase): """ def setUp(self) -> None: - server = SideInputServer(side_input_instance=retrieve_side_input_handler) - my_service = server.get_servicer( - side_input_instance=server.side_input_instance, server_type=server.server_type - ) + server = SideInputServer(retrieve_side_input_handler) + my_service = server.servicer services = {sideinput_pb2.DESCRIPTOR.services_by_name["SideInput"]: my_service} self.test_server = server_from_dictionary(services, strict_real_time()) @@ -56,10 +54,8 @@ def test_side_input_err(self): """ Test the error case for the RetrieveSideInput method, """ - server = SideInputServer(side_input_instance=err_retrieve_handler) - my_service = server.get_servicer( - side_input_instance=server.side_input_instance, server_type=server.server_type - ) + server = SideInputServer(err_retrieve_handler) + my_service = server.servicer services = {sideinput_pb2.DESCRIPTOR.services_by_name["SideInput"]: my_service} self.test_server = server_from_dictionary(services, strict_real_time()) @@ -121,9 +117,7 @@ def test_side_input_no_broadcast(self): where we expect the no_broadcast flag to 
be True. """ server = SideInputServer(side_input_instance=retrieve_no_broadcast_handler) - my_servicer = server.get_servicer( - side_input_instance=server.side_input_instance, server_type=server.server_type - ) + my_servicer = server.servicer services = {sideinput_pb2.DESCRIPTOR.services_by_name["SideInput"]: my_servicer} self.test_server = server_from_dictionary(services, strict_real_time()) @@ -146,10 +140,6 @@ def test_side_input_no_broadcast(self): def test_invalid_input(self): with self.assertRaises(TypeError): SideInputServer() - with self.assertRaises(NotImplementedError): - SideInputServer( - side_input_instance=retrieve_side_input_handler, server_type="test" - ).start() if __name__ == "__main__": diff --git a/tests/sink/test_async_sink.py b/tests/sink/test_async_sink.py index 2fc22809..1ba23edd 100644 --- a/tests/sink/test_async_sink.py +++ b/tests/sink/test_async_sink.py @@ -7,14 +7,14 @@ import grpc from google.protobuf import empty_pb2 as _empty_pb2 from grpc.aio._server import Server -from pynumaflow._constants import ServerType from pynumaflow import setup_logging from pynumaflow.sinker import ( Datum, ) -from pynumaflow.sinker import Responses, Response, SinkServer +from pynumaflow.sinker import Responses, Response from pynumaflow.proto.sinker import sink_pb2_grpc, sink_pb2 +from pynumaflow.sinker.async_server import SinkAsyncServer from tests.sink.test_server import ( mock_message, mock_err_message, @@ -69,8 +69,8 @@ def startup_callable(loop): async def start_server(): server = grpc.aio.server() - server_instance = SinkServer(sinker_instance=udsink_handler, server_type=ServerType.Async) - uds = server_instance.get_servicer(sinker_instance=udsink_handler, server_type=ServerType.Async) + server_instance = SinkAsyncServer(sinker_instance=udsink_handler) + uds = server_instance.servicer sink_pb2_grpc.add_SinkServicer_to_server(uds, server) listen_addr = "unix:///tmp/async_sink.sock" server.add_insecure_port(listen_addr) @@ -163,9 +163,7 @@ def __stub(self): def test_invalid_server_type(self) -> None: with self.assertRaises(TypeError): - SinkServer(server_type=ServerType.Async) - with self.assertRaises(NotImplementedError): - SinkServer(sinker_instance=udsink_handler, server_type="ERORR").start() + SinkAsyncServer() if __name__ == "__main__": diff --git a/tests/sink/test_responses.py b/tests/sink/test_responses.py index 516abb19..032323b8 100644 --- a/tests/sink/test_responses.py +++ b/tests/sink/test_responses.py @@ -1,7 +1,7 @@ import unittest from collections.abc import Iterator -from pynumaflow.sinker import Response, Responses, SinkerClass, Datum +from pynumaflow.sinker import Response, Responses, Sinker, Datum class TestResponse(unittest.TestCase): @@ -40,7 +40,7 @@ def test_responses(self): ) -class ExampleSinkClass(SinkerClass): +class ExampleSinkClass(Sinker): def handler(self, datums: Iterator[Datum]) -> Responses: results = Responses() results.append(Response.as_success("test_message")) diff --git a/tests/sink/test_server.py b/tests/sink/test_server.py index 8678469c..4a9bea86 100644 --- a/tests/sink/test_server.py +++ b/tests/sink/test_server.py @@ -48,9 +48,7 @@ def mock_watermark(): class TestServer(unittest.TestCase): def setUp(self) -> None: server = SinkServer(sinker_instance=udsink_handler) - my_servicer = server.get_servicer( - sinker_instance=server.sinker_instance, server_type=server.server_type - ) + my_servicer = server.servicer services = {sink_pb2.DESCRIPTOR.services_by_name["Sink"]: my_servicer} self.test_server = server_from_dictionary(services, 
strict_real_time()) @@ -71,9 +69,7 @@ def test_is_ready(self): def test_udsink_err(self): server = SinkServer(sinker_instance=err_udsink_handler) - my_servicer = server.get_servicer( - sinker_instance=server.sinker_instance, server_type=server.server_type - ) + my_servicer = server.servicer services = {sink_pb2.DESCRIPTOR.services_by_name["Sink"]: my_servicer} self.test_server = server_from_dictionary(services, strict_real_time()) @@ -165,8 +161,6 @@ def test_forward_message(self): def test_invalid_init(self): with self.assertRaises(TypeError): SinkServer() - with self.assertRaises(NotImplementedError): - SinkServer(sinker_instance=udsink_handler, server_type="ERORR").start() if __name__ == "__main__": diff --git a/tests/source/test_async_source.py b/tests/source/test_async_source.py index b7c7d179..fdf5e756 100644 --- a/tests/source/test_async_source.py +++ b/tests/source/test_async_source.py @@ -6,13 +6,12 @@ import grpc from google.protobuf import empty_pb2 as _empty_pb2 from grpc.aio._server import Server -from pynumaflow._constants import ServerType from pynumaflow import setup_logging +from pynumaflow.proto.sourcer import source_pb2_grpc, source_pb2 from pynumaflow.sourcer import ( - SourceServer, + SourceAsyncServer, ) -from pynumaflow.proto.sourcer import source_pb2_grpc, source_pb2 from tests.source.utils import ( mock_offset, read_req_source_fn, @@ -40,11 +39,8 @@ def startup_callable(loop): def NewAsyncSourcer(): class_instance = AsyncSource() - server = SourceServer(sourcer_instance=class_instance, server_type=ServerType.Async) - - udfs = server.get_servicer( - sourcer_instance=server.sourcer_instance, server_type=server.server_type - ) + server = SourceAsyncServer(sourcer_instance=class_instance) + udfs = server.servicer return udfs diff --git a/tests/source/test_async_source_err.py b/tests/source/test_async_source_err.py index 0a1f9a61..85acef7e 100644 --- a/tests/source/test_async_source_err.py +++ b/tests/source/test_async_source_err.py @@ -6,10 +6,9 @@ import grpc from grpc.aio._server import Server -from pynumaflow._constants import ServerType from pynumaflow import setup_logging -from pynumaflow.sourcer import SourceServer +from pynumaflow.sourcer import SourceAsyncServer from pynumaflow.proto.sourcer import source_pb2_grpc, source_pb2 from google.protobuf import empty_pb2 as _empty_pb2 from tests.source.utils import ( @@ -34,10 +33,8 @@ def startup_callable(loop): async def start_server(): server = grpc.aio.server() class_instance = AsyncSourceError() - server_instance = SourceServer(sourcer_instance=class_instance, server_type=ServerType.Async) - udfs = server_instance.get_servicer( - sourcer_instance=server_instance.sourcer_instance, server_type=ServerType.Async - ) + server_instance = SourceAsyncServer(sourcer_instance=class_instance) + udfs = server_instance.servicer source_pb2_grpc.add_SourceServicer_to_server(udfs, server) listen_addr = "unix:///tmp/async_err_source.sock" server.add_insecure_port(listen_addr) @@ -123,6 +120,10 @@ def test_partition_error(self) -> None: return self.fail("Expected an exception.") + def test_invalid_server_type(self) -> None: + with self.assertRaises(TypeError): + SourceAsyncServer() + if __name__ == "__main__": logging.basicConfig(level=logging.DEBUG) diff --git a/tests/source/test_sync_source.py b/tests/source/test_sync_source.py index 9cfc81fa..5e6c2ca8 100644 --- a/tests/source/test_sync_source.py +++ b/tests/source/test_sync_source.py @@ -19,9 +19,7 @@ class TestSyncSourcer(unittest.TestCase): def setUp(self) -> None: 
class_instance = SyncSource() server = SourceServer(sourcer_instance=class_instance) - my_servicer = server.get_servicer( - sourcer_instance=server.sourcer_instance, server_type=server.server_type - ) + my_servicer = server.servicer services = {source_pb2.DESCRIPTOR.services_by_name["Source"]: my_servicer} self.test_server = server_from_dictionary(services, strict_real_time()) diff --git a/tests/source/test_sync_source_err.py b/tests/source/test_sync_source_err.py index d92cada1..e135913e 100644 --- a/tests/source/test_sync_source_err.py +++ b/tests/source/test_sync_source_err.py @@ -3,7 +3,6 @@ import grpc from google.protobuf import empty_pb2 as _empty_pb2 from grpc_testing import server_from_dictionary, strict_real_time -from pynumaflow._constants import ServerType from pynumaflow.sourcer import SourceServer from pynumaflow.proto.sourcer import source_pb2 @@ -17,10 +16,8 @@ class TestSyncSourcer(unittest.TestCase): def setUp(self) -> None: class_instance = SyncSourceError() - server = SourceServer(sourcer_instance=class_instance, server_type=ServerType.Sync) - my_servicer = server.get_servicer( - sourcer_instance=server.sourcer_instance, server_type=server.server_type - ) + server = SourceServer(sourcer_instance=class_instance) + my_servicer = server.servicer services = {source_pb2.DESCRIPTOR.services_by_name["Source"]: my_servicer} self.test_server = server_from_dictionary(services, strict_real_time()) @@ -103,8 +100,6 @@ def test_source_partition(self): def test_invalid_input(self): with self.assertRaises(TypeError): SourceServer() - with self.assertRaises(NotImplementedError): - SourceServer(sourcer_instance=SyncSourceError(), server_type="random").start() if __name__ == "__main__": diff --git a/tests/source/utils.py b/tests/source/utils.py index 1c7f693b..41a3d637 100644 --- a/tests/source/utils.py +++ b/tests/source/utils.py @@ -6,7 +6,7 @@ PendingResponse, Offset, PartitionsResponse, - SourcerClass, + Sourcer, ) from pynumaflow.proto.sourcer import source_pb2 from tests.testing_utils import mock_event_time @@ -20,7 +20,7 @@ def mock_partitions() -> list[int]: return [1, 2, 3] -class AsyncSource(SourcerClass): +class AsyncSource(Sourcer): async def read_handler(self, datum: ReadRequest) -> AsyncIterable[Message]: payload = b"payload:test_mock_message" keys = ["test_key"] @@ -39,7 +39,7 @@ async def partitions_handler(self) -> PartitionsResponse: return PartitionsResponse(partitions=mock_partitions()) -class SyncSource(SourcerClass): +class SyncSource(Sourcer): def read_handler(self, datum: ReadRequest) -> Iterable[Message]: payload = b"payload:test_mock_message" keys = ["test_key"] @@ -72,7 +72,7 @@ def ack_req_source_fn() -> AckRequest: return request -class AsyncSourceError(SourcerClass): +class AsyncSourceError(Sourcer): # This handler mimics the scenario where map stream UDF throws a runtime error. 
async def read_handler(self, datum: ReadRequest) -> AsyncIterable[Message]: payload = b"payload:test_mock_message" @@ -93,7 +93,7 @@ async def partitions_handler(self) -> PartitionsResponse: raise RuntimeError("Got a runtime error from partition handler.") -class SyncSourceError(SourcerClass): +class SyncSourceError(Sourcer): def read_handler(self, datum: ReadRequest) -> Iterable[Message]: raise RuntimeError("Got a runtime error from read handler.") diff --git a/tests/sourcetransform/test_messages.py b/tests/sourcetransform/test_messages.py index dd6ae39e..9f6baceb 100644 --- a/tests/sourcetransform/test_messages.py +++ b/tests/sourcetransform/test_messages.py @@ -1,7 +1,7 @@ import unittest from datetime import datetime, timezone -from pynumaflow.sourcetransformer import Messages, Message, DROP, SourceTransformerClass, Datum +from pynumaflow.sourcetransformer import Messages, Message, DROP, SourceTransformer, Datum from tests.testing_utils import mock_new_event_time @@ -94,7 +94,7 @@ def test_err(self): msgts[:1] -class ExampleSourceTransformClass(SourceTransformerClass): +class ExampleSourceTransformClass(SourceTransformer): def handler(self, keys: list[str], datum: Datum) -> Messages: messages = Messages() messages.append(Message(mock_message_t(), mock_new_event_time(), keys=keys)) diff --git a/tests/sourcetransform/test_multiproc.py b/tests/sourcetransform/test_multiproc.py index 10196857..da4dcffb 100644 --- a/tests/sourcetransform/test_multiproc.py +++ b/tests/sourcetransform/test_multiproc.py @@ -1,17 +1,15 @@ import os import unittest from unittest import mock -from unittest.mock import Mock, patch import grpc from google.protobuf import empty_pb2 as _empty_pb2 from google.protobuf import timestamp_pb2 as _timestamp_pb2 from grpc import StatusCode from grpc_testing import server_from_dictionary, strict_real_time -from pynumaflow._constants import ServerType from pynumaflow.proto.sourcetransformer import transform_pb2 -from pynumaflow.sourcetransformer import SourceTransformServer +from pynumaflow.sourcetransformer.multiproc_server import SourceTransformMultiProcServer from tests.sourcetransform.utils import transform_handler, err_transform_handler from tests.testing_utils import ( mock_event_time, @@ -28,46 +26,32 @@ def mockenv(**envvars): class TestMultiProcMethods(unittest.TestCase): def setUp(self) -> None: - server = SourceTransformServer( - source_transform_instance=transform_handler, server_type=ServerType.Multiproc - ) - my_servicer = server.get_servicer( - source_transform_instance=server.source_transform_instance, - server_type=server.server_type, - ) + server = SourceTransformMultiProcServer(source_transform_instance=transform_handler) + my_servicer = server.servicer services = {transform_pb2.DESCRIPTOR.services_by_name["SourceTransform"]: my_servicer} self.test_server = server_from_dictionary(services, strict_real_time()) - @mockenv(NUM_CPU_MULTIPROC="3") def test_multiproc_init(self) -> None: - server = SourceTransformServer( - source_transform_instance=transform_handler, server_type=ServerType.Multiproc + server = SourceTransformMultiProcServer( + source_transform_instance=transform_handler, server_count=3 ) self.assertEqual(server._process_count, 3) - @patch("os.cpu_count", Mock(return_value=4)) def test_multiproc_process_count(self) -> None: - server = SourceTransformServer( - source_transform_instance=transform_handler, server_type=ServerType.Multiproc - ) - self.assertEqual(server._process_count, 4) + default_value = os.cpu_count() + server = 
SourceTransformMultiProcServer(source_transform_instance=transform_handler) + self.assertEqual(server._process_count, default_value) - @patch("os.cpu_count", Mock(return_value=4)) - @mockenv(NUM_CPU_MULTIPROC="10") def test_max_process_count(self) -> None: - server = SourceTransformServer( - source_transform_instance=transform_handler, server_type=ServerType.Multiproc + default_value = os.cpu_count() + server = SourceTransformMultiProcServer( + source_transform_instance=transform_handler, server_count=50 ) - self.assertEqual(server._process_count, 8) + self.assertEqual(server._process_count, 2 * default_value) def test_udf_mapt_err(self): - server = SourceTransformServer( - source_transform_instance=err_transform_handler, server_type=ServerType.Multiproc - ) - my_servicer = server.get_servicer( - source_transform_instance=server.source_transform_instance, - server_type=server.server_type, - ) + server = SourceTransformMultiProcServer(source_transform_instance=err_transform_handler) + my_servicer = server.servicer services = {transform_pb2.DESCRIPTOR.services_by_name["SourceTransform"]: my_servicer} self.test_server = server_from_dictionary(services, strict_real_time()) @@ -161,7 +145,7 @@ def test_mapt_assign_new_event_time(self, test_server=None): def test_invalid_input(self): with self.assertRaises(TypeError): - SourceTransformServer(server_type=ServerType.Multiproc) + SourceTransformMultiProcServer() if __name__ == "__main__": diff --git a/tests/sourcetransform/test_sync_server.py b/tests/sourcetransform/test_sync_server.py index 7ca38141..382c36e5 100644 --- a/tests/sourcetransform/test_sync_server.py +++ b/tests/sourcetransform/test_sync_server.py @@ -5,7 +5,6 @@ from google.protobuf import timestamp_pb2 as _timestamp_pb2 from grpc import StatusCode from grpc_testing import server_from_dictionary, strict_real_time -from pynumaflow._constants import ServerType from pynumaflow.sourcetransformer import SourceTransformServer from pynumaflow.proto.sourcetransformer import transform_pb2 @@ -20,13 +19,8 @@ class TestServer(unittest.TestCase): def setUp(self) -> None: - server = SourceTransformServer( - source_transform_instance=transform_handler, server_type=ServerType.Sync - ) - my_servicer = server.get_servicer( - source_transform_instance=server.source_transform_instance, - server_type=server.server_type, - ) + server = SourceTransformServer(source_transform_instance=transform_handler) + my_servicer = server.servicer services = {transform_pb2.DESCRIPTOR.services_by_name["SourceTransform"]: my_servicer} self.test_server = server_from_dictionary(services, strict_real_time()) @@ -35,19 +29,13 @@ def test_init_with_args(self) -> None: source_transform_instance=transform_handler, sock_path="/tmp/test.sock", max_message_size=1024 * 1024 * 5, - server_type=ServerType.Sync, ) self.assertEqual(server.sock_path, "unix:///tmp/test.sock") self.assertEqual(server.max_message_size, 1024 * 1024 * 5) def test_udf_mapt_err(self): - server = SourceTransformServer( - source_transform_instance=err_transform_handler, server_type=ServerType.Sync - ) - my_servicer = server.get_servicer( - source_transform_instance=server.source_transform_instance, - server_type=server.server_type, - ) + server = SourceTransformServer(source_transform_instance=err_transform_handler) + my_servicer = server.servicer services = {transform_pb2.DESCRIPTOR.services_by_name["SourceTransform"]: my_servicer} self.test_server = server_from_dictionary(services, strict_real_time()) @@ -145,10 +133,6 @@ def 
test_mapt_assign_new_event_time(self, test_server=None): def test_invalid_input(self): with self.assertRaises(TypeError): SourceTransformServer() - with self.assertRaises(NotImplementedError): - SourceTransformServer( - source_transform_instance=transform_handler, server_type=ServerType.Async - ).start() if __name__ == "__main__": From c0a025c512a9e1a56dd71045b65762f23edf1a05 Mon Sep 17 00:00:00 2001 From: Sidhant Kohli Date: Wed, 17 Jan 2024 00:46:43 -0800 Subject: [PATCH 60/78] examples Signed-off-by: Sidhant Kohli --- examples/map/even_odd/example.py | 2 +- examples/map/even_odd/pyproject.toml | 2 +- examples/map/flatmap/pipeline.yaml | 2 +- examples/map/flatmap/pyproject.toml | 2 +- examples/map/forward_message/Makefile | 2 +- examples/map/forward_message/example.py | 25 ++++++++++++- examples/map/forward_message/pyproject.toml | 2 +- examples/map/multiproc_map/Makefile | 2 +- examples/map/multiproc_map/README.md | 16 +++----- examples/map/multiproc_map/example.py | 11 +++--- examples/map/multiproc_map/pipeline.yaml | 4 +- examples/map/multiproc_map/pyproject.toml | 2 +- examples/mapstream/flatmap_stream/example.py | 29 ++++++++++++--- .../mapstream/flatmap_stream/pyproject.toml | 2 +- examples/reduce/counter/example.py | 9 ++--- examples/reduce/counter/pyproject.toml | 2 +- examples/sideinput/simple-sideinput/Makefile | 2 +- .../sideinput/simple-sideinput/example.py | 37 ++++++++++--------- .../simple-sideinput/pipeline-numaflow.yaml | 2 +- .../sideinput/simple-sideinput/pyproject.toml | 2 +- examples/sink/async_log/Makefile | 2 +- examples/sink/async_log/example.py | 7 ++-- .../sink/async_log/pipeline-numaflow.yaml | 2 +- examples/sink/async_log/pyproject.toml | 2 +- examples/sink/log/Makefile | 2 +- examples/sink/log/example.py | 6 +-- examples/sink/log/pipeline-numaflow.yaml | 4 +- examples/sink/log/pyproject.toml | 2 +- examples/source/async-source/Makefile | 2 +- examples/source/async-source/example.py | 15 +++----- .../async-source/pipeline-numaflow.yaml | 2 +- examples/source/async-source/pyproject.toml | 2 +- examples/source/simple-source/Makefile | 2 +- examples/source/simple-source/example.py | 6 +-- .../simple-source/pipeline-numaflow.yaml | 2 +- examples/source/simple-source/pyproject.toml | 2 +- .../event_time_filter/Makefile | 2 +- .../event_time_filter/example.py | 2 +- .../event_time_filter/pyproject.toml | 2 +- 39 files changed, 127 insertions(+), 96 deletions(-) diff --git a/examples/map/even_odd/example.py b/examples/map/even_odd/example.py index b35fe371..da5c75a6 100644 --- a/examples/map/even_odd/example.py +++ b/examples/map/even_odd/example.py @@ -22,5 +22,5 @@ def my_handler(keys: list[str], datum: Datum) -> Messages: if __name__ == "__main__": - grpc_server = MapServer(mapper_instance=my_handler) + grpc_server = MapServer(my_handler) grpc_server.start() diff --git a/examples/map/even_odd/pyproject.toml b/examples/map/even_odd/pyproject.toml index 94e721a5..63e388cf 100644 --- a/examples/map/even_odd/pyproject.toml +++ b/examples/map/even_odd/pyproject.toml @@ -6,7 +6,7 @@ authors = ["Numaflow developers"] [tool.poetry.dependencies] python = "~3.10" -pynumaflow = "~0.6.1" +pynumaflow = "~0.7.0" [tool.poetry.dev-dependencies] diff --git a/examples/map/flatmap/pipeline.yaml b/examples/map/flatmap/pipeline.yaml index 5e1127d1..41a7c2f7 100644 --- a/examples/map/flatmap/pipeline.yaml +++ b/examples/map/flatmap/pipeline.yaml @@ -15,7 +15,7 @@ spec: - name: flatmap udf: container: - image: "quay.io/numaio/numaflow-python/map-flatmap:v0.5.0" + image: 
"quay.io/numaio/numaflow-python/map-flatmap:v0.7.0" env: - name: PYTHONDEBUG value: "true" diff --git a/examples/map/flatmap/pyproject.toml b/examples/map/flatmap/pyproject.toml index 41fcecb5..badafc14 100644 --- a/examples/map/flatmap/pyproject.toml +++ b/examples/map/flatmap/pyproject.toml @@ -6,7 +6,7 @@ authors = ["Numaflow developers"] [tool.poetry.dependencies] python = "~3.10" -pynumaflow = "~0.6.1" +pynumaflow = "~0.7.0" [tool.poetry.dev-dependencies] diff --git a/examples/map/forward_message/Makefile b/examples/map/forward_message/Makefile index 982dfc22..6bbab66a 100644 --- a/examples/map/forward_message/Makefile +++ b/examples/map/forward_message/Makefile @@ -1,6 +1,6 @@ .PHONY: image image: - docker build -t "quay.io/numaio/numaflow-python/map-forward-message:v0.5.0" . + docker build -t "quay.io/numaio/numaflow-python/map-forward-message:v0.7.0" . # Github CI runner uses platform linux/amd64. If your local environment don't, the image built by command above might not work # under the CI E2E test environment. # To build an image that supports multiple platforms(linux/amd64,linux/arm64) and push to quay.io, use the following command diff --git a/examples/map/forward_message/example.py b/examples/map/forward_message/example.py index d4425fc0..012f1086 100644 --- a/examples/map/forward_message/example.py +++ b/examples/map/forward_message/example.py @@ -1,4 +1,17 @@ -from pynumaflow.mapper import Messages, Message, Datum, MapServer +import os + +from pynumaflow.mapper import Messages, Message, Datum, MapServer, Mapper + + +class Example(Mapper): + + def handler(self, keys: list[str], datum: Datum) -> Messages: + val = datum.value + _ = datum.event_time + _ = datum.watermark + messages = Messages() + messages.append(Message(value=val, keys=keys)) + return messages def my_handler(keys: list[str], datum: Datum) -> Messages: @@ -11,5 +24,13 @@ def my_handler(keys: list[str], datum: Datum) -> Messages: if __name__ == "__main__": - grpc_server = MapServer(mapper_instance=my_handler) + invoke = os.getenv("INVOKE", "handler") + # Use the class based approach or function based handler + # based on the env variable + # Both can be used and passed directly to the server class + if invoke == "class": + handler = Example() + else: + handler = my_handler + grpc_server = MapServer(handler) grpc_server.start() diff --git a/examples/map/forward_message/pyproject.toml b/examples/map/forward_message/pyproject.toml index db000ba6..441e8dd4 100644 --- a/examples/map/forward_message/pyproject.toml +++ b/examples/map/forward_message/pyproject.toml @@ -6,7 +6,7 @@ authors = ["Numaflow developers"] [tool.poetry.dependencies] python = "~3.10" -pynumaflow = "~0.6.1" +pynumaflow = "~0.7.0" [tool.poetry.dev-dependencies] diff --git a/examples/map/multiproc_map/Makefile b/examples/map/multiproc_map/Makefile index 50c4444f..283e90b7 100644 --- a/examples/map/multiproc_map/Makefile +++ b/examples/map/multiproc_map/Makefile @@ -1,6 +1,6 @@ .PHONY: image image: - docker build -t "quay.io/numaio/numaflow-python/multiproc:v0.5.0" . + docker buildx build --no-cache -t "quay.io/kohlisid/numaflow-python/refactor:v0.7.2" --platform linux/amd64,linux/arm64 . --push # Github CI runner uses platform linux/amd64. If your local environment don't, the image built by command above might not work # under the CI E2E test environment. 
# To build an image that supports multiple platforms(linux/amd64,linux/arm64) and push to quay.io, use the following command
diff --git a/examples/map/multiproc_map/README.md b/examples/map/multiproc_map/README.md
index 0bff3481..f2053199 100644
--- a/examples/map/multiproc_map/README.md
+++ b/examples/map/multiproc_map/README.md
@@ -8,19 +8,15 @@ writing UDFs using map function. These are particularly useful for CPU intensive
 as it allows for better resource utilisation.
 
 In this mode we would spawn N number (N = Cpu count) of grpc servers in different processes, where each of them
-listening on the same TCP socket.
-
-This is possible by enabling the `SO_REUSEPORT` flag for the TCP socket, which allows these different
-processes to bind to the same port.
+listens on its own TCP socket.
 
 To enable multiprocessing mode
-1) Start the multiproc server in the UDF using the following command, select the server_type = ServerType.Multiproc
+1) Start the multiproc server in the UDF using the following command
+2) Provide the optional argument `server_count` to specify the number of
+servers to be forked. Defaults to `os.cpu_count()` if not provided
 ```python
 if __name__ == "__main__":
-    grpc_server = MapServer(mapper_instance=handler,
-                            server_type=ServerType.Multiproc)
+    grpc_server = MapMultiprocServer(handler, server_count=3)
     grpc_server.start()
-```
-2) Set the ENV var value `NUM_CPU_MULTIPROC="n"` for the UDF container,
-to set the value of the number of server instances (one for each subprocess) to be created.
\ No newline at end of file
+```
\ No newline at end of file
diff --git a/examples/map/multiproc_map/example.py b/examples/map/multiproc_map/example.py
index 96ef19c2..963e9c42 100644
--- a/examples/map/multiproc_map/example.py
+++ b/examples/map/multiproc_map/example.py
@@ -1,5 +1,5 @@
 import math
-
+import os
 from pynumaflow.mapper import Messages, Message, Datum, Mapper, MapMultiprocServer
 
 
@@ -27,11 +27,10 @@ def handler(self, keys: list[str], datum: Datum) -> Messages:
 if __name__ == "__main__":
     """
     Example of starting a multiprocessing map vertex.
-    To enable set the env variable
-    NUM_CPU_MULTIPROC="N"
-    Set the server_type = ServerType.Multiproc
-    in the pipeline config for the numa container.
     """
+    # To set the server_count value, set the env variable
+    # NUM_CPU_MULTIPROC="N"
+    server_count = int(os.getenv("NUM_CPU_MULTIPROC", "2"))
     prime_class = PrimeMap()
-    grpc_server = MapMultiprocServer(mapper_instance=prime_class, server_count=2)
+    grpc_server = MapMultiprocServer(prime_class, server_count=server_count)
     grpc_server.start()
diff --git a/examples/map/multiproc_map/pipeline.yaml b/examples/map/multiproc_map/pipeline.yaml
index 1fb6471e..ae4cc955 100644
--- a/examples/map/multiproc_map/pipeline.yaml
+++ b/examples/map/multiproc_map/pipeline.yaml
@@ -15,12 +15,12 @@ spec:
   - name: mult
     udf:
       container:
-        image: "quay.io/numaio/numaflow-python/multiproc:latest"
+        image: "quay.io/kohlisid/numaflow-python/refactor:v0.7.2"
         env:
           - name: PYTHONDEBUG
             value: "true"
           - name: NUM_CPU_MULTIPROC
-            value: "2" # DO NOT forget the double quotes!!!
+            value: "3" # DO NOT forget the double quotes!!!
containerTemplate: resources: limits: diff --git a/examples/map/multiproc_map/pyproject.toml b/examples/map/multiproc_map/pyproject.toml index db000ba6..441e8dd4 100644 --- a/examples/map/multiproc_map/pyproject.toml +++ b/examples/map/multiproc_map/pyproject.toml @@ -6,7 +6,7 @@ authors = ["Numaflow developers"] [tool.poetry.dependencies] python = "~3.10" -pynumaflow = "~0.6.1" +pynumaflow = "~0.7.0" [tool.poetry.dev-dependencies] diff --git a/examples/mapstream/flatmap_stream/example.py b/examples/mapstream/flatmap_stream/example.py index d966ca58..310edd1a 100644 --- a/examples/mapstream/flatmap_stream/example.py +++ b/examples/mapstream/flatmap_stream/example.py @@ -1,8 +1,24 @@ +import os from collections.abc import AsyncIterable +from pynumaflow.mapstreamer import Message, Datum, MapStreamAsyncServer, MapStreamer -from pynumaflow._constants import ServerType -from pynumaflow.mapstreamer import Message, Datum, MapStreamServer +class Example(MapStreamer): + async def handler(self, keys: list[str], datum: Datum) -> AsyncIterable[Message]: + """ + A handler that splits the input datum value into multiple strings by `,` separator and + emits them as a stream. + """ + val = datum.value + _ = datum.event_time + _ = datum.watermark + strs = val.decode("utf-8").split(",") + + if len(strs) == 0: + yield Message.to_drop() + return + for s in strs: + yield Message(str.encode(s)) async def map_stream_handler(_: list[str], datum: Datum) -> AsyncIterable[Message]: @@ -23,7 +39,10 @@ async def map_stream_handler(_: list[str], datum: Datum) -> AsyncIterable[Messag if __name__ == "__main__": - grpc_server = MapStreamServer( - map_stream_instance=map_stream_handler, server_type=ServerType.Async - ) + invoke = os.getenv("INVOKE", "handler") + if invoke == "class": + handler = Example() + else: + handler = map_stream_handler + grpc_server = MapStreamAsyncServer(handler) grpc_server.start() diff --git a/examples/mapstream/flatmap_stream/pyproject.toml b/examples/mapstream/flatmap_stream/pyproject.toml index da79a6b5..e99df380 100644 --- a/examples/mapstream/flatmap_stream/pyproject.toml +++ b/examples/mapstream/flatmap_stream/pyproject.toml @@ -6,7 +6,7 @@ authors = ["Numaflow developers"] [tool.poetry.dependencies] python = "~3.10" -pynumaflow = "~0.6.1" +pynumaflow = "~0.7.0" [tool.poetry.dev-dependencies] diff --git a/examples/reduce/counter/example.py b/examples/reduce/counter/example.py index 7bcd956c..e5467cfc 100644 --- a/examples/reduce/counter/example.py +++ b/examples/reduce/counter/example.py @@ -1,12 +1,11 @@ import os from collections.abc import AsyncIterable -from pynumaflow._constants import ServerType -from pynumaflow.reducer import Messages, Message, Datum, Metadata, ReduceServer, ReducerClass +from pynumaflow.reducer import Messages, Message, Datum, Metadata, ReduceAsyncServer, Reducer -class ExampleClass(ReducerClass): +class Example(Reducer): async def handler( self, keys: list[str], datums: AsyncIterable[Datum], md: Metadata ) -> Messages: @@ -36,8 +35,8 @@ async def reduce_handler(keys: list[str], datums: AsyncIterable[Datum], md: Meta if __name__ == "__main__": invoke = os.getenv("INVOKE", "handler") if invoke == "class": - handler = ExampleClass() + handler = Example() else: handler = reduce_handler - grpc_server = ReduceServer(reducer_instance=handler, server_type=ServerType.Async) + grpc_server = ReduceAsyncServer(handler) grpc_server.start() diff --git a/examples/reduce/counter/pyproject.toml b/examples/reduce/counter/pyproject.toml index b4ab3665..dc3cc41d 100644 --- 
a/examples/reduce/counter/pyproject.toml +++ b/examples/reduce/counter/pyproject.toml @@ -6,7 +6,7 @@ authors = ["Numaflow developers"] [tool.poetry.dependencies] python = "~3.10" -pynumaflow = "~0.6.1" +pynumaflow = "~0.7.0" [tool.poetry.dev-dependencies] diff --git a/examples/sideinput/simple-sideinput/Makefile b/examples/sideinput/simple-sideinput/Makefile index 2d36ab46..cd2f0add 100644 --- a/examples/sideinput/simple-sideinput/Makefile +++ b/examples/sideinput/simple-sideinput/Makefile @@ -1,6 +1,6 @@ .PHONY: image image: - docker build -t "quay.io/numaio/numaflow-python/sideinput-example:v0.5.0" . + docker build -t "quay.io/numaio/numaflow-python/sideinput-example:v0.7.0" . # Github CI runner uses platform linux/amd64. If your local environment don't, the image built by command above might not work # under the CI E2E test environment. # To build an image that supports multiple platforms(linux/amd64,linux/arm64) and push to quay.io, use the following command diff --git a/examples/sideinput/simple-sideinput/example.py b/examples/sideinput/simple-sideinput/example.py index cb7c18f9..acef3779 100644 --- a/examples/sideinput/simple-sideinput/example.py +++ b/examples/sideinput/simple-sideinput/example.py @@ -1,26 +1,27 @@ import datetime -from pynumaflow.sideinput import Response, SideInputServer +from pynumaflow.sideinput import Response, SideInputServer, SideInput -counter = 0 +class ExampleSideInput(SideInput): + def __init__(self): + self.counter = 0 -def my_handler() -> Response: - """ - This function is called every time the side input is requested. - """ - time_now = datetime.datetime.now() - # val is the value to be broadcasted - val = "an example:" + str(time_now) - global counter - counter += 1 - # broadcast every other time - if counter % 2 == 0: - # no_broadcast_message() is used to indicate that there is no broadcast - return Response.no_broadcast_message() - # broadcast_message() is used to indicate that there is a broadcast - return Response.broadcast_message(val.encode("utf-8")) + def retrieve_handler(self) -> Response: + """ + This function is called every time the side input is requested. 
+ """ + time_now = datetime.datetime.now() + # val is the value to be broadcasted + val = "an example:" + str(time_now) + self.counter += 1 + # broadcast every other time + if self.counter % 2 == 0: + # no_broadcast_message() is used to indicate that there is no broadcast + return Response.no_broadcast_message() + # broadcast_message() is used to indicate that there is a broadcast + return Response.broadcast_message(val.encode("utf-8")) if __name__ == "__main__": - grpc_server = SideInputServer(side_input_instance=my_handler) + grpc_server = SideInputServer(ExampleSideInput()) grpc_server.start() diff --git a/examples/sideinput/simple-sideinput/pipeline-numaflow.yaml b/examples/sideinput/simple-sideinput/pipeline-numaflow.yaml index 3d471fb4..4e07d5f1 100644 --- a/examples/sideinput/simple-sideinput/pipeline-numaflow.yaml +++ b/examples/sideinput/simple-sideinput/pipeline-numaflow.yaml @@ -6,7 +6,7 @@ spec: sideInputs: - name: myticker container: - image: "quay.io/numaio/numaflow-python/sideinput-example:v0.5.0" + image: "quay.io/numaio/numaflow-python/sideinput-example:v0.7.0" imagePullPolicy: Always trigger: schedule: "*/2 * * * *" diff --git a/examples/sideinput/simple-sideinput/pyproject.toml b/examples/sideinput/simple-sideinput/pyproject.toml index 361ba9e5..441e8dd4 100644 --- a/examples/sideinput/simple-sideinput/pyproject.toml +++ b/examples/sideinput/simple-sideinput/pyproject.toml @@ -6,7 +6,7 @@ authors = ["Numaflow developers"] [tool.poetry.dependencies] python = "~3.10" -pynumaflow = "~0.6.0" +pynumaflow = "~0.7.0" [tool.poetry.dev-dependencies] diff --git a/examples/sink/async_log/Makefile b/examples/sink/async_log/Makefile index 8420c6dd..49ed516f 100644 --- a/examples/sink/async_log/Makefile +++ b/examples/sink/async_log/Makefile @@ -1,6 +1,6 @@ .PHONY: image image: - docker build -t "quay.io/numaio/numaflow-python/async-sink-log:v0.6.1" . + docker build -t "quay.io/numaio/numaflow-python/async-sink-log:v0.7.0" . # Github CI runner uses platform linux/amd64. If your local environment don't, the image built by command above might not work # under the CI E2E test environment. 
# To build an image that supports multiple platforms(linux/amd64,linux/arm64) and push to quay.io, use the following command diff --git a/examples/sink/async_log/example.py b/examples/sink/async_log/example.py index 1ae4dde8..a42a4c96 100644 --- a/examples/sink/async_log/example.py +++ b/examples/sink/async_log/example.py @@ -2,10 +2,11 @@ from collections.abc import AsyncIterable -from pynumaflow.sinker import Datum, Responses, Response, SinkServer, SinkerClass, ServerType +from pynumaflow.sinker import Datum, Responses, Response, Sinker +from pynumaflow.sinker import SinkAsyncServer -class UserDefinedSink(SinkerClass): +class UserDefinedSink(Sinker): async def handler(self, datums: AsyncIterable[Datum]) -> Responses: responses = Responses() async for msg in datums: @@ -28,5 +29,5 @@ async def udsink_handler(datums: AsyncIterable[Datum]) -> Responses: sink_handler = UserDefinedSink() else: sink_handler = udsink_handler - grpc_server = SinkServer(sinker_instance=sink_handler, server_type=ServerType.Async) + grpc_server = SinkAsyncServer(sink_handler) grpc_server.start() diff --git a/examples/sink/async_log/pipeline-numaflow.yaml b/examples/sink/async_log/pipeline-numaflow.yaml index cdc2f1d2..26d10ac2 100644 --- a/examples/sink/async_log/pipeline-numaflow.yaml +++ b/examples/sink/async_log/pipeline-numaflow.yaml @@ -21,7 +21,7 @@ spec: args: - python - example.py - image: quay.io/numaio/numaflow-python/async-sink-log:v0.6.1 + image: quay.io/numaio/numaflow-python/async-sink-log:v0.7.0 imagePullPolicy: Always env: - name: PYTHONDEBUG diff --git a/examples/sink/async_log/pyproject.toml b/examples/sink/async_log/pyproject.toml index 60a417b5..583a6388 100644 --- a/examples/sink/async_log/pyproject.toml +++ b/examples/sink/async_log/pyproject.toml @@ -6,7 +6,7 @@ authors = ["Numaflow developers"] [tool.poetry.dependencies] python = "~3.10" -pynumaflow = "~0.6.1" +pynumaflow = "~0.7.0" [tool.poetry.dev-dependencies] diff --git a/examples/sink/log/Makefile b/examples/sink/log/Makefile index f2d7b2a2..fa77ab1a 100644 --- a/examples/sink/log/Makefile +++ b/examples/sink/log/Makefile @@ -1,6 +1,6 @@ .PHONY: image image: - docker build -t "quay.io/numaio/numaflow-python/sink-log:v0.6.1" . + docker build -t "quay.io/numaio/numaflow-python/sink-log:v0.7.0" . # Github CI runner uses platform linux/amd64. If your local environment don't, the image built by command above might not work # under the CI E2E test environment. 
# To build an image that supports multiple platforms(linux/amd64,linux/arm64) and push to quay.io, use the following command diff --git a/examples/sink/log/example.py b/examples/sink/log/example.py index 6c2f48c6..187eb000 100644 --- a/examples/sink/log/example.py +++ b/examples/sink/log/example.py @@ -2,10 +2,10 @@ from collections.abc import Iterator from pynumaflow.sinker import Datum, Responses, Response, SinkServer -from pynumaflow.sinker import SinkerClass +from pynumaflow.sinker import Sinker -class UserDefinedSink(SinkerClass): +class UserDefinedSink(Sinker): def handler(self, datums: Iterator[Datum]) -> Responses: responses = Responses() for msg in datums: @@ -28,5 +28,5 @@ def udsink_handler(datums: Iterator[Datum]) -> Responses: sink_handler = UserDefinedSink() else: sink_handler = udsink_handler - grpc_server = SinkServer(sinker_instance=sink_handler) + grpc_server = SinkServer(sink_handler) grpc_server.start() diff --git a/examples/sink/log/pipeline-numaflow.yaml b/examples/sink/log/pipeline-numaflow.yaml index 9439526e..0f9ab368 100644 --- a/examples/sink/log/pipeline-numaflow.yaml +++ b/examples/sink/log/pipeline-numaflow.yaml @@ -21,13 +21,11 @@ spec: args: - python - example.py - image: "quay.io/numaio/numaflow-python/sink-log:v0.6.1" + image: "quay.io/numaio/numaflow-python/sink-log:v0.7.0" imagePullPolicy: Always env: - name: PYTHONDEBUG value: "true" - - name: SERVER_TYPE - value: "async" - name: INVOKE value: "handler" - name: log-output diff --git a/examples/sink/log/pyproject.toml b/examples/sink/log/pyproject.toml index 60a417b5..583a6388 100644 --- a/examples/sink/log/pyproject.toml +++ b/examples/sink/log/pyproject.toml @@ -6,7 +6,7 @@ authors = ["Numaflow developers"] [tool.poetry.dependencies] python = "~3.10" -pynumaflow = "~0.6.1" +pynumaflow = "~0.7.0" [tool.poetry.dev-dependencies] diff --git a/examples/source/async-source/Makefile b/examples/source/async-source/Makefile index bb43ac51..ddcc242e 100644 --- a/examples/source/async-source/Makefile +++ b/examples/source/async-source/Makefile @@ -1,6 +1,6 @@ .PHONY: image image: - docker build -t "quay.io/numaio/numaflow-python/async-source:v0.6.1" . + docker build -t "quay.io/numaio/numaflow-python/async-source:v0.7.0" . # Github CI runner uses platform linux/amd64. If your local environment don't, the image built by command above might not work # under the CI E2E test environment. # To build an image that supports multiple platforms(linux/amd64,linux/arm64) and push to quay.io, use the following command diff --git a/examples/source/async-source/example.py b/examples/source/async-source/example.py index f5775730..352175ff 100644 --- a/examples/source/async-source/example.py +++ b/examples/source/async-source/example.py @@ -1,8 +1,5 @@ -from datetime import datetime from collections.abc import AsyncIterable - -import aiorun -from pynumaflow._constants import ServerType +from datetime import datetime from pynumaflow.sourcer import ( ReadRequest, @@ -12,12 +9,12 @@ Offset, PartitionsResponse, get_default_partitions, - SourceServer, - SourcerClass, + Sourcer, + SourceAsyncServer, ) -class AsyncSource(SourcerClass): +class AsyncSource(Sourcer): """ AsyncSource is a class for User Defined Source implementation. 
""" @@ -71,5 +68,5 @@ async def partitions_handler(self) -> PartitionsResponse: if __name__ == "__main__": ud_source = AsyncSource() - grpc_server = SourceServer(sourcer_instance=ud_source, server_type=ServerType.Async) - aiorun.run(grpc_server.start()) + grpc_server = SourceAsyncServer(ud_source) + grpc_server.start() diff --git a/examples/source/async-source/pipeline-numaflow.yaml b/examples/source/async-source/pipeline-numaflow.yaml index 1a9197cc..da8d482d 100644 --- a/examples/source/async-source/pipeline-numaflow.yaml +++ b/examples/source/async-source/pipeline-numaflow.yaml @@ -9,7 +9,7 @@ spec: udsource: container: # A simple user-defined async source - image: "quay.io/numaio/numaflow-python/async-source:v0.6.1" + image: "quay.io/numaio/numaflow-python/async-source:v0.7.0" imagePullPolicy: Always limits: readBatchSize: 2 diff --git a/examples/source/async-source/pyproject.toml b/examples/source/async-source/pyproject.toml index 15ee2ec1..60fba875 100644 --- a/examples/source/async-source/pyproject.toml +++ b/examples/source/async-source/pyproject.toml @@ -6,7 +6,7 @@ authors = ["Numaflow developers"] [tool.poetry.dependencies] python = "~3.10" -pynumaflow = "~0.6.1" +pynumaflow = "~0.7.0" [tool.poetry.dev-dependencies] diff --git a/examples/source/simple-source/Makefile b/examples/source/simple-source/Makefile index eb4363d2..647da242 100644 --- a/examples/source/simple-source/Makefile +++ b/examples/source/simple-source/Makefile @@ -1,6 +1,6 @@ .PHONY: image image: - docker build -t "quay.io/numaio/numaflow-python/simple-source:v0.6.1" . + docker build -t "quay.io/numaio/numaflow-python/simple-source:v0.7.0" . # Github CI runner uses platform linux/amd64. If your local environment don't, the image built by command above might not work # under the CI E2E test environment. # To build an image that supports multiple platforms(linux/amd64,linux/arm64) and push to quay.io, use the following command diff --git a/examples/source/simple-source/example.py b/examples/source/simple-source/example.py index 46f25daa..a2ae814a 100644 --- a/examples/source/simple-source/example.py +++ b/examples/source/simple-source/example.py @@ -9,12 +9,12 @@ Offset, PartitionsResponse, get_default_partitions, - SourcerClass, + Sourcer, SourceServer, ) -class SimpleSource(SourcerClass): +class SimpleSource(Sourcer): """ SimpleSource is a class for User Defined Source implementation. 
""" @@ -68,5 +68,5 @@ def partitions_handler(self) -> PartitionsResponse: if __name__ == "__main__": ud_source = SimpleSource() - grpc_server = SourceServer(sourcer_instance=ud_source) + grpc_server = SourceServer(ud_source) grpc_server.start() diff --git a/examples/source/simple-source/pipeline-numaflow.yaml b/examples/source/simple-source/pipeline-numaflow.yaml index 66ea6c22..50246f7d 100644 --- a/examples/source/simple-source/pipeline-numaflow.yaml +++ b/examples/source/simple-source/pipeline-numaflow.yaml @@ -9,7 +9,7 @@ spec: udsource: container: # A simple user-defined source for e2e testing - image: quay.io/numaio/numaflow-python/simple-source:v0.6.1 + image: quay.io/numaio/numaflow-python/simple-source:v0.7.0 imagePullPolicy: Always limits: readBatchSize: 2 diff --git a/examples/source/simple-source/pyproject.toml b/examples/source/simple-source/pyproject.toml index e57b1751..a2fa357a 100644 --- a/examples/source/simple-source/pyproject.toml +++ b/examples/source/simple-source/pyproject.toml @@ -6,7 +6,7 @@ authors = ["Numaflow developers"] [tool.poetry.dependencies] python = "~3.10" -pynumaflow = "~0.6.1" +pynumaflow = "~0.7.0" [tool.poetry.dev-dependencies] diff --git a/examples/sourcetransform/event_time_filter/Makefile b/examples/sourcetransform/event_time_filter/Makefile index c0df5ba6..e22f7be8 100644 --- a/examples/sourcetransform/event_time_filter/Makefile +++ b/examples/sourcetransform/event_time_filter/Makefile @@ -1,6 +1,6 @@ .PHONY: image image: - docker build -t "quay.io/numaio/numaflow-python/mapt-event-time-filter:v0.6.1" . + docker build -t "quay.io/numaio/numaflow-python/mapt-event-time-filter:v0.7.0" . # Github CI runner uses platform linux/amd64. If your local environment don't, the image built by command above might not work # under the CI E2E test environment. 
# To build an image that supports multiple platforms(linux/amd64,linux/arm64) and push to quay.io, use the following command diff --git a/examples/sourcetransform/event_time_filter/example.py b/examples/sourcetransform/event_time_filter/example.py index 43f49ad2..add91b96 100644 --- a/examples/sourcetransform/event_time_filter/example.py +++ b/examples/sourcetransform/event_time_filter/example.py @@ -43,5 +43,5 @@ def my_handler(keys: list[str], datum: Datum) -> Messages: if __name__ == "__main__": - grpc_server = SourceTransformServer(source_transform_instance=my_handler) + grpc_server = SourceTransformServer(my_handler) grpc_server.start() diff --git a/examples/sourcetransform/event_time_filter/pyproject.toml b/examples/sourcetransform/event_time_filter/pyproject.toml index be102248..31dfd83c 100644 --- a/examples/sourcetransform/event_time_filter/pyproject.toml +++ b/examples/sourcetransform/event_time_filter/pyproject.toml @@ -8,7 +8,7 @@ packages = [{include = "mapt_event_time_filter"}] [tool.poetry.dependencies] python = ">=3.9, <3.12" -pynumaflow = "~0.6.1" +pynumaflow = "~0.7.0" [build-system] requires = ["poetry-core"] From 678ec60441ae74c80a20960deda0270d12f4eaaa Mon Sep 17 00:00:00 2001 From: Sidhant Kohli Date: Wed, 17 Jan 2024 00:48:01 -0800 Subject: [PATCH 61/78] examples Signed-off-by: Sidhant Kohli --- examples/map/forward_message/example.py | 1 - 1 file changed, 1 deletion(-) diff --git a/examples/map/forward_message/example.py b/examples/map/forward_message/example.py index 012f1086..34272f53 100644 --- a/examples/map/forward_message/example.py +++ b/examples/map/forward_message/example.py @@ -4,7 +4,6 @@ class Example(Mapper): - def handler(self, keys: list[str], datum: Datum) -> Messages: val = datum.value _ = datum.event_time From cdb97c18fd3ca74f7a42468c750039d53158b19c Mon Sep 17 00:00:00 2001 From: Sidhant Kohli Date: Wed, 17 Jan 2024 00:53:26 -0800 Subject: [PATCH 62/78] examples Signed-off-by: Sidhant Kohli --- README.md | 13 +++++-------- examples/developer_guide/example.py | 16 +++++++--------- 2 files changed, 12 insertions(+), 17 deletions(-) diff --git a/README.md b/README.md index 14cacc01..9e6b1b87 100644 --- a/README.md +++ b/README.md @@ -93,7 +93,7 @@ if __name__ == "__main__": ```python from typing import AsyncIterable -from pynumaflow.reducer import Messages, Message, Datum, Metadata, ReduceServer +from pynumaflow.reducer import Messages, Message, Datum, Metadata, ReduceAsyncServer async def reduce_handler(keys: list[str], datums: AsyncIterable[Datum], md: Metadata) -> Messages: @@ -109,7 +109,7 @@ async def reduce_handler(keys: list[str], datums: AsyncIterable[Datum], md: Meta if __name__ == "__main__": - grpc_server = ReduceServer(reducer_instance=reduce_handler) + grpc_server = ReduceAsyncServer(reducer_instance=reduce_handler) grpc_server.start() ``` @@ -196,24 +196,21 @@ Currently we support the following server types: Not all of the above are supported for all UDFs and UDSinks. -To use a server type, the user needs to pass the server type to the server constructor. - -There is a class of the ```ServerType``` which can be imported from the package and be used. 
### SyncServer ``` -grpc_server = MapServer(mapper_instance=handler, server_type=ServerType.Sync) +grpc_server = MapServer(handler) ``` ### AsyncServer ``` -grpc_server = MapServer(mapper_instance=handler, server_type=ServerType.Async) +grpc_server = MapAsyncServer(handler) ``` ### MultiProcessServer ``` -grpc_server = MapServer(mapper_instance=handler, server_type=ServerType.MultiProc) +grpc_server = MapMultiProcServer(handler) ``` diff --git a/examples/developer_guide/example.py b/examples/developer_guide/example.py index 8dbee528..67b6b138 100644 --- a/examples/developer_guide/example.py +++ b/examples/developer_guide/example.py @@ -1,29 +1,27 @@ +from collections.abc import AsyncIterable + import aiorun -from collections.abc import Iterator from pynumaflow.reducer import ( Messages, Message, Datum, - Metadata, - AsyncReducer, + Metadata, ReduceAsyncServer, ) -async def my_handler(keys: list[str], datums: Iterator[Datum], md: Metadata) -> Messages: - # count the number of events +async def reduce_handler(keys: list[str], datums: AsyncIterable[Datum], md: Metadata) -> Messages: interval_window = md.interval_window counter = 0 async for _ in datums: counter += 1 - msg = ( f"counter:{counter} interval_window_start:{interval_window.start} " f"interval_window_end:{interval_window.end}" ) - return Messages(Message(keys=keys, value=str.encode(msg))) + return Messages(Message(str.encode(msg), keys=keys)) if __name__ == "__main__": - grpc_server = AsyncReducer(handler=my_handler) - aiorun.run(grpc_server.start()) + grpc_server = ReduceAsyncServer(reduce_handler) + grpc_server.start() From a25387fb1bf569c2dbdf85bd6ba393648e699a91 Mon Sep 17 00:00:00 2001 From: Sidhant Kohli Date: Wed, 17 Jan 2024 00:54:01 -0800 Subject: [PATCH 63/78] examples Signed-off-by: Sidhant Kohli --- examples/developer_guide/example.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/developer_guide/example.py b/examples/developer_guide/example.py index 67b6b138..dddc2e5e 100644 --- a/examples/developer_guide/example.py +++ b/examples/developer_guide/example.py @@ -1,12 +1,12 @@ from collections.abc import AsyncIterable -import aiorun from pynumaflow.reducer import ( Messages, Message, Datum, - Metadata, ReduceAsyncServer, + Metadata, + ReduceAsyncServer, ) From dffdc5f52d98e77b5402462b47c049166cf6d77f Mon Sep 17 00:00:00 2001 From: Sidhant Kohli Date: Wed, 17 Jan 2024 00:57:41 -0800 Subject: [PATCH 64/78] examples Signed-off-by: Sidhant Kohli --- README.md | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 9e6b1b87..db65e473 100644 --- a/README.md +++ b/README.md @@ -64,7 +64,7 @@ def handler(keys: list[str], datum: Datum) -> Messages: if __name__ == "__main__": - grpc_server = MapServer(mapper_instance=handler) + grpc_server = MapServer(handler) grpc_server.start() ``` ### SourceTransformer - Map with event time assignment capability @@ -85,7 +85,7 @@ def transform_handler(keys: list[str], datum: Datum) -> Messages: if __name__ == "__main__": - grpc_server = SourceTransformServer(source_transform_instance=transform_handler) + grpc_server = SourceTransformServer(transform_handler) grpc_server.start() ``` @@ -109,7 +109,7 @@ async def reduce_handler(keys: list[str], datums: AsyncIterable[Datum], md: Meta if __name__ == "__main__": - grpc_server = ReduceAsyncServer(reducer_instance=reduce_handler) + grpc_server = ReduceAsyncServer(reduce_handler) grpc_server.start() ``` @@ -133,7 +133,7 @@ def my_handler(datums: Iterator[Datum]) -> 
Responses: if __name__ == "__main__": - grpc_server = SinkServer(sinker_instance=my_handler) + grpc_server = SinkServer(my_handler) grpc_server.start() ``` @@ -178,7 +178,7 @@ class MyHandler(Mapper): if __name__ == "__main__": class_instance = MyHandler() - grpc_server = MapServer(mapper_instance=class_instance) + grpc_server = MapServer(class_instance) grpc_server.start() ``` From 2742b562420d4b1a2b82e925740e1d6f32388e66 Mon Sep 17 00:00:00 2001 From: Sidhant Kohli Date: Wed, 17 Jan 2024 08:45:35 -0800 Subject: [PATCH 65/78] examples Signed-off-by: Sidhant Kohli --- examples/map/multiproc_map/Makefile | 2 +- examples/sideinput/simple-sideinput/udf/example.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/map/multiproc_map/Makefile b/examples/map/multiproc_map/Makefile index 283e90b7..e7679224 100644 --- a/examples/map/multiproc_map/Makefile +++ b/examples/map/multiproc_map/Makefile @@ -1,6 +1,6 @@ .PHONY: image image: - docker buildx build --no-cache -t "quay.io/kohlisid/numaflow-python/refactor:v0.7.2" --platform linux/amd64,linux/arm64 . --push + docker build -t "quay.io/numaio/numaflow-python/multiproc:v0.7.0" . # Github CI runner uses platform linux/amd64. If your local environment don't, the image built by command above might not work # under the CI E2E test environment. # To build an image that supports multiple platforms(linux/amd64,linux/arm64) and push to quay.io, use the following command diff --git a/examples/sideinput/simple-sideinput/udf/example.py b/examples/sideinput/simple-sideinput/udf/example.py index e9eaa464..5f3bc8f1 100644 --- a/examples/sideinput/simple-sideinput/udf/example.py +++ b/examples/sideinput/simple-sideinput/udf/example.py @@ -24,7 +24,7 @@ def watcher(): This function is used to start the GRPC server and the watcher thread. 
""" daemon = Thread(target=watcher, daemon=True, name="Monitor") - grpc_server = MapServer(mapper_instance=my_handler) + grpc_server = MapServer(my_handler) thread_server = Thread(target=grpc_server.start, daemon=True, name="GRPC Server") daemon.start() thread_server.start() From 6bf2c4aa6a6cc726d0b965e28c50eebd159f4914 Mon Sep 17 00:00:00 2001 From: Sidhant Kohli Date: Wed, 17 Jan 2024 09:20:00 -0800 Subject: [PATCH 66/78] examples Signed-off-by: Sidhant Kohli --- examples/map/multiproc_map/pipeline.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/map/multiproc_map/pipeline.yaml b/examples/map/multiproc_map/pipeline.yaml index ae4cc955..54fe2b6a 100644 --- a/examples/map/multiproc_map/pipeline.yaml +++ b/examples/map/multiproc_map/pipeline.yaml @@ -15,7 +15,7 @@ spec: - name: mult udf: container: - image: "quay.io/kohlisid/numaflow-python/refactor:v0.7.2" + image: "quay.io/numaio/numaflow-python/multiproc:v0.7.0" env: - name: PYTHONDEBUG value: "true" From 6e44a6ca211cf713f8ea7f7d6234b104af4ba348 Mon Sep 17 00:00:00 2001 From: Sidhant Kohli Date: Wed, 17 Jan 2024 12:46:09 -0800 Subject: [PATCH 67/78] README Signed-off-by: Sidhant Kohli --- README.md | 212 +++++++----------------- examples/map/even_odd/example.py | 7 + examples/map/flatmap/example.py | 9 + examples/map/forward_message/example.py | 13 +- examples/map/multiproc_map/example.py | 7 + examples/reduce/counter/example.py | 15 +- 6 files changed, 103 insertions(+), 160 deletions(-) diff --git a/README.md b/README.md index db65e473..fdf36a9f 100644 --- a/README.md +++ b/README.md @@ -5,9 +5,9 @@ [![License](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](LICENSE) [![Release Version](https://img.shields.io/github/v/release/numaproj/numaflow-python?label=pynumaflow)](https://github.com/numaproj/numaflow-python/releases/latest) +This is the Python SDK for [Numaflow](https://numaflow.numaproj.io/). -This SDK provides the interface for writing [UDFs](https://numaflow.numaproj.io/user-guide/user-defined-functions/user-defined-functions/) -and [UDSinks](https://numaflow.numaproj.io/user-guide/sinks/user-defined-sinks/) in Python. +This SDK provides the interface for writing different functionalities of Numaflow like [UDFs](https://numaflow.numaproj.io/user-guide/user-defined-functions/user-defined-functions/), [UDSinks](https://numaflow.numaproj.io/user-guide/sinks/user-defined-sinks/), [UDSources](https://numaflow.numaproj.io/user-guide/sources/user-defined-sources/) and [SideInput](https://numaflow.numaproj.io/specifications/side-inputs/) in Python. 
## Installation
 
@@ -40,177 +40,89 @@ Setup [pre-commit](https://pre-commit.com/) hooks:
 pre-commit install
 ```
 
-## Implement a User Defined Function (UDF)
-
-
-### Map
-
-```python
-from pynumaflow.mapper import Messages, Message, Datum, MapServer
+## Implementing different functionalities
+- [Implement User Defined Sources](https://github.com/numaproj/numaflow-python/tree/main/examples/source)
+- [Implement User Defined Source Transformers](https://github.com/numaproj/numaflow-python/tree/main/examples/sourcetransform)
+- Implement User Defined Functions
+  - [Map](https://github.com/numaproj/numaflow-python/tree/main/examples/map)
+  - [Reduce](https://github.com/numaproj/numaflow-python/tree/main/examples/reduce/counter)
+  - [Map Stream](https://github.com/numaproj/numaflow-python/tree/main/examples/mapstream)
+- [Implement User Defined Sinks](https://github.com/numaproj/numaflow-python/tree/main/examples/sink)
+- [Implement User Defined SideInputs](https://github.com/numaproj/numaflow-python/tree/main/examples/sideinput)
 
+## Server Types
 
-def handler(keys: list[str], datum: Datum) -> Messages:
-    val = datum.value
-    _ = datum.event_time
-    _ = datum.watermark
-    strs = val.decode("utf-8").split(",")
-    messages = Messages()
-    if len(strs) == 0:
-        messages.append(Message.to_drop())
-        return messages
-    for s in strs:
-        messages.append(Message(str.encode(s)))
-    return messages
+There are different types of gRPC server mechanisms which can be used to serve the UDFs, UDSinks and UDSources.
+These have different functionalities and are used for different use cases.
 
+Currently we support the following server types:
+- Sync Server
+- Asynchronous Server
+- MultiProcessing Server
 
-if __name__ == "__main__":
-    grpc_server = MapServer(handler)
-    grpc_server.start()
-```
-### SourceTransformer - Map with event time assignment capability
-In addition to the regular Map function, SourceTransformer supports assigning a new event time to the message.
-SourceTransformer is only supported at source vertex to enable (a) early data filtering and (b) watermark assignment by extracting new event time from the message payload.
+Not all of the above are supported for all UDFs, UDSources and UDSinks.
 
-```python
-from datetime import datetime
-from pynumaflow.sourcetransformer import Messages, Message, Datum, SourceTransformServer
+For each of the UDFs, UDSources and UDSinks, there are separate classes for each of the server types.
+This helps in keeping the interface simple and easy to use, and the user can start the specific server type based
+on the use case.
 
-def transform_handler(keys: list[str], datum: Datum) -> Messages:
-    val = datum.value
-    new_event_time = datetime.now()
-    _ = datum.watermark
-    message_t_s = Messages(Message(val, event_time=new_event_time, keys=keys))
-    return message_t_s
+#### SyncServer
+Synchronous Server is the simplest server type. It is a multithreaded server which can be used for simple UDFs and UDSinks.
+Here the server will invoke the handler function for each message. The messaging is synchronous and the server will wait
+for the handler to return before processing the next message.
 
-if __name__ == "__main__":
-    grpc_server = SourceTransformServer(transform_handler)
-    grpc_server.start()
 ```
-
-### Reduce
-
-```python
-from typing import AsyncIterable
-from pynumaflow.reducer import Messages, Message, Datum, Metadata, ReduceAsyncServer
+grpc_server = MapServer(handler)
 ```
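To make the synchronous flow just described concrete, here is a minimal end-to-end sketch built only from constructs shown elsewhere in this series (`Messages`, `Message`, `Datum`, `MapServer`); the handler name and the uppercasing transform are illustrative, not part of the SDK.

```python
from pynumaflow.mapper import Messages, Message, Datum, MapServer


def upper_handler(keys: list[str], datum: Datum) -> Messages:
    # Runs once per message; the sync server waits for the return value
    # before it hands the next message to the handler.
    messages = Messages()
    messages.append(Message(datum.value.decode("utf-8").upper().encode("utf-8"), keys=keys))
    return messages


if __name__ == "__main__":
    grpc_server = MapServer(upper_handler)
    grpc_server.start()
```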
-
-async def reduce_handler(keys: list[str], datums: AsyncIterable[Datum], md: Metadata) -> Messages:
-    interval_window = md.interval_window
-    counter = 0
-    async for _ in datums:
-        counter += 1
-    msg = (
-        f"counter:{counter} interval_window_start:{interval_window.start} "
-        f"interval_window_end:{interval_window.end}"
-    )
-    return Messages(Message(str.encode(msg), keys=keys))
-
-
-if __name__ == "__main__":
-    grpc_server = ReduceAsyncServer(reduce_handler)
-    grpc_server.start()
-```
+#### AsyncServer
 
-### Sample Image
-A sample UDF [Dockerfile](examples/map/forward_message/Dockerfile) is provided
-under [examples](examples/map/forward_message).
 
-## Implement a User Defined Sink (UDSink)
+Asynchronous Server is a multithreaded server which can be used for UDFs which are asynchronous. Here we utilize the asynchronous capabilities of Python to process multiple messages in parallel. The server will invoke the handler function for each message. The messaging is asynchronous and the server will not wait for the handler to return before processing the next message. Thus this server type is useful for UDFs which are asynchronous.
+The handler function for such a server should be an async function.
 
-```python
-from typing import Iterator
-from pynumaflow.sinker import Datum, Responses, Response, SinkServer
 
-def my_handler(datums: Iterator[Datum]) -> Responses:
-    responses = Responses()
-    for msg in datums:
-        print("User Defined Sink", msg.value.decode("utf-8"))
-        responses.append(Response.as_success(msg.id))
-    return responses
-
-
-if __name__ == "__main__":
-    grpc_server = SinkServer(my_handler)
-    grpc_server.start()
-```
-
-### Sample Image
-
-A sample UDSink [Dockerfile](examples/sink/log/Dockerfile) is provided
-under [examples](examples/sink/log).
-
-## Class based handlers
-
-We can also implement UDFs and UDSinks using class based handlers.
-
-The class based handlers are useful when we want to maintain state across multiple invocations of the handler.
-
-Here we can pass the class instance to the server and the server will invoke the handler methods on the instance.
-
-To use a class based handler, we the user needs to inherit the base class of the UDF/UDSink.
-And implement the required methods in the class.
-
-Example For Mapper, the user needs to inherit the [Mapper](pynumaflow/mapper/_dtypes.py#170) class and then implement the [handler](pynumaflow/mapper/_dtypes.py#170) method.
-
-### Map
-
-```python
-from pynumaflow.mapper import Messages, Message, Datum, MapServer, Mapper
-
-
-class MyHandler(Mapper):
-    def handler(self, keys: list[str], datum: Datum) -> Messages:
-        val = datum.value
-        _ = datum.event_time
-        _ = datum.watermark
-        strs = val.decode("utf-8").split(",")
-        messages = Messages()
-        if len(strs) == 0:
-            messages.append(Message.to_drop())
-            return messages
-        for s in strs:
-            messages.append(Message(str.encode(s)))
-        return messages
-
-
-if __name__ == "__main__":
-    class_instance = MyHandler()
-    grpc_server = MapServer(class_instance)
-    grpc_server.start()
-```
 
 ```
+grpc_server = MapAsyncServer(handler)
 ```
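Expanding the one-line snippet above, here is a minimal asynchronous sketch. It assumes the async map handler keeps the same `(keys, datum) -> Messages` shape shown for the synchronous flavor; the handler name and the `asyncio.sleep` stand-in for real async I/O are illustrative.

```python
import asyncio

from pynumaflow.mapper import Messages, Message, Datum, MapAsyncServer


async def async_handler(keys: list[str], datum: Datum) -> Messages:
    # The await point is where the event loop is free to start
    # working on other in-flight messages.
    await asyncio.sleep(0.01)  # stand-in for real async I/O, e.g. an external lookup
    messages = Messages()
    messages.append(Message(datum.value, keys=keys))
    return messages


if __name__ == "__main__":
    grpc_server = MapAsyncServer(async_handler)
    grpc_server.start()
```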
+#### MultiProcessServer
 
 ## Server Types
 
-For different types of UDFs and UDSinks, we have different server types which are supported.
-
-These have different functionalities and are used for different use cases.
+MultiProcess Server is a multi-process server which can be used for UDFs which are CPU intensive. Here we utilize the multiprocessing capabilities of Python to process multiple messages in parallel by forking multiple servers in different processes.
+The server will invoke the handler function for each message. Individually at the server level the messaging is synchronous and the server will wait for the handler to return before processing the next message. But since we have multiple servers running in parallel, the overall messaging also executes in parallel.
 
-Currently we support the following server types:
-1) SyncServer
-2) AsyncServer
-3) MultiProcessServer
+This could be an alternative to creating multiple replicas of the same UDF container, as here we are using the multiprocessing capabilities of the system to process multiple messages in parallel but within the same container.
 
-Not all of the above are supported for all UDFs and UDSinks.
-
-
-
-### SyncServer
+Thus this server type is useful for UDFs which are CPU intensive.
 
 ```
-grpc_server = MapServer(handler)
+grpc_server = MapMultiprocServer(handler)
 ```
 
-### AsyncServer
-```
-grpc_server = MapAsyncServer(handler)
-```
+#### Currently Supported Server Types for each functionality
+
+These are the class names for the server types supported by each of the functionalities.
+
+- UDFs
+  - Map
+    - MapServer
+    - MapAsyncServer
+    - MapMultiprocServer
+  - Reduce
+    - ReduceAsyncServer
+  - MapStream
+    - MapStreamAsyncServer
+  - Source Transform
+    - SourceTransformServer
+    - SourceTransformMultiProcServer
+- UDSource
+  - SourceServer
+  - SourceAsyncServer
+- UDSink
+  - SinkServer
+  - SinkAsyncServer
+- SideInput
+  - SideInputServer
+
 
-### MultiProcessServer
-```
-grpc_server = MapMultiProcServer(handler)
-```
diff --git a/examples/map/even_odd/example.py b/examples/map/even_odd/example.py
index da5c75a6..52405590 100644
--- a/examples/map/even_odd/example.py
+++ b/examples/map/even_odd/example.py
@@ -22,5 +22,12 @@ def my_handler(keys: list[str], datum: Datum) -> Messages:
 
 if __name__ == "__main__":
+    """
+    This example shows how to create a simple map function that takes in a
+    number and outputs it to the "even" or "odd" key depending on whether it
+    is even or odd.
+    We use a function as handler, but a class that implements
+    a Mapper can be used as well.
+    """
     grpc_server = MapServer(my_handler)
     grpc_server.start()
diff --git a/examples/map/flatmap/example.py b/examples/map/flatmap/example.py
index f4da36c0..eda861bf 100644
--- a/examples/map/flatmap/example.py
+++ b/examples/map/flatmap/example.py
@@ -2,6 +2,11 @@
 
 class Flatmap(Mapper):
+    """
+    This is a class that inherits from the Mapper class.
+    It implements the handler method that is called for each datum.
+    """
+
     def handler(self, keys: list[str], datum: Datum) -> Messages:
         val = datum.value
         _ = datum.event_time
@@ -17,5 +22,9 @@ def handler(self, keys: list[str], datum: Datum) -> Messages:
 
 if __name__ == "__main__":
+    """
+    This example shows how to use the Flatmap mapper.
+    We use a class as handler, but a function can be used as well.
+ """ grpc_server = MapServer(Flatmap()) grpc_server.start() diff --git a/examples/map/forward_message/example.py b/examples/map/forward_message/example.py index 34272f53..ca0d2217 100644 --- a/examples/map/forward_message/example.py +++ b/examples/map/forward_message/example.py @@ -4,6 +4,11 @@ class Example(Mapper): + """ + This is a class that inherits from the Mapper class. + It implements the handler method that is called for each datum. + """ + def handler(self, keys: list[str], datum: Datum) -> Messages: val = datum.value _ = datum.event_time @@ -23,10 +28,12 @@ def my_handler(keys: list[str], datum: Datum) -> Messages: if __name__ == "__main__": + """ + Use the class based approach or function based handler + based on the env variable + Both can be used and passed directly to the server class + """ invoke = os.getenv("INVOKE", "handler") - # Use the class based approach or function based handler - # based on the env variable - # Both can be used and passed directly to the server class if invoke == "class": handler = Example() else: diff --git a/examples/map/multiproc_map/example.py b/examples/map/multiproc_map/example.py index 963e9c42..e99795b5 100644 --- a/examples/map/multiproc_map/example.py +++ b/examples/map/multiproc_map/example.py @@ -13,6 +13,12 @@ def is_prime(n): class PrimeMap(Mapper): + """ + This class needs to be of type Mapper class to be used + as a handler for the MapServer class. + Example of a mapper that calculates if a number is prime. + """ + def handler(self, keys: list[str], datum: Datum) -> Messages: val = datum.value _ = datum.event_time @@ -32,5 +38,6 @@ def handler(self, keys: list[str], datum: Datum) -> Messages: # NUM_CPU_MULTIPROC="N" server_count = int(os.getenv("NUM_CPU_MULTIPROC", "2")) prime_class = PrimeMap() + # Server count is the number of server processes to start grpc_server = MapMultiprocServer(prime_class, server_count=server_count) grpc_server.start() diff --git a/examples/reduce/counter/example.py b/examples/reduce/counter/example.py index e5467cfc..1ac4fee8 100644 --- a/examples/reduce/counter/example.py +++ b/examples/reduce/counter/example.py @@ -1,20 +1,22 @@ import os from collections.abc import AsyncIterable - from pynumaflow.reducer import Messages, Message, Datum, Metadata, ReduceAsyncServer, Reducer class Example(Reducer): + def __init__(self): + self.counter = 0 + async def handler( self, keys: list[str], datums: AsyncIterable[Datum], md: Metadata ) -> Messages: interval_window = md.interval_window - counter = 0 + self.counter = 0 async for _ in datums: - counter += 1 + self.counter += 1 msg = ( - f"counter:{counter} interval_window_start:{interval_window.start} " + f"counter:{self.counter} interval_window_start:{interval_window.start} " f"interval_window_end:{interval_window.end}" ) return Messages(Message(str.encode(msg), keys=keys)) @@ -35,8 +37,7 @@ async def reduce_handler(keys: list[str], datums: AsyncIterable[Datum], md: Meta if __name__ == "__main__": invoke = os.getenv("INVOKE", "handler") if invoke == "class": - handler = Example() + grpc_server = ReduceAsyncServer(Example()) else: - handler = reduce_handler - grpc_server = ReduceAsyncServer(handler) + grpc_server = ReduceAsyncServer(reduce_handler) grpc_server.start() From cbf47bfe579463f4560ad51824c94fe0e161687d Mon Sep 17 00:00:00 2001 From: Sidhant Kohli Date: Wed, 17 Jan 2024 16:12:24 -0800 Subject: [PATCH 68/78] REDUCER INSTANCE Signed-off-by: Sidhant Kohli --- pynumaflow/reducer/_dtypes.py | 2 +- pynumaflow/reducer/servicer/async_servicer.py | 6 ++++-- 2 files 
changed, 5 insertions(+), 3 deletions(-) diff --git a/pynumaflow/reducer/_dtypes.py b/pynumaflow/reducer/_dtypes.py index 31722608..4bc9d0f5 100644 --- a/pynumaflow/reducer/_dtypes.py +++ b/pynumaflow/reducer/_dtypes.py @@ -257,5 +257,5 @@ async def handler( ReduceAsyncCallable = Callable[[list[str], AsyncIterable[Datum], Metadata], Awaitable[Messages]] -# ReduceCallable is a callable which can be used as a handler for the reduce UDF. +# ReduceCallable is a callable which can be used as a handler for the Reduce UDF. ReduceCallable = Union[ReduceAsyncCallable, Reducer] diff --git a/pynumaflow/reducer/servicer/async_servicer.py b/pynumaflow/reducer/servicer/async_servicer.py index 82988f6e..6bed98d3 100644 --- a/pynumaflow/reducer/servicer/async_servicer.py +++ b/pynumaflow/reducer/servicer/async_servicer.py @@ -12,7 +12,7 @@ STREAM_EOF, DELIMITER, ) -from pynumaflow.reducer._dtypes import Datum, IntervalWindow, Metadata +from pynumaflow.reducer._dtypes import Datum, IntervalWindow, Metadata, Reducer from pynumaflow.reducer._dtypes import ReduceResult, ReduceCallable from pynumaflow.reducer.servicer.asynciter import NonBlockingIterator from pynumaflow.proto.reducer import reduce_pb2, reduce_pb2_grpc @@ -141,8 +141,10 @@ async def __async_reduce_handler(self, interval_window, datum_iterator: AsyncIte async def __invoke_reduce( self, keys: list[str], request_iterator: AsyncIterable[Datum], md: Metadata ): + reducer_class = self.__reduce_handler.__class__ + new_instance = reducer_class() try: - msgs = await self.__reduce_handler(keys, request_iterator, md) + msgs = await new_instance(keys, request_iterator, md) except Exception as err: _LOGGER.critical("UDFError, re-raising the error", exc_info=True) raise err From d88ed5c8815bcc38ada2de4d7a5f27b2f507cb4b Mon Sep 17 00:00:00 2001 From: Sidhant Kohli Date: Wed, 17 Jan 2024 17:18:43 -0800 Subject: [PATCH 69/78] REDUCER INSTANCE Signed-off-by: Sidhant Kohli --- pynumaflow/reducer/async_server.py | 5 ++++- pynumaflow/reducer/servicer/async_servicer.py | 6 ++++-- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/pynumaflow/reducer/async_server.py b/pynumaflow/reducer/async_server.py index f7442238..40e89410 100644 --- a/pynumaflow/reducer/async_server.py +++ b/pynumaflow/reducer/async_server.py @@ -8,7 +8,7 @@ from pynumaflow._constants import ( REDUCE_SOCK_PATH, MAX_MESSAGE_SIZE, - MAX_THREADS, + MAX_THREADS, _LOGGER, ) from pynumaflow.reducer._dtypes import ReduceCallable @@ -57,6 +57,9 @@ def start(self): Starter function for the Async server class, need a separate caller so that all the async coroutines can be started from a single context """ + _LOGGER.info( + "Starting Async Reduce Server", + ) aiorun.run(self.aexec(), use_uvloop=True) async def aexec(self): diff --git a/pynumaflow/reducer/servicer/async_servicer.py b/pynumaflow/reducer/servicer/async_servicer.py index 6bed98d3..6ce8411b 100644 --- a/pynumaflow/reducer/servicer/async_servicer.py +++ b/pynumaflow/reducer/servicer/async_servicer.py @@ -141,8 +141,10 @@ async def __async_reduce_handler(self, interval_window, datum_iterator: AsyncIte async def __invoke_reduce( self, keys: list[str], request_iterator: AsyncIterable[Datum], md: Metadata ): - reducer_class = self.__reduce_handler.__class__ - new_instance = reducer_class() + new_instance = self.__reduce_handler + if isinstance(self.__reduce_handler, Reducer): + reducer_class = self.__reduce_handler.__class__ + new_instance = reducer_class() try: msgs = await new_instance(keys, request_iterator, md) except Exception as err: 
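Between the patch above and the one that follows, the per-key instantiation strategy changes from calling the reducer's class object to taking a deep copy. The standalone sketch below (plain Python, no pynumaflow imports; `Counter` is an illustrative stand-in for a class-based reducer, not an SDK class) shows why: re-instantiating via the class object breaks whenever `__init__` takes arguments, while a deep copy keeps constructor-derived state yet still isolates per-key mutation.

```python
from copy import deepcopy


class Counter:
    """A stand-in for a class-based reducer holding per-instance state."""

    def __init__(self, start: int):
        self.start = start
        self.count = start


template = Counter(start=10)

# Re-instantiating via the class object, as the servicer does above,
# fails for any reducer whose __init__ requires arguments:
try:
    fresh = type(template)()
except TypeError as err:
    print(f"re-instantiation failed: {err}")

# A deep copy keeps the constructor-derived state but still isolates
# mutation, so concurrent keys and windows cannot share a counter:
clone = deepcopy(template)
clone.count += 1
assert template.count == 10 and clone.count == 11
```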
From 3961f953e0aaec4ddd80e699c7f6490d839928d0 Mon Sep 17 00:00:00 2001
From: Sidhant Kohli
Date: Thu, 18 Jan 2024 16:34:11 -0800
Subject: [PATCH 70/78] deep copy

Signed-off-by: Sidhant Kohli
---
 pynumaflow/reducer/_dtypes.py                 | 12 ++++++++++++
 pynumaflow/reducer/async_server.py            |  3 ++-
 pynumaflow/reducer/servicer/async_servicer.py |  5 +++--
 3 files changed, 17 insertions(+), 3 deletions(-)

diff --git a/pynumaflow/reducer/_dtypes.py b/pynumaflow/reducer/_dtypes.py
index 4bc9d0f5..1f4e9c0b 100644
--- a/pynumaflow/reducer/_dtypes.py
+++ b/pynumaflow/reducer/_dtypes.py
@@ -1,6 +1,7 @@
 from abc import ABCMeta, abstractmethod
 from asyncio import Task
 from collections.abc import Iterator, Sequence, Awaitable
+from copy import deepcopy
 from dataclasses import dataclass
 from datetime import datetime
 from typing import TypeVar, Callable, Union
@@ -246,6 +247,17 @@ def __call__(self, *args, **kwargs):
         """
         return self.handler(*args, **kwargs)
 
+    def __deepcopy__(self, memo):
+        """
+        Allow the class instance to be deep-copied.
+        """
+        cls = self.__class__
+        result = cls.__new__(cls)
+        memo[id(self)] = result
+        for k, v in self.__dict__.items():
+            setattr(result, k, deepcopy(v, memo))
+        return result
+
     @abstractmethod
     async def handler(
         self, keys: list[str], datums: AsyncIterable[Datum], md: Metadata
diff --git a/pynumaflow/reducer/async_server.py b/pynumaflow/reducer/async_server.py
index 40e89410..529528fb 100644
--- a/pynumaflow/reducer/async_server.py
+++ b/pynumaflow/reducer/async_server.py
@@ -8,7 +8,8 @@ from pynumaflow._constants import (
     REDUCE_SOCK_PATH,
     MAX_MESSAGE_SIZE,
-    MAX_THREADS, _LOGGER,
+    MAX_THREADS,
+    _LOGGER,
 )
 
 from pynumaflow.reducer._dtypes import ReduceCallable
diff --git a/pynumaflow/reducer/servicer/async_servicer.py b/pynumaflow/reducer/servicer/async_servicer.py
index 6ce8411b..0ee4344d 100644
--- a/pynumaflow/reducer/servicer/async_servicer.py
+++ b/pynumaflow/reducer/servicer/async_servicer.py
@@ -1,4 +1,5 @@
 import asyncio
+from copy import deepcopy
 from datetime import datetime, timezone
 from collections.abc import AsyncIterable
 
@@ -143,8 +144,8 @@ async def __invoke_reduce(
         self, keys: list[str], request_iterator: AsyncIterable[Datum], md: Metadata
     ):
         new_instance = self.__reduce_handler
         if isinstance(self.__reduce_handler, Reducer):
-            reducer_class = self.__reduce_handler.__class__
-            new_instance = reducer_class()
+            _LOGGER.info("Creating a new copy of the reducer instance")
+            new_instance = deepcopy(self.__reduce_handler)
         try:
             msgs = await new_instance(keys, request_iterator, md)
         except Exception as err:

From 4065717a1eb6b372f22bcd67c29c7f2f89401bf3 Mon Sep 17 00:00:00 2001
From: Sidhant Kohli
Date: Mon, 22 Jan 2024 11:32:36 -0800
Subject: [PATCH 71/78] deep copy

Signed-off-by: Sidhant Kohli
---
 README.md                                     | 30 +++++++++++++
 pynumaflow/reducer/async_server.py            |  6 ++-
 pynumaflow/reducer/servicer/async_servicer.py |  4 +-
 tests/reduce/test_datatypes.py                | 43 +++++++++++++++++++
 4 files changed, 81 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index fdf36a9f..8faebb3e 100644
--- a/README.md
+++ b/README.md
@@ -126,3 +126,33 @@
 
 
 
+### Handler Function and Classes
+
+All the server types take an instance of a handler class or a handler function as an argument.
+The handler function or class is the function or class which implements the functionality of the UDF, UDSource or UDSink.
+For ease of use the user can pass either of the two to the server and the server will handle the rest.
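As a concrete illustration of passing "either of the two" above, the sketch below shows the same sink written as a function and as a class, using only names that appear elsewhere in this series (`Sinker`, `SinkServer`, `Responses`, `Response`); the handler names themselves are illustrative.

```python
from collections.abc import Iterator

from pynumaflow.sinker import Datum, Responses, Response, Sinker, SinkServer


def log_handler(datums: Iterator[Datum]) -> Responses:
    # Function form: stateless, invoked directly by the server.
    responses = Responses()
    for msg in datums:
        print("sink:", msg.value.decode("utf-8"))
        responses.append(Response.as_success(msg.id))
    return responses


class LogSink(Sinker):
    # Class form: can carry state across invocations of handler().
    def handler(self, datums: Iterator[Datum]) -> Responses:
        return log_handler(datums)


if __name__ == "__main__":
    grpc_server = SinkServer(log_handler)  # or SinkServer(LogSink())
    grpc_server.start()
```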
+
+The handler for each of the servers has a specific signature which is defined by the server type, and the implementation of the handlers
+should follow the same signature.
+
+To use the class based handlers, the user can inherit from the base handler class for each of the functionalities and implement the handler function.
+The base handler class for each of the functionalities has the same signature as the handler function for the respective server type.
+The list of base handler classes for each of the functionalities is given below:
+- UDFs
+  - Map
+    - Mapper
+  - Reduce
+    - Reducer
+  - MapStream
+    - MapStreamer
+  - Source Transform
+    - SourceTransformer
+- UDSource
+  - Sourcer
+- UDSink
+  - Sinker
+- SideInput
+  - SideInput
+
+More details about the signature of the handler function for each of the server types are given in the
+documentation of the respective server type.
diff --git a/pynumaflow/reducer/async_server.py b/pynumaflow/reducer/async_server.py
index 529528fb..ec93ff6d 100644
--- a/pynumaflow/reducer/async_server.py
+++ b/pynumaflow/reducer/async_server.py
@@ -1,3 +1,5 @@
+from concurrent.futures import ThreadPoolExecutor
+
 import aiorun
 import grpc
 
@@ -72,7 +74,9 @@ async def aexec(self):
         # same thread as the event loop so that all the async calls are made in the
         # same context
         # Create a new async server instance and add the servicer to it
-        server = grpc.aio.server()
+        server = grpc.aio.server(ThreadPoolExecutor(
+            max_workers=self.max_threads,
+        ))
         server.add_insecure_port(self.sock_path)
         reduce_servicer = self.servicer
         reduce_pb2_grpc.add_ReduceServicer_to_server(reduce_servicer, server)
diff --git a/pynumaflow/reducer/servicer/async_servicer.py b/pynumaflow/reducer/servicer/async_servicer.py
index 0ee4344d..2d016246 100644
--- a/pynumaflow/reducer/servicer/async_servicer.py
+++ b/pynumaflow/reducer/servicer/async_servicer.py
@@ -143,8 +143,10 @@ async def __invoke_reduce(
         self, keys: list[str], request_iterator: AsyncIterable[Datum], md: Metadata
     ):
         new_instance = self.__reduce_handler
+        # If the reduce handler is a class instance, create a new copy of it.
+ # It is required for a new key to be processed by a + # new instance of the reducer for a given window if isinstance(self.__reduce_handler, Reducer): - _LOGGER.info("Creating a new copy of the reducer instance") new_instance = deepcopy(self.__reduce_handler) try: msgs = await new_instance(keys, request_iterator, md) diff --git a/tests/reduce/test_datatypes.py b/tests/reduce/test_datatypes.py index 59f54dc1..7983f02b 100644 --- a/tests/reduce/test_datatypes.py +++ b/tests/reduce/test_datatypes.py @@ -1,6 +1,9 @@ +from copy import deepcopy import unittest +from typing import AsyncIterable from google.protobuf import timestamp_pb2 as _timestamp_pb2 +from pynumaflow.reducer import Reducer, Messages from pynumaflow.reducer._dtypes import ( IntervalWindow, @@ -102,5 +105,45 @@ def test_interval_window(self): self.assertEqual(i, m.interval_window) +class TestReducerClass(unittest.TestCase): + class ExampleClass(Reducer): + async def handler(self, keys: list[str], datums: AsyncIterable[Datum], md: Metadata) -> Messages: + pass + + def __init__(self, test1, test2): + self.test1 = test1 + self.test2 = test2 + self.test3 = self.test1 + + def test_init(self): + r = self.ExampleClass(test1=1, test2=2) + self.assertEqual(1, r.test1) + self.assertEqual(2, r.test2) + self.assertEqual(1, r.test3) + + def test_deep_copy(self): + """Test that the deepcopy works as expected""" + r = self.ExampleClass(test1=1, test2=2) + # Create a copy of r + r_copy = deepcopy(r) + # Check that the attributes are the same + self.assertEqual(1, r_copy.test1) + self.assertEqual(2, r_copy.test2) + self.assertEqual(1, r_copy.test3) + # Check that the objects are not the same + self.assertNotEqual(id(r), id(r_copy)) + # Update the attributes of r + r.test1 = 5 + r.test3 = 6 + # Check that the other object is not updated + self.assertNotEqual(r.test1, r_copy.test1) + self.assertNotEqual(r.test3, r_copy.test3) + self.assertNotEqual(id(r.test3), id(r_copy.test3)) + # Verify that the instance type is correct + self.assertTrue(isinstance(r_copy, self.ExampleClass)) + self.assertTrue(isinstance(r_copy, Reducer)) + + + if __name__ == "__main__": unittest.main() From d2c844d0840dc658ba93e1e18f54beb883708c09 Mon Sep 17 00:00:00 2001 From: Sidhant Kohli Date: Mon, 22 Jan 2024 11:59:03 -0800 Subject: [PATCH 72/78] deep copy Signed-off-by: Sidhant Kohli --- pynumaflow/reducer/async_server.py | 5 +---- pynumaflow/reducer/servicer/async_servicer.py | 2 +- tests/reduce/test_datatypes.py | 7 ++++--- 3 files changed, 6 insertions(+), 8 deletions(-) diff --git a/pynumaflow/reducer/async_server.py b/pynumaflow/reducer/async_server.py index ec93ff6d..623bb70b 100644 --- a/pynumaflow/reducer/async_server.py +++ b/pynumaflow/reducer/async_server.py @@ -1,4 +1,3 @@ -from concurrent.futures import ThreadPoolExecutor import aiorun import grpc @@ -74,9 +73,7 @@ async def aexec(self): # same thread as the event loop so that all the async calls are made in the # same context # Create a new async server instance and add the servicer to it - server = grpc.aio.server(ThreadPoolExecutor( - max_workers=self.max_threads, - )) + server = grpc.aio.server() server.add_insecure_port(self.sock_path) reduce_servicer = self.servicer reduce_pb2_grpc.add_ReduceServicer_to_server(reduce_servicer, server) diff --git a/pynumaflow/reducer/servicer/async_servicer.py b/pynumaflow/reducer/servicer/async_servicer.py index 2d016246..4ca6df86 100644 --- a/pynumaflow/reducer/servicer/async_servicer.py +++ b/pynumaflow/reducer/servicer/async_servicer.py @@ -144,7 +144,7 @@ async def 
__invoke_reduce( ): new_instance = self.__reduce_handler # If the reduce handler is a class instance, create a new copy of it. - # It is required for a new key to be processed by a + # It is required for a new key to be processed by a # new instance of the reducer for a given window if isinstance(self.__reduce_handler, Reducer): new_instance = deepcopy(self.__reduce_handler) diff --git a/tests/reduce/test_datatypes.py b/tests/reduce/test_datatypes.py index 7983f02b..5433a044 100644 --- a/tests/reduce/test_datatypes.py +++ b/tests/reduce/test_datatypes.py @@ -1,6 +1,6 @@ from copy import deepcopy import unittest -from typing import AsyncIterable +from collections.abc import AsyncIterable from google.protobuf import timestamp_pb2 as _timestamp_pb2 from pynumaflow.reducer import Reducer, Messages @@ -107,7 +107,9 @@ def test_interval_window(self): class TestReducerClass(unittest.TestCase): class ExampleClass(Reducer): - async def handler(self, keys: list[str], datums: AsyncIterable[Datum], md: Metadata) -> Messages: + async def handler( + self, keys: list[str], datums: AsyncIterable[Datum], md: Metadata + ) -> Messages: pass def __init__(self, test1, test2): @@ -144,6 +146,5 @@ def test_deep_copy(self): self.assertTrue(isinstance(r_copy, Reducer)) - if __name__ == "__main__": unittest.main() From d3b59ffe94ab61c5f20964a66af4fce4b139810f Mon Sep 17 00:00:00 2001 From: Sidhant Kohli Date: Mon, 22 Jan 2024 12:00:58 -0800 Subject: [PATCH 73/78] lint Signed-off-by: Sidhant Kohli --- pynumaflow/reducer/async_server.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pynumaflow/reducer/async_server.py b/pynumaflow/reducer/async_server.py index 623bb70b..529528fb 100644 --- a/pynumaflow/reducer/async_server.py +++ b/pynumaflow/reducer/async_server.py @@ -1,4 +1,3 @@ - import aiorun import grpc From 6b56383a6d11fb7d0f3a13160abdd23e1a7ca890 Mon Sep 17 00:00:00 2001 From: Sidhant Kohli Date: Mon, 22 Jan 2024 12:29:06 -0800 Subject: [PATCH 74/78] add reducer test Signed-off-by: Sidhant Kohli --- tests/reduce/test_async_reduce.py | 30 +++--- tests/reduce/test_async_reduce_err.py | 145 ++++++++++++++++++++++++++ 2 files changed, 163 insertions(+), 12 deletions(-) create mode 100644 tests/reduce/test_async_reduce_err.py diff --git a/tests/reduce/test_async_reduce.py b/tests/reduce/test_async_reduce.py index 4f0d288f..94b6c0ca 100644 --- a/tests/reduce/test_async_reduce.py +++ b/tests/reduce/test_async_reduce.py @@ -3,7 +3,6 @@ import threading import unittest from collections.abc import AsyncIterable -from collections.abc import Iterator import grpc from google.protobuf import empty_pb2 as _empty_pb2 @@ -17,6 +16,7 @@ Datum, Metadata, ReduceAsyncServer, + Reducer, ) from pynumaflow.proto.reducer import reduce_pb2, reduce_pb2_grpc from tests.testing_utils import ( @@ -79,22 +79,28 @@ def startup_callable(loop): loop.run_forever() -async def reduce_handler(keys: list[str], datums: Iterator[Datum], md: Metadata) -> Messages: - interval_window = md.interval_window - counter = 0 - async for _ in datums: - counter += 1 - msg = ( - f"counter:{counter} interval_window_start:{interval_window.start} " - f"interval_window_end:{interval_window.end}" - ) - return Messages(Message(str.encode(msg), keys=keys)) +class ExampleClass(Reducer): + def __init__(self): + self.counter = 0 + + async def handler( + self, keys: list[str], datums: AsyncIterable[Datum], md: Metadata + ) -> Messages: + interval_window = md.interval_window + self.counter = 0 + async for _ in datums: + self.counter += 1 + msg = ( + 
f"counter:{self.counter} interval_window_start:{interval_window.start} " + f"interval_window_end:{interval_window.end}" + ) + return Messages(Message(str.encode(msg), keys=keys)) def NewAsyncReducer( reduce_handler=async_reduce_handler, ): - server_instance = ReduceAsyncServer(reducer_instance=async_reduce_handler) + server_instance = ReduceAsyncServer(reducer_instance=ExampleClass()) udfs = server_instance.servicer return udfs diff --git a/tests/reduce/test_async_reduce_err.py b/tests/reduce/test_async_reduce_err.py new file mode 100644 index 00000000..6223e823 --- /dev/null +++ b/tests/reduce/test_async_reduce_err.py @@ -0,0 +1,145 @@ +import asyncio +import logging +import threading +import unittest +from collections.abc import AsyncIterable + +import grpc +from grpc.aio._server import Server + +from pynumaflow import setup_logging +from pynumaflow._constants import WIN_START_TIME, WIN_END_TIME +from pynumaflow.reducer import ( + Messages, + Message, + Datum, + Metadata, + ReduceAsyncServer, +) +from pynumaflow.proto.reducer import reduce_pb2, reduce_pb2_grpc +from tests.testing_utils import ( + mock_message, + mock_interval_window_start, + mock_interval_window_end, + get_time_args, +) + +LOGGER = setup_logging(__name__) + + +def request_generator(count, request, resetkey: bool = False): + for i in range(count): + if resetkey: + request.keys.extend([f"key-{i}"]) + yield request + + +def start_request() -> (Datum, tuple): + event_time_timestamp, watermark_timestamp = get_time_args() + + request = reduce_pb2.ReduceRequest( + value=mock_message(), + event_time=event_time_timestamp, + watermark=watermark_timestamp, + ) + metadata = ( + (WIN_START_TIME, f"{mock_interval_window_start()}"), + (WIN_END_TIME, f"{mock_interval_window_end()}"), + ) + return request, metadata + + +_s: Server = None +_channel = grpc.insecure_channel("unix:///tmp/reduce_err.sock") +_loop = None + + +def startup_callable(loop): + asyncio.set_event_loop(loop) + loop.run_forever() + + +async def err_handler(keys: list[str], datums: AsyncIterable[Datum], md: Metadata) -> Messages: + interval_window = md.interval_window + counter = 0 + async for _ in datums: + counter += 1 + msg = ( + f"counter:{counter} interval_window_start:{interval_window.start} " + f"interval_window_end:{interval_window.end}" + ) + raise RuntimeError("Got a runtime error from reduce handler.") + return Messages(Message(str.encode(msg), keys=keys)) + + +def NewAsyncReducer(): + server_instance = ReduceAsyncServer(reducer_instance=err_handler) + udfs = server_instance.servicer + + return udfs + + +async def start_server(udfs): + server = grpc.aio.server() + reduce_pb2_grpc.add_ReduceServicer_to_server(udfs, server) + listen_addr = "unix:///tmp/reduce_err.sock" + server.add_insecure_port(listen_addr) + logging.info("Starting server on %s", listen_addr) + global _s + _s = server + await server.start() + await server.wait_for_termination() + + +class TestAsyncReducerError(unittest.TestCase): + @classmethod + def setUpClass(cls) -> None: + global _loop + loop = asyncio.new_event_loop() + _loop = loop + _thread = threading.Thread(target=startup_callable, args=(loop,), daemon=True) + _thread.start() + udfs = NewAsyncReducer() + asyncio.run_coroutine_threadsafe(start_server(udfs), loop=loop) + while True: + try: + with grpc.insecure_channel("unix:///tmp/reduce_err.sock") as channel: + f = grpc.channel_ready_future(channel) + f.result(timeout=10) + if f.done(): + break + except grpc.FutureTimeoutError as e: + LOGGER.error("error trying to connect to grpc 
server") + LOGGER.error(e) + + @classmethod + def tearDownClass(cls) -> None: + try: + _loop.stop() + LOGGER.info("stopped the event loop") + except Exception as e: + LOGGER.error(e) + + def test_reduce(self) -> None: + stub = self.__stub() + request, metadata = start_request() + generator_response = None + try: + generator_response = stub.ReduceFn( + request_iterator=request_generator(count=10, request=request), metadata=metadata + ) + counter = 0 + for _ in generator_response: + counter += 1 + except Exception as err: + self.assertTrue("Got a runtime error from reduce handler." in err.__str__()) + return + self.fail("Expected an exception.") + + def __stub(self): + return reduce_pb2_grpc.ReduceStub(_channel) + + +if __name__ == "__main__": + logging.basicConfig(level=logging.DEBUG) + unittest.main() From 02afc012111523bf443d9f93f3a941e1c98026da Mon Sep 17 00:00:00 2001 From: Sidhant Kohli Date: Mon, 22 Jan 2024 22:12:03 -0800 Subject: [PATCH 75/78] change reduce signature Signed-off-by: Sidhant Kohli --- README.md | 2 +- examples/reduce/README.md | 65 +++++++++++++++++++ examples/reduce/counter/example.py | 10 ++- pynumaflow/reducer/_dtypes.py | 39 +++++++---- pynumaflow/reducer/async_server.py | 37 +++++++++-- pynumaflow/reducer/servicer/async_servicer.py | 24 ++++--- tests/reduce/test_async_reduce.py | 45 ++++++------- tests/reduce/test_async_reduce_err.py | 2 +- 8 files changed, 169 insertions(+), 55 deletions(-) create mode 100644 examples/reduce/README.md diff --git a/README.md b/README.md index 8faebb3e..af3809fa 100644 --- a/README.md +++ b/README.md @@ -45,7 +45,7 @@ pre-commit install - [Implement User Defined Source Transformers](https://github.com/numaproj/numaflow-python/tree/main/examples/sourcetransform) - Implement User Defined Functions - [Map](https://github.com/numaproj/numaflow-python/tree/main/examples/map) - - [Reduce](https://github.com/numaproj/numaflow-python/tree/main/examples/reduce/counter) + - [Reduce](https://github.com/numaproj/numaflow-python/tree/main/examples/reduce) - [Map Stream](https://github.com/numaproj/numaflow-python/tree/main/examples/mapstream) - [Implement User Defined Sinks](https://github.com/numaproj/numaflow-python/tree/main/examples/sink) - [Implement User Defined SideInputs](https://github.com/numaproj/numaflow-python/tree/main/examples/sideinput) diff --git a/examples/reduce/README.md b/examples/reduce/README.md new file mode 100644 index 00000000..130dc6ee --- /dev/null +++ b/examples/reduce/README.md @@ -0,0 +1,65 @@ +# Reducer in Python + +For creating a reducer UDF we can use two different approaches: +- Class based reducer + - For the class based reducer we need to implement a class that inherits from the `Reducer` class and implements the required methods. + - Next we need to create a `ReduceAsyncServer` instance and pass the reducer class to it along with any input args or + kwargs that the custom reducer class requires. + - Finally we need to call the `start` method on the `ReduceAsyncServer` instance to start the reducer server. 
+  ```python
+  from collections.abc import AsyncIterable
+
+  from pynumaflow.reducer import Messages, Message, Datum, Metadata, Reducer, ReduceAsyncServer
+
+  class Example(Reducer):
+      def __init__(self, counter):
+          self.counter = counter
+
+      async def handler(
+          self, keys: list[str], datums: AsyncIterable[Datum], md: Metadata
+      ) -> Messages:
+          interval_window = md.interval_window
+          self.counter = 0
+          async for _ in datums:
+              self.counter += 1
+          msg = (
+              f"counter:{self.counter} interval_window_start:{interval_window.start} "
+              f"interval_window_end:{interval_window.end}"
+          )
+          return Messages(Message(str.encode(msg), keys=keys))
+
+  if __name__ == "__main__":
+      # Here we are using the class instance as the reducer_instance
+      # which will be used to invoke the handler function.
+      # We are passing the init_args for the class instance.
+      grpc_server = ReduceAsyncServer(Example, init_args=(0,))
+      grpc_server.start()
+  ```
+
+- Function based reducer
+  - For the function based reducer we need to create a function of the following signature, which takes the required arguments and returns the `Messages` object:
+  ```python
+  async def handler(keys: list[str], datums: AsyncIterable[Datum], md: Metadata) -> Messages:
+  ```
+  - Next we need to create a `ReduceAsyncServer` instance and pass the function to it.
+  - Finally we need to call the `start` method on the `ReduceAsyncServer` instance to start the reducer server.
+  - We must ensure that no init_args or init_kwargs are passed to the `ReduceAsyncServer` instance as they are not used for function based reducers.
+  ```python
+  from collections.abc import AsyncIterable
+
+  from pynumaflow.reducer import Messages, Message, Datum, Metadata, ReduceAsyncServer
+
+  async def handler(keys: list[str], datums: AsyncIterable[Datum], md: Metadata) -> Messages:
+      counter = 0
+      interval_window = md.interval_window
+      async for _ in datums:
+          counter += 1
+      msg = (
+          f"counter:{counter} interval_window_start:{interval_window.start} "
+          f"interval_window_end:{interval_window.end}"
+      )
+      return Messages(Message(str.encode(msg), keys=keys))
+
+  if __name__ == "__main__":
+      # Here we are using the function as the reducer_instance
+      # which will be used to invoke the handler function.
+      grpc_server = ReduceAsyncServer(handler)
+      grpc_server.start()
+  ```
+
+
diff --git a/examples/reduce/counter/example.py b/examples/reduce/counter/example.py
index 1ac4fee8..02856894 100644
--- a/examples/reduce/counter/example.py
+++ b/examples/reduce/counter/example.py
@@ -5,8 +5,8 @@
 
 
 class Example(Reducer):
-    def __init__(self):
-        self.counter = 0
+    def __init__(self, counter):
+        self.counter = counter
 
     async def handler(
         self, keys: list[str], datums: AsyncIterable[Datum], md: Metadata
@@ -37,7 +37,11 @@ async def reduce_handler(keys: list[str], datums: AsyncIterable[Datum], md: Meta
 
 if __name__ == "__main__":
     invoke = os.getenv("INVOKE", "handler")
     if invoke == "class":
-        grpc_server = ReduceAsyncServer(Example())
+        # Here we are using the class instance as the reducer_instance
+        # which will be used to invoke the handler function.
+        # We are passing the init_args for the class instance.
+        grpc_server = ReduceAsyncServer(Example, init_args=(0,))
     else:
+        # Here we are using the handler function directly as the reducer_instance.
         grpc_server = ReduceAsyncServer(reduce_handler)
     grpc_server.start()
diff --git a/pynumaflow/reducer/_dtypes.py b/pynumaflow/reducer/_dtypes.py
index 1f4e9c0b..ee6c41f8 100644
--- a/pynumaflow/reducer/_dtypes.py
+++ b/pynumaflow/reducer/_dtypes.py
@@ -1,7 +1,6 @@
 from abc import ABCMeta, abstractmethod
 from asyncio import Task
 from collections.abc import Iterator, Sequence, Awaitable
-from copy import deepcopy
 from dataclasses import dataclass
 from datetime import datetime
 from typing import TypeVar, Callable, Union
@@ -234,6 +233,9 @@ def keys(self) -> list[str]:
         return self._key
 
 
+ReduceAsyncCallable = Callable[[list[str], AsyncIterable[Datum], Metadata], Awaitable[Messages]]
+
+
 class Reducer(metaclass=ABCMeta):
     """
     Provides an interface to write a Reducer
@@ -247,17 +249,6 @@ def __call__(self, *args, **kwargs):
         """
         return self.handler(*args, **kwargs)
 
-    def __deepcopy__(self, memo):
-        """
-        Allow to deepcopy the class instance.
-        """
-        cls = self.__class__
-        result = cls.__new__(cls)
-        memo[id(self)] = result
-        for k, v in self.__dict__.items():
-            setattr(result, k, deepcopy(v, memo))
-        return result
-
     @abstractmethod
     async def handler(
         self, keys: list[str], datums: AsyncIterable[Datum], md: Metadata
@@ -268,6 +259,26 @@ async def handler(
         pass
 
 
-ReduceAsyncCallable = Callable[[list[str], AsyncIterable[Datum], Metadata], Awaitable[Messages]]
+class ReduceBuilderClass:
+    """
+    Class to build a Reducer class instance.
+    Args:
+        reducer_class: the reducer class to be used for Reduce UDF
+        args: the arguments to be passed to the reducer class
+        kwargs: the keyword arguments to be passed to the reducer class
+    """
+
+    def __init__(self, reducer_class: type[Reducer], args: tuple, kwargs: dict):
+        self._reducer_class: type[Reducer] = reducer_class
+        self._args = args
+        self._kwargs = kwargs
+
+    def create(self) -> Reducer:
+        """
+        Create a new Reducer instance.
+        """
+        return self._reducer_class(*self._args, **self._kwargs)
+
+
 # ReduceCallable is a callable which can be used as a handler for the Reduce UDF.
-ReduceCallable = Union[ReduceAsyncCallable, Reducer]
+ReduceCallable = Union[ReduceAsyncCallable, type[Reducer]]
diff --git a/pynumaflow/reducer/async_server.py b/pynumaflow/reducer/async_server.py
index 529528fb..b35a475c 100644
--- a/pynumaflow/reducer/async_server.py
+++ b/pynumaflow/reducer/async_server.py
@@ -1,3 +1,5 @@
+import inspect
+
 import aiorun
 import grpc
 
@@ -12,11 +14,34 @@
     _LOGGER,
 )
 
-from pynumaflow.reducer._dtypes import ReduceCallable
+from pynumaflow.reducer._dtypes import (
+    ReduceCallable,
+    ReduceBuilderClass,
+    Reducer,
+)
 
 from pynumaflow.shared.server import NumaflowServer, start_async_server
 
 
+def get_handler(reducer_handler: ReduceCallable, init_args: tuple = (), init_kwargs: dict = None):
+    """
+    Get the correct handler type based on the arguments passed
+    """
+    if inspect.isfunction(reducer_handler):
+        if len(init_args) > 0 or len(init_kwargs) > 0:
+            # if the init_args or init_kwargs are passed, then the reducer_handler
+            # can only be of class Reducer type
+            raise TypeError("Cannot pass function handler with init args or kwargs")
+        # return the function handler
+        return reducer_handler
+    elif issubclass(reducer_handler, Reducer):
+        # if handler is type of Class Reducer, create a new instance of
+        # a ReducerBuilderClass
+        return ReduceBuilderClass(reducer_handler, init_args, init_kwargs)
+    else:
+        raise TypeError("Invalid type passed")
+
+
 class ReduceAsyncServer(NumaflowServer):
     """
     Class for a new Reduce Server instance.
@@ -24,7 +49,9 @@ class ReduceAsyncServer(NumaflowServer): def __init__( self, - reducer_instance: ReduceCallable, + reducer_handler: ReduceCallable, + init_args: tuple = (), + init_kwargs: dict = None, sock_path=REDUCE_SOCK_PATH, max_message_size=MAX_MESSAGE_SIZE, max_threads=MAX_THREADS, @@ -41,7 +68,9 @@ def __init__( defaults to number of processors x4 server_type: The type of server to be used """ - self.reducer_instance: ReduceCallable = reducer_instance + if init_kwargs is None: + init_kwargs = {} + self.reducer_handler = get_handler(reducer_handler, init_args, init_kwargs) self.sock_path = f"unix://{sock_path}" self.max_message_size = max_message_size self.max_threads = max_threads @@ -51,7 +80,7 @@ def __init__( ("grpc.max_receive_message_length", self.max_message_size), ] # Get the servicer instance for the async server - self.servicer = AsyncReduceServicer(reducer_instance) + self.servicer = AsyncReduceServicer(self.reducer_handler) def start(self): """ diff --git a/pynumaflow/reducer/servicer/async_servicer.py b/pynumaflow/reducer/servicer/async_servicer.py index 4ca6df86..d80d2752 100644 --- a/pynumaflow/reducer/servicer/async_servicer.py +++ b/pynumaflow/reducer/servicer/async_servicer.py @@ -1,8 +1,8 @@ import asyncio -from copy import deepcopy from datetime import datetime, timezone from collections.abc import AsyncIterable +from typing import Union import grpc from google.protobuf import empty_pb2 as _empty_pb2 @@ -13,8 +13,14 @@ STREAM_EOF, DELIMITER, ) -from pynumaflow.reducer._dtypes import Datum, IntervalWindow, Metadata, Reducer -from pynumaflow.reducer._dtypes import ReduceResult, ReduceCallable +from pynumaflow.reducer._dtypes import ( + Datum, + IntervalWindow, + Metadata, + ReduceAsyncCallable, + ReduceBuilderClass, +) +from pynumaflow.reducer._dtypes import ReduceResult from pynumaflow.reducer.servicer.asynciter import NonBlockingIterator from pynumaflow.proto.reducer import reduce_pb2, reduce_pb2_grpc from pynumaflow.types import NumaflowServicerContext @@ -43,13 +49,14 @@ class AsyncReduceServicer(reduce_pb2_grpc.ReduceServicer): def __init__( self, - handler: ReduceCallable, + handler: Union[ReduceAsyncCallable, ReduceBuilderClass], ): # Collection for storing strong references to all running tasks. # Event loop only keeps a weak reference, which can cause it to # get lost during execution. self.background_tasks = set() - self.__reduce_handler: ReduceCallable = handler + # The reduce handler can be a function or a builder class instance. + self.__reduce_handler: Union[ReduceAsyncCallable, ReduceBuilderClass] = handler async def ReduceFn( self, @@ -143,11 +150,12 @@ async def __invoke_reduce( self, keys: list[str], request_iterator: AsyncIterable[Datum], md: Metadata ): new_instance = self.__reduce_handler - # If the reduce handler is a class instance, create a new copy of it. + # If the reduce handler is a class instance, create a new instance of it. 
# It is required for a new key to be processed by a # new instance of the reducer for a given window - if isinstance(self.__reduce_handler, Reducer): - new_instance = deepcopy(self.__reduce_handler) + # Otherwise the function handler can be called directly + if isinstance(self.__reduce_handler, ReduceBuilderClass): + new_instance = self.__reduce_handler.create() try: msgs = await new_instance(keys, request_iterator, md) except Exception as err: diff --git a/tests/reduce/test_async_reduce.py b/tests/reduce/test_async_reduce.py index 94b6c0ca..e585c95a 100644 --- a/tests/reduce/test_async_reduce.py +++ b/tests/reduce/test_async_reduce.py @@ -28,24 +28,6 @@ LOGGER = setup_logging(__name__) -# if set to true, map handler will raise a `ValueError` exception. -raise_error_from_map = False - - -async def async_reduce_handler( - keys: list[str], datums: AsyncIterable[Datum], md: Metadata -) -> Messages: - interval_window = md.interval_window - counter = 0 - async for _ in datums: - counter += 1 - msg = ( - f"counter:{counter} interval_window_start:{interval_window.start} " - f"interval_window_end:{interval_window.end}" - ) - - return Messages(Message(str.encode(msg), keys=keys)) - def request_generator(count, request, resetkey: bool = False): for i in range(count): @@ -80,8 +62,8 @@ def startup_callable(loop): class ExampleClass(Reducer): - def __init__(self): - self.counter = 0 + def __init__(self, counter): + self.counter = counter async def handler( self, keys: list[str], datums: AsyncIterable[Datum], md: Metadata @@ -97,10 +79,20 @@ async def handler( return Messages(Message(str.encode(msg), keys=keys)) -def NewAsyncReducer( - reduce_handler=async_reduce_handler, -): - server_instance = ReduceAsyncServer(reducer_instance=ExampleClass()) +async def err_handler(keys: list[str], datums: AsyncIterable[Datum], md: Metadata) -> Messages: + interval_window = md.interval_window + counter = 0 + async for _ in datums: + counter += 1 + msg = ( + f"counter:{counter} interval_window_start:{interval_window.start} " + f"interval_window_end:{interval_window.end}" + ) + return Messages(Message(str.encode(msg), keys=keys)) + + +def NewAsyncReducer(): + server_instance = ReduceAsyncServer(ExampleClass, init_args=(0,)) udfs = server_instance.servicer return udfs @@ -240,8 +232,13 @@ def __stub(self): return reduce_pb2_grpc.ReduceStub(_channel) def test_error_init(self): + # Check that reducer_handler in required with self.assertRaises(TypeError): ReduceAsyncServer() + # Check that the init_args and init_kwargs are passed + # only with a Reducer class + with self.assertRaises(TypeError): + ReduceAsyncServer(err_handler, init_args=(0, 1)) if __name__ == "__main__": diff --git a/tests/reduce/test_async_reduce_err.py b/tests/reduce/test_async_reduce_err.py index 6223e823..8da36d0c 100644 --- a/tests/reduce/test_async_reduce_err.py +++ b/tests/reduce/test_async_reduce_err.py @@ -73,7 +73,7 @@ async def err_handler(keys: list[str], datums: AsyncIterable[Datum], md: Metadat def NewAsyncReducer(): - server_instance = ReduceAsyncServer(reducer_instance=err_handler) + server_instance = ReduceAsyncServer(err_handler) udfs = server_instance.servicer return udfs From d8bcefe52c99c5bedc8b2689615a076200f238f1 Mon Sep 17 00:00:00 2001 From: Sidhant Kohli Date: Tue, 23 Jan 2024 12:02:59 -0800 Subject: [PATCH 76/78] comments Signed-off-by: Sidhant Kohli --- examples/map/forward_message/example.py | 14 ++-- examples/mapstream/flatmap_stream/example.py | 6 +- .../mapstream/flatmap_stream/pipeline.yaml | 2 +- 
examples/reduce/counter/Makefile | 2 +- examples/reduce/counter/example.py | 6 +- examples/reduce/counter/pipeline.yaml | 4 +- examples/reduce/counter/pyproject.toml | 3 +- .../sideinput/simple-sideinput/example.py | 2 +- examples/sink/async_log/example.py | 7 +- .../sink/async_log/pipeline-numaflow.yaml | 2 +- examples/sink/log/example.py | 7 +- examples/sink/log/pipeline-numaflow.yaml | 2 +- pynumaflow/mapper/_dtypes.py | 2 +- pynumaflow/mapper/async_server.py | 22 ++++++ pynumaflow/mapper/multiproc_server.py | 45 +++++++++-- pynumaflow/mapper/sync_server.py | 36 +++++++++ pynumaflow/mapstreamer/_dtypes.py | 2 +- pynumaflow/mapstreamer/async_server.py | 53 +++++++++++-- pynumaflow/reducer/_dtypes.py | 4 +- pynumaflow/reducer/async_server.py | 56 ++++++++++++++ pynumaflow/sideinput/server.py | 34 ++++++++- pynumaflow/sinker/_dtypes.py | 2 +- pynumaflow/sinker/async_server.py | 56 +++++++++++--- pynumaflow/sinker/server.py | 32 ++++++++ pynumaflow/sourcer/_dtypes.py | 2 +- pynumaflow/sourcer/async_server.py | 75 +++++++++++++++++-- pynumaflow/sourcer/server.py | 73 ++++++++++++++++-- pynumaflow/sourcetransformer/_dtypes.py | 2 +- .../sourcetransformer/multiproc_server.py | 64 ++++++++++++++-- pynumaflow/sourcetransformer/server.py | 64 ++++++++++++++-- 30 files changed, 597 insertions(+), 84 deletions(-) diff --git a/examples/map/forward_message/example.py b/examples/map/forward_message/example.py index ca0d2217..9a6c9d09 100644 --- a/examples/map/forward_message/example.py +++ b/examples/map/forward_message/example.py @@ -3,7 +3,7 @@ from pynumaflow.mapper import Messages, Message, Datum, MapServer, Mapper -class Example(Mapper): +class MessageForwarder(Mapper): """ This is a class that inherits from the Mapper class. It implements the handler method that is called for each datum. 
@@ -13,18 +13,14 @@ def handler(self, keys: list[str], datum: Datum) -> Messages: val = datum.value _ = datum.event_time _ = datum.watermark - messages = Messages() - messages.append(Message(value=val, keys=keys)) - return messages + return Messages(Message(value=val, keys=keys)) def my_handler(keys: list[str], datum: Datum) -> Messages: val = datum.value _ = datum.event_time _ = datum.watermark - messages = Messages() - messages.append(Message(value=val, keys=keys)) - return messages + return Messages(Message(value=val, keys=keys)) if __name__ == "__main__": @@ -33,9 +29,9 @@ def my_handler(keys: list[str], datum: Datum) -> Messages: based on the env variable Both can be used and passed directly to the server class """ - invoke = os.getenv("INVOKE", "handler") + invoke = os.getenv("INVOKE", "func_handler") if invoke == "class": - handler = Example() + handler = MessageForwarder() else: handler = my_handler grpc_server = MapServer(handler) diff --git a/examples/mapstream/flatmap_stream/example.py b/examples/mapstream/flatmap_stream/example.py index 310edd1a..e25a11ab 100644 --- a/examples/mapstream/flatmap_stream/example.py +++ b/examples/mapstream/flatmap_stream/example.py @@ -3,7 +3,7 @@ from pynumaflow.mapstreamer import Message, Datum, MapStreamAsyncServer, MapStreamer -class Example(MapStreamer): +class FlatMapStream(MapStreamer): async def handler(self, keys: list[str], datum: Datum) -> AsyncIterable[Message]: """ A handler that splits the input datum value into multiple strings by `,` separator and @@ -39,9 +39,9 @@ async def map_stream_handler(_: list[str], datum: Datum) -> AsyncIterable[Messag if __name__ == "__main__": - invoke = os.getenv("INVOKE", "handler") + invoke = os.getenv("INVOKE", "func_handler") if invoke == "class": - handler = Example() + handler = FlatMapStream() else: handler = map_stream_handler grpc_server = MapStreamAsyncServer(handler) diff --git a/examples/mapstream/flatmap_stream/pipeline.yaml b/examples/mapstream/flatmap_stream/pipeline.yaml index b711f522..eb568f72 100644 --- a/examples/mapstream/flatmap_stream/pipeline.yaml +++ b/examples/mapstream/flatmap_stream/pipeline.yaml @@ -26,7 +26,7 @@ spec: - name: PYTHONDEBUG value: "true" - name : INVOKE - value: "handler" + value: "func_handler" containerTemplate: resources: limits: diff --git a/examples/reduce/counter/Makefile b/examples/reduce/counter/Makefile index 893f8a00..cbdb5d33 100644 --- a/examples/reduce/counter/Makefile +++ b/examples/reduce/counter/Makefile @@ -1,6 +1,6 @@ .PHONY: image image: - docker build -t "quay.io/numaio/numaflow-python/reduce-counter:v0.6.1" . + docker build --no-cache -t "quay.io/kohlisid/numaflow-python/reduce:refactor1" . # Github CI runner uses platform linux/amd64. If your local environment don't, the image built by command above might not work # under the CI E2E test environment. 
# To build an image that supports multiple platforms(linux/amd64,linux/arm64) and push to quay.io, use the following command diff --git a/examples/reduce/counter/example.py b/examples/reduce/counter/example.py index 02856894..124f010c 100644 --- a/examples/reduce/counter/example.py +++ b/examples/reduce/counter/example.py @@ -4,7 +4,7 @@ from pynumaflow.reducer import Messages, Message, Datum, Metadata, ReduceAsyncServer, Reducer -class Example(Reducer): +class ReduceCounter(Reducer): def __init__(self, counter): self.counter = counter @@ -35,12 +35,12 @@ async def reduce_handler(keys: list[str], datums: AsyncIterable[Datum], md: Meta if __name__ == "__main__": - invoke = os.getenv("INVOKE", "handler") + invoke = os.getenv("INVOKE", "func_handler") if invoke == "class": # Here we are using the class instance as the reducer_instance # which will be used to invoke the handler function. # We are passing the init_args for the class instance. - grpc_server = ReduceAsyncServer(Example, init_args=(0,)) + grpc_server = ReduceAsyncServer(ReduceCounter, init_args=(0,)) else: # Here we are using the handler function directly as the reducer_instance. grpc_server = ReduceAsyncServer(reduce_handler) diff --git a/examples/reduce/counter/pipeline.yaml b/examples/reduce/counter/pipeline.yaml index 4fb35f4b..10d53704 100644 --- a/examples/reduce/counter/pipeline.yaml +++ b/examples/reduce/counter/pipeline.yaml @@ -18,7 +18,7 @@ spec: udf: container: # compute the sum - image: quay.io/numaio/numaflow-python/reduce-counter:v0.6.1 + image: quay.io/kohlisid/numaflow-python/reduce:refactor1 imagePullPolicy: Always env: - name: PYTHONDEBUG @@ -34,7 +34,7 @@ spec: persistentVolumeClaim: volumeSize: 10Gi accessMode: ReadWriteOnce - partitions: 2 + partitions: 1 - name: sink scale: min: 1 diff --git a/examples/reduce/counter/pyproject.toml b/examples/reduce/counter/pyproject.toml index dc3cc41d..345ec1d3 100644 --- a/examples/reduce/counter/pyproject.toml +++ b/examples/reduce/counter/pyproject.toml @@ -6,7 +6,8 @@ authors = ["Numaflow developers"] [tool.poetry.dependencies] python = "~3.10" -pynumaflow = "~0.7.0" +#pynumaflow = "~0.7.0" +pynumaflow = {git = "https://github.com/kohlisid/numaflow-python", rev = "class-refactor"} [tool.poetry.dev-dependencies] diff --git a/examples/sideinput/simple-sideinput/example.py b/examples/sideinput/simple-sideinput/example.py index acef3779..c4265711 100644 --- a/examples/sideinput/simple-sideinput/example.py +++ b/examples/sideinput/simple-sideinput/example.py @@ -12,7 +12,7 @@ def retrieve_handler(self) -> Response: """ time_now = datetime.datetime.now() # val is the value to be broadcasted - val = "an example:" + str(time_now) + val = f"an example: {str(time_now)}" self.counter += 1 # broadcast every other time if self.counter % 2 == 0: diff --git a/examples/sink/async_log/example.py b/examples/sink/async_log/example.py index a42a4c96..d7968115 100644 --- a/examples/sink/async_log/example.py +++ b/examples/sink/async_log/example.py @@ -4,13 +4,14 @@ from pynumaflow.sinker import Datum, Responses, Response, Sinker from pynumaflow.sinker import SinkAsyncServer +from pynumaflow._constants import _LOGGER class UserDefinedSink(Sinker): async def handler(self, datums: AsyncIterable[Datum]) -> Responses: responses = Responses() async for msg in datums: - print("User Defined Sink", msg.value.decode("utf-8")) + _LOGGER.info("User Defined Sink %s", msg.value.decode("utf-8")) responses.append(Response.as_success(msg.id)) return responses @@ -18,13 +19,13 @@ async def handler(self, 
datums: AsyncIterable[Datum]) -> Responses: async def udsink_handler(datums: AsyncIterable[Datum]) -> Responses: responses = Responses() async for msg in datums: - print("User Defined Sink", msg.value.decode("utf-8")) + _LOGGER.info("User Defined Sink %s", msg.value.decode("utf-8")) responses.append(Response.as_success(msg.id)) return responses if __name__ == "__main__": - invoke = os.getenv("INVOKE", "handler") + invoke = os.getenv("INVOKE", "func_handler") if invoke == "class": sink_handler = UserDefinedSink() else: diff --git a/examples/sink/async_log/pipeline-numaflow.yaml b/examples/sink/async_log/pipeline-numaflow.yaml index 26d10ac2..b2e23ac8 100644 --- a/examples/sink/async_log/pipeline-numaflow.yaml +++ b/examples/sink/async_log/pipeline-numaflow.yaml @@ -27,7 +27,7 @@ spec: - name: PYTHONDEBUG value: "true" - name: INVOKE - value: "handler" + value: "func_handler" - name: log-output sink: log: {} diff --git a/examples/sink/log/example.py b/examples/sink/log/example.py index 187eb000..391529ad 100644 --- a/examples/sink/log/example.py +++ b/examples/sink/log/example.py @@ -3,13 +3,14 @@ from pynumaflow.sinker import Datum, Responses, Response, SinkServer from pynumaflow.sinker import Sinker +from pynumaflow._constants import _LOGGER class UserDefinedSink(Sinker): def handler(self, datums: Iterator[Datum]) -> Responses: responses = Responses() for msg in datums: - print("User Defined Sink", msg.value.decode("utf-8")) + _LOGGER.info("User Defined Sink %s", msg.value.decode("utf-8")) responses.append(Response.as_success(msg.id)) return responses @@ -17,13 +18,13 @@ def handler(self, datums: Iterator[Datum]) -> Responses: def udsink_handler(datums: Iterator[Datum]) -> Responses: responses = Responses() for msg in datums: - print("User Defined Sink", msg.value.decode("utf-8")) + _LOGGER.info("User Defined Sink %s", msg.value.decode("utf-8")) responses.append(Response.as_success(msg.id)) return responses if __name__ == "__main__": - invoke = os.getenv("INVOKE", "handler") + invoke = os.getenv("INVOKE", "func_handler") if invoke == "class": sink_handler = UserDefinedSink() else: diff --git a/examples/sink/log/pipeline-numaflow.yaml b/examples/sink/log/pipeline-numaflow.yaml index 0f9ab368..609ed58c 100644 --- a/examples/sink/log/pipeline-numaflow.yaml +++ b/examples/sink/log/pipeline-numaflow.yaml @@ -27,7 +27,7 @@ spec: - name: PYTHONDEBUG value: "true" - name: INVOKE - value: "handler" + value: "func_handler" - name: log-output sink: log: {} diff --git a/pynumaflow/mapper/_dtypes.py b/pynumaflow/mapper/_dtypes.py index c7fda744..607ef4c8 100644 --- a/pynumaflow/mapper/_dtypes.py +++ b/pynumaflow/mapper/_dtypes.py @@ -179,7 +179,7 @@ class instance is sent as a callable. @abstractmethod def handler(self, keys: list[str], datum: Datum) -> Messages: """ - Write a handler function which implements the MapSyncCallable interface. + Implement this handler function which implements the MapSyncCallable interface. """ pass diff --git a/pynumaflow/mapper/async_server.py b/pynumaflow/mapper/async_server.py index 85b7ff2e..534c6606 100644 --- a/pynumaflow/mapper/async_server.py +++ b/pynumaflow/mapper/async_server.py @@ -20,6 +20,28 @@ class MapAsyncServer(NumaflowServer): """ Create a new grpc Map Server instance. 
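+
+    Note: the handler passed to this server is expected to be a coroutine
+    (defined with async def), since requests are served on the asyncio event loop.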
+ Args: + mapper_instance: The mapper instance to be used for Map UDF + sock_path: The UNIX socket path to be used for the server + max_message_size: The max message size in bytes the server can receive and send + max_threads: The max number of threads to be spawned; + defaults to number of processors x4 + + Example invocation: + from pynumaflow.mapper import Messages, Message, Datum, MapAsyncServer + async def async_map_handler(keys: list[str], datum: Datum) -> Messages: + val = datum.value + msg = "payload:{} event_time:{} watermark:{}".format( + val.decode("utf-8"), + datum.event_time, + datum.watermark, + ) + val = bytes(msg, encoding="utf-8") + return Messages(Message(value=val, keys=keys)) + + if __name__ == "__main__": + grpc_server = MapAsyncServer(async_map_handler) + grpc_server.start() """ def __init__( diff --git a/pynumaflow/mapper/multiproc_server.py b/pynumaflow/mapper/multiproc_server.py index b76dbd3d..e91a1fa2 100644 --- a/pynumaflow/mapper/multiproc_server.py +++ b/pynumaflow/mapper/multiproc_server.py @@ -33,12 +33,45 @@ def __init__( A new servicer instance is created and attached to the server. The server instance is returned. Args: - mapper_instance: The mapper instance to be used for Map UDF - server_count: The number of grpc server instances to be forked for multiproc - sock_path: The UNIX socket path to be used for the server - max_message_size: The max message size in bytes the server can receive and send - max_threads: The max number of threads to be spawned; - defaults to number of processors x4 + mapper_instance: The mapper instance to be used for Map UDF + server_count: The number of grpc server instances to be forked for multiproc + sock_path: The UNIX socket path to be used for the server + max_message_size: The max message size in bytes the server can receive and send + max_threads: The max number of threads to be spawned; + defaults to number of processors x4 + + Example invocation: + import math + import os + from pynumaflow.mapper import Messages, Message, Datum, Mapper, MapMultiprocServer + + def is_prime(n): + for i in range(2, int(math.ceil(math.sqrt(n)))): + if n % i == 0: + return False + else: + return True + + class PrimeMap(Mapper): + def handler(self, keys: list[str], datum: Datum) -> Messages: + val = datum.value + _ = datum.event_time + _ = datum.watermark + messages = Messages() + for i in range(2, 100000): + is_prime(i) + messages.append(Message(val, keys=keys)) + return messages + + if __name__ == "__main__": + # To set the env server_count value set the env variable + # NUM_CPU_MULTIPROC="N" + server_count = int(os.getenv("NUM_CPU_MULTIPROC", "2")) + prime_class = PrimeMap() + # Server count is the number of server processes to start + grpc_server = MapMultiprocServer(prime_class, server_count=server_count) + grpc_server.start() + """ self.sock_path = f"unix://{sock_path}" self.max_threads = min(max_threads, int(os.getenv("MAX_THREADS", "4"))) diff --git a/pynumaflow/mapper/sync_server.py b/pynumaflow/mapper/sync_server.py index 957b9784..a45f582d 100644 --- a/pynumaflow/mapper/sync_server.py +++ b/pynumaflow/mapper/sync_server.py @@ -21,6 +21,42 @@ class MapServer(NumaflowServer): """ Create a new grpc Map Server instance. 
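+
+    Note: both a Mapper subclass instance and a plain function with the matching
+    signature are accepted here, as the example invocation below shows.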
+ Args: + mapper_instance: The mapper instance to be used for Map UDF + sock_path: The UNIX socket path to be used for the server + max_message_size: The max message size in bytes the server can receive and send + max_threads: The max number of threads to be spawned; + defaults to number of processors x4 + + Example Invocation: + from pynumaflow.mapper import Messages, Message, Datum, MapServer, Mapper + + class MessageForwarder(Mapper): + def handler(self, keys: list[str], datum: Datum) -> Messages: + val = datum.value + _ = datum.event_time + _ = datum.watermark + return Messages(Message(value=val, keys=keys)) + + def my_handler(keys: list[str], datum: Datum) -> Messages: + val = datum.value + _ = datum.event_time + _ = datum.watermark + return Messages(Message(value=val, keys=keys)) + + + if __name__ == "__main__": + Use the class based approach or function based handler + based on the env variable + Both can be used and passed directly to the server class + + invoke = os.getenv("INVOKE", "func_handler") + if invoke == "class": + handler = MessageForwarder() + else: + handler = my_handler + grpc_server = MapServer(handler) + grpc_server.start() """ def __init__( diff --git a/pynumaflow/mapstreamer/_dtypes.py b/pynumaflow/mapstreamer/_dtypes.py index a5363370..2a467b9c 100644 --- a/pynumaflow/mapstreamer/_dtypes.py +++ b/pynumaflow/mapstreamer/_dtypes.py @@ -182,7 +182,7 @@ def __call__(self, *args, **kwargs): @abstractmethod async def handler(self, keys: list[str], datum: Datum) -> AsyncIterable[Message]: """ - Write a handler function which implements the MapSyncCallable interface. + Implement this handler function which implements the MapSyncCallable interface. """ pass diff --git a/pynumaflow/mapstreamer/async_server.py b/pynumaflow/mapstreamer/async_server.py index 415b8399..1092c456 100644 --- a/pynumaflow/mapstreamer/async_server.py +++ b/pynumaflow/mapstreamer/async_server.py @@ -35,12 +35,53 @@ def __init__( A new servicer instance is created and attached to the server. The server instance is returned. 
Args: - map_stream_instance: The map stream instance to be used for Map Stream UDF - sock_path: The UNIX socket path to be used for the server - max_message_size: The max message size in bytes the server can receive and send - max_threads: The max number of threads to be spawned; - defaults to number of processors x4 - server_type: The type of server to be used + map_stream_instance: The map stream instance to be used for Map Stream UDF + sock_path: The UNIX socket path to be used for the server + max_message_size: The max message size in bytes the server can receive and send + max_threads: The max number of threads to be spawned; + defaults to number of processors x4 + server_type: The type of server to be used + + Example invocation: + import os + from collections.abc import AsyncIterable + from pynumaflow.mapstreamer import Message, Datum, MapStreamAsyncServer, MapStreamer + + class FlatMapStream(MapStreamer): + async def handler(self, keys: list[str], datum: Datum) -> AsyncIterable[Message]: + val = datum.value + _ = datum.event_time + _ = datum.watermark + strs = val.decode("utf-8").split(",") + + if len(strs) == 0: + yield Message.to_drop() + return + for s in strs: + yield Message(str.encode(s)) + + async def map_stream_handler(_: list[str], datum: Datum) -> AsyncIterable[Message]: + + val = datum.value + _ = datum.event_time + _ = datum.watermark + strs = val.decode("utf-8").split(",") + + if len(strs) == 0: + yield Message.to_drop() + return + for s in strs: + yield Message(str.encode(s)) + + if __name__ == "__main__": + invoke = os.getenv("INVOKE", "func_handler") + if invoke == "class": + handler = FlatMapStream() + else: + handler = map_stream_handler + grpc_server = MapStreamAsyncServer(handler) + grpc_server.start() + """ self.map_stream_instance: MapStreamCallable = map_stream_instance self.sock_path = f"unix://{sock_path}" diff --git a/pynumaflow/reducer/_dtypes.py b/pynumaflow/reducer/_dtypes.py index ee6c41f8..8217a985 100644 --- a/pynumaflow/reducer/_dtypes.py +++ b/pynumaflow/reducer/_dtypes.py @@ -254,7 +254,7 @@ async def handler( self, keys: list[str], datums: AsyncIterable[Datum], md: Metadata ) -> Messages: """ - Write a handler function which implements the ReduceCallable interface. + Implement this handler function which implements the ReduceCallable interface. """ pass @@ -262,6 +262,8 @@ async def handler( class ReduceBuilderClass: """ Class to build a Reducer class instance. + Used Internally + Args: reducer_class: the reducer class to be used for Reduce UDF args: the arguments to be passed to the reducer class diff --git a/pynumaflow/reducer/async_server.py b/pynumaflow/reducer/async_server.py index b35a475c..6e074f9f 100644 --- a/pynumaflow/reducer/async_server.py +++ b/pynumaflow/reducer/async_server.py @@ -45,6 +45,62 @@ def get_handler(reducer_handler: ReduceCallable, init_args: tuple = (), init_kwa class ReduceAsyncServer(NumaflowServer): """ Class for a new Reduce Server instance. + A new servicer instance is created and attached to the server. + The server instance is returned. 
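+
+    The reducer_handler can be either a Reducer subclass, from which a new
+    instance is created for every key and window (using init_args/init_kwargs),
+    or an async handler function with the matching signature.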
+ Args: + reducer_handler: The reducer instance to be used for Reduce UDF + sock_path: The UNIX socket path to be used for the server + max_message_size: The max message size in bytes the server can receive and send + max_threads: The max number of threads to be spawned; + defaults to number of processors x4 + Example invocation: + import os + from collections.abc import AsyncIterable + from pynumaflow.reducer import Messages, Message, Datum, Metadata, + ReduceAsyncServer, Reducer + + class ReduceCounter(Reducer): + def __init__(self, counter): + self.counter = counter + + async def handler( + self, keys: list[str], datums: AsyncIterable[Datum], md: Metadata + ) -> Messages: + interval_window = md.interval_window + self.counter = 0 + async for _ in datums: + self.counter += 1 + msg = ( + f"counter:{self.counter} interval_window_start:{interval_window.start} " + f"interval_window_end:{interval_window.end}" + ) + return Messages(Message(str.encode(msg), keys=keys)) + + async def reduce_handler(keys: list[str], + datums: AsyncIterable[Datum], + md: Metadata) -> Messages: + interval_window = md.interval_window + counter = 0 + async for _ in datums: + counter += 1 + msg = ( + f"counter:{counter} interval_window_start:{interval_window.start} " + f"interval_window_end:{interval_window.end}" + ) + return Messages(Message(str.encode(msg), keys=keys)) + + if __name__ == "__main__": + invoke = os.getenv("INVOKE", "func_handler") + if invoke == "class": + # Here we are using the class instance as the reducer_instance + # which will be used to invoke the handler function. + # We are passing the init_args for the class instance. + grpc_server = ReduceAsyncServer(ReduceCounter, init_args=(0,)) + else: + # Here we are using the handler function directly as the reducer_instance. + grpc_server = ReduceAsyncServer(reduce_handler) + grpc_server.start() + """ def __init__( diff --git a/pynumaflow/sideinput/server.py b/pynumaflow/sideinput/server.py index 121bf496..ea6685e0 100644 --- a/pynumaflow/sideinput/server.py +++ b/pynumaflow/sideinput/server.py @@ -14,7 +14,39 @@ class SideInputServer(NumaflowServer): - """Server for side input""" + """ + Class for a new Side Input Server instance. 
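+
+    The side input instance's retrieve_handler is invoked repeatedly by the
+    platform; its Response decides whether a new side input value is broadcast
+    to the vertices that use it.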
+ Args: + side_input_instance: The side input instance to be used for Side Input UDF + sock_path: The UNIX socket path to be used for the server + max_message_size: The max message size in bytes the server can receive and send + max_threads: The max number of threads to be spawned; + + Example invocation: + import datetime + from pynumaflow.sideinput import Response, SideInputServer, SideInput + + class ExampleSideInput(SideInput): + def __init__(self): + self.counter = 0 + + def retrieve_handler(self) -> Response: + time_now = datetime.datetime.now() + # val is the value to be broadcasted + val = f"an example: {str(time_now)}" + self.counter += 1 + # broadcast every other time + if self.counter % 2 == 0: + # no_broadcast_message() is used to indicate that there is no broadcast + return Response.no_broadcast_message() + # broadcast_message() is used to indicate that there is a broadcast + return Response.broadcast_message(val.encode("utf-8")) + + if __name__ == "__main__": + grpc_server = SideInputServer(ExampleSideInput()) + grpc_server.start() + + """ def __init__( self, diff --git a/pynumaflow/sinker/_dtypes.py b/pynumaflow/sinker/_dtypes.py index a767e053..5f6c5b12 100644 --- a/pynumaflow/sinker/_dtypes.py +++ b/pynumaflow/sinker/_dtypes.py @@ -180,7 +180,7 @@ def __call__(self, *args, **kwargs): @abstractmethod def handler(self, datums: Iterator[Datum]) -> Responses: """ - Write a handler function which implements the SinkCallable interface. + Implement this handler function which implements the SinkCallable interface. """ pass diff --git a/pynumaflow/sinker/async_server.py b/pynumaflow/sinker/async_server.py index 6442db16..6cb0eabc 100644 --- a/pynumaflow/sinker/async_server.py +++ b/pynumaflow/sinker/async_server.py @@ -19,7 +19,50 @@ class SinkAsyncServer(NumaflowServer): """ - SinkServer is the main class to start a gRPC server for a sinker. + SinkAsyncServer is the main class to start a gRPC server for a sinker. + Create a new grpc Async Sink Server instance. + A new servicer instance is created and attached to the server. + The server instance is returned. 
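+
+    The sink handler receives a stream of datums and returns a Response for each
+    message id, marking that message as successfully processed or as failed.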
+ Args: + sinker_instance: The sinker instance to be used for Sink UDF + sock_path: The UNIX socket path to be used for the server + max_message_size: The max message size in bytes the server can receive and send + max_threads: The max number of threads to be spawned; + defaults to number of processors x4 + + Example invocation: + import os + from collections.abc import AsyncIterable + from pynumaflow.sinker import Datum, Responses, Response, Sinker + from pynumaflow.sinker import SinkAsyncServer + from pynumaflow._constants import _LOGGER + + + class UserDefinedSink(Sinker): + async def handler(self, datums: AsyncIterable[Datum]) -> Responses: + responses = Responses() + async for msg in datums: + _LOGGER.info("User Defined Sink %s", msg.value.decode("utf-8")) + responses.append(Response.as_success(msg.id)) + return responses + + + async def udsink_handler(datums: AsyncIterable[Datum]) -> Responses: + responses = Responses() + async for msg in datums: + _LOGGER.info("User Defined Sink %s", msg.value.decode("utf-8")) + responses.append(Response.as_success(msg.id)) + return responses + + + if __name__ == "__main__": + invoke = os.getenv("INVOKE", "func_handler") + if invoke == "class": + sink_handler = UserDefinedSink() + else: + sink_handler = udsink_handler + grpc_server = SinkAsyncServer(sink_handler) + grpc_server.start() """ def __init__( @@ -29,17 +72,6 @@ def __init__( max_message_size=MAX_MESSAGE_SIZE, max_threads=MAX_THREADS, ): - """ - Create a new grpc Sink Server instance. - A new servicer instance is created and attached to the server. - The server instance is returned. - Args: - sinker_instance: The sinker instance to be used for Sink UDF - sock_path: The UNIX socket path to be used for the server - max_message_size: The max message size in bytes the server can receive and send - max_threads: The max number of threads to be spawned; - defaults to number of processors x4 - """ self.sock_path = f"unix://{sock_path}" self.max_threads = min(max_threads, int(os.getenv("MAX_THREADS", "4"))) self.max_message_size = max_message_size diff --git a/pynumaflow/sinker/server.py b/pynumaflow/sinker/server.py index 0cd4a5e8..8c95a861 100644 --- a/pynumaflow/sinker/server.py +++ b/pynumaflow/sinker/server.py @@ -37,6 +37,38 @@ def __init__( max_message_size: The max message size in bytes the server can receive and send max_threads: The max number of threads to be spawned; defaults to number of processors x4 + Example invocation: + import os + from collections.abc import Iterator + + from pynumaflow.sinker import Datum, Responses, Response, SinkServer + from pynumaflow.sinker import Sinker + from pynumaflow._constants import _LOGGER + + class UserDefinedSink(Sinker): + def handler(self, datums: Iterator[Datum]) -> Responses: + responses = Responses() + for msg in datums: + _LOGGER.info("User Defined Sink %s", msg.value.decode("utf-8")) + responses.append(Response.as_success(msg.id)) + return responses + + def udsink_handler(datums: Iterator[Datum]) -> Responses: + responses = Responses() + for msg in datums: + _LOGGER.info("User Defined Sink %s", msg.value.decode("utf-8")) + responses.append(Response.as_success(msg.id)) + return responses + + if __name__ == "__main__": + invoke = os.getenv("INVOKE", "func_handler") + if invoke == "class": + sink_handler = UserDefinedSink() + else: + sink_handler = udsink_handler + grpc_server = SinkServer(sink_handler) + grpc_server.start() + """ self.sock_path = f"unix://{sock_path}" self.max_threads = min(max_threads, int(os.getenv("MAX_THREADS", "4"))) diff 
--git a/pynumaflow/sourcer/_dtypes.py b/pynumaflow/sourcer/_dtypes.py index f26e9a40..9fd0e910 100644 --- a/pynumaflow/sourcer/_dtypes.py +++ b/pynumaflow/sourcer/_dtypes.py @@ -222,7 +222,7 @@ def __call__(self, *args, **kwargs): @abstractmethod def read_handler(self, datum: ReadRequest) -> Iterable[Message]: """ - Write a handler function which implements the SourceReadCallable interface. + Implement this handler function which implements the SourceReadCallable interface. read_handler is used to read the data from the source and send the data forward for each read request we process num_records and increment the read_idx to indicate that the message has been read and the same is added to the ack set diff --git a/pynumaflow/sourcer/async_server.py b/pynumaflow/sourcer/async_server.py index 8e8f2dd1..98c026f6 100644 --- a/pynumaflow/sourcer/async_server.py +++ b/pynumaflow/sourcer/async_server.py @@ -28,15 +28,77 @@ def __init__( max_threads=MAX_THREADS, ): """ - Create a new grpc Source Server instance. + Create a new grpc Async Source Server instance. A new servicer instance is created and attached to the server. The server instance is returned. Args: - sourcer_instance: The sourcer instance to be used for Source UDF - sock_path: The UNIX socket path to be used for the server - max_message_size: The max message size in bytes the server can receive and send - max_threads: The max number of threads to be spawned; - defaults to number of processors x4 + sourcer_instance: The sourcer instance to be used for Source UDF + sock_path: The UNIX socket path to be used for the server + max_message_size: The max message size in bytes the server can receive and send + max_threads: The max number of threads to be spawned; + defaults to number of processors x4 + + Example invocation: + from collections.abc import AsyncIterable + from datetime import datetime + from pynumaflow.sourcer import ( + ReadRequest, + Message, + AckRequest, + PendingResponse, + Offset, + PartitionsResponse, + get_default_partitions, + Sourcer, + SourceAsyncServer, + ) + + class AsyncSource(Sourcer): + # AsyncSource is a class for User Defined Source implementation. + + def __init__(self): + # to_ack_set: Set to maintain a track of the offsets yet to be acknowledged + # read_idx : the offset idx till where the messages have been read + self.to_ack_set = set() + self.read_idx = 0 + + async def read_handler(self, datum: ReadRequest) -> AsyncIterable[Message]: + # read_handler is used to read the data from the source and send + # the data forward + # for each read request we process num_records and increment + # the read_idx to indicate that + # the message has been read and the same is added to the ack set + if self.to_ack_set: + return + + for x in range(datum.num_records): + yield Message( + payload=str(self.read_idx).encode(), + offset=Offset.offset_with_default_partition_id(str(self.read_idx).encode()), + event_time=datetime.now(), + ) + self.to_ack_set.add(str(self.read_idx)) + self.read_idx += 1 + + async def ack_handler(self, ack_request: AckRequest): + # The ack handler is used acknowledge the offsets that have been read, + # and remove them from the to_ack_set + for offset in ack_request.offset: + self.to_ack_set.remove(str(offset.offset, "utf-8")) + + async def pending_handler(self) -> PendingResponse: + # The simple source always returns zero to indicate there is no pending record. 
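+                    # In a real source, this would report the number of records
+                    # that are yet to be read from the source.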
+                return PendingResponse(count=0)
+
+            async def partitions_handler(self) -> PartitionsResponse:
+                # The simple source always returns default partitions.
+                return PartitionsResponse(partitions=get_default_partitions())
+
+        if __name__ == "__main__":
+            ud_source = AsyncSource()
+            grpc_server = SourceAsyncServer(ud_source)
+            grpc_server.start()
+        """
         self.sock_path = f"unix://{sock_path}"
         self.max_threads = min(max_threads, int(os.getenv("MAX_THREADS", "4")))
@@ -62,7 +124,6 @@ async def aexec(self):
         """
         Starts the Async gRPC server on the given UNIX socket with given max threads
         """
-
         # As the server is async, we need to create a new server instance in the
         # same thread as the event loop so that all the async calls are made in the
         # same context
diff --git a/pynumaflow/sourcer/server.py b/pynumaflow/sourcer/server.py
index 2a1bbc8e..6177045f 100644
--- a/pynumaflow/sourcer/server.py
+++ b/pynumaflow/sourcer/server.py
@@ -29,11 +29,74 @@ def __init__(
         A new servicer instance is created and attached to the server.
         The server instance is returned.
         Args:
-        sourcer_instance: The sourcer instance to be used for Source UDF
-        sock_path: The UNIX socket path to be used for the server
-        max_message_size: The max message size in bytes the server can receive and send
-        max_threads: The max number of threads to be spawned;
-        defaults to number of processors x4
+            sourcer_instance: The sourcer instance to be used for Source UDF
+            sock_path: The UNIX socket path to be used for the server
+            max_message_size: The max message size in bytes the server can receive and send
+            max_threads: The max number of threads to be spawned;
+                defaults to number of processors x4
+
+        Example invocation:
+        from collections.abc import Iterable
+        from datetime import datetime
+
+        from pynumaflow.sourcer import (
+            ReadRequest,
+            Message,
+            AckRequest,
+            PendingResponse,
+            Offset,
+            PartitionsResponse,
+            get_default_partitions,
+            Sourcer,
+            SourceServer,
+        )
+
+        class SimpleSource(Sourcer):
+            # SimpleSource is a class for User Defined Source implementation.
+
+            def __init__(self):
+                # to_ack_set: Set to keep track of the offsets yet to be acknowledged
+                # read_idx: the offset index up to which the messages have been read
+                self.to_ack_set = set()
+                self.read_idx = 0
+
+            def read_handler(self, datum: ReadRequest) -> Iterable[Message]:
+                # read_handler is used to read the data from the source and
+                # send the data forward
+                # for each read request we process num_records and increment the
+                # read_idx to indicate that
+                # the message has been read and the same is added to the ack set
+                if self.to_ack_set:
+                    return
+
+                for x in range(datum.num_records):
+                    yield Message(
+                        payload=str(self.read_idx).encode(),
+                        offset=Offset.offset_with_default_partition_id(str(self.read_idx).encode()),
+                        event_time=datetime.now(),
+                    )
+                    self.to_ack_set.add(str(self.read_idx))
+                    self.read_idx += 1
+
+            def ack_handler(self, ack_request: AckRequest):
+                # The ack handler is used to acknowledge the offsets that have been
+                # read, and remove them
+                # from the to_ack_set
+                for offset in ack_request.offset:
+                    self.to_ack_set.remove(str(offset.offset, "utf-8"))
+
+            def pending_handler(self) -> PendingResponse:
+                # The simple source always returns zero to indicate there is no pending record.
+                return PendingResponse(count=0)
+
+            def partitions_handler(self) -> PartitionsResponse:
+                # The simple source always returns default partitions.
+                return PartitionsResponse(partitions=get_default_partitions())
+
+        if __name__ == "__main__":
+            ud_source = SimpleSource()
+            grpc_server = SourceServer(ud_source)
+            grpc_server.start()
         """
         self.sock_path = f"unix://{sock_path}"
         self.max_threads = min(max_threads, int(os.getenv("MAX_THREADS", "4")))
diff --git a/pynumaflow/sourcetransformer/_dtypes.py b/pynumaflow/sourcetransformer/_dtypes.py
index ad0d5426..66e6978c 100644
--- a/pynumaflow/sourcetransformer/_dtypes.py
+++ b/pynumaflow/sourcetransformer/_dtypes.py
@@ -189,7 +189,7 @@ def __call__(self, *args, **kwargs):
     @abstractmethod
     def handler(self, keys: list[str], datum: Datum) -> Messages:
         """
-        Write a handler function which implements the
+        Implement this handler function to satisfy the
         SourceTransformCallable interface.
         """
         pass
diff --git a/pynumaflow/sourcetransformer/multiproc_server.py b/pynumaflow/sourcetransformer/multiproc_server.py
index 23d7101b..ace9aa1d 100644
--- a/pynumaflow/sourcetransformer/multiproc_server.py
+++ b/pynumaflow/sourcetransformer/multiproc_server.py
@@ -35,12 +35,64 @@ def __init__(
         A new servicer instance is created and attached to the server.
         The server instance is returned.
         Args:
-        source_transform_instance: The source transformer instance to be used for
-        Source Transformer UDF
-        sock_path: The UNIX socket path to be used for the server
-        max_message_size: The max message size in bytes the server can receive and send
-        max_threads: The max number of threads to be spawned;
-        defaults to number of processors x4
+            source_transform_instance: The source transformer instance to be used for
+                Source Transformer UDF
+            sock_path: The UNIX socket path to be used for the server
+            max_message_size: The max message size in bytes the server can receive and send
+            max_threads: The max number of threads to be spawned;
+                defaults to number of processors x4
+
+        Example invocation:
+        import datetime
+        import logging
+
+        from pynumaflow.sourcetransformer import Messages, Message, Datum, SourceTransformServer
+
+        # This is a simple User Defined Function example which receives a message,
+        # applies the following
+        # data transformation, and returns the message.
+        # If the message event time is before year 2022, drop the message
+        # with event time unchanged.
+        # If it's within year 2022, update the tag to "within_year_2022" and
+        # update the message event time to Jan 1st 2022.
+        # Otherwise (exclusively after year 2022), update the tag to
+        # "after_year_2022" and update the
+        # message event time to Jan 1st 2023.
+
+
+        january_first_2022 = datetime.datetime.fromtimestamp(1640995200)
+        january_first_2023 = datetime.datetime.fromtimestamp(1672531200)
+
+
+        def my_handler(keys: list[str], datum: Datum) -> Messages:
+            val = datum.value
+            event_time = datum.event_time
+            messages = Messages()
+
+            if event_time < january_first_2022:
+                logging.info("Got event time:%s, it is before 2022, so dropping", event_time)
+                messages.append(Message.to_drop(event_time))
+            elif event_time < january_first_2023:
+                logging.info(
+                    "Got event time:%s, it is within year 2022, so forwarding to within_year_2022",
+                    event_time,
+                )
+                messages.append(
+                    Message(value=val, event_time=january_first_2022, tags=["within_year_2022"])
+                )
+            else:
+                logging.info(
+                    "Got event time:%s, it is after year 2022, so forwarding to after_year_2022",
+                    event_time,
+                )
+                messages.append(
+                    Message(value=val, event_time=january_first_2023, tags=["after_year_2022"])
+                )
+
+            return messages
+
+        if __name__ == "__main__":
+            grpc_server = SourceTransformServer(my_handler)
+            grpc_server.start()
         """
         self.sock_path = f"unix://{sock_path}"
         self.max_threads = min(max_threads, int(os.getenv("MAX_THREADS", "4")))
diff --git a/pynumaflow/sourcetransformer/server.py b/pynumaflow/sourcetransformer/server.py
index 0f248eb2..a2dd6c57 100644
--- a/pynumaflow/sourcetransformer/server.py
+++ b/pynumaflow/sourcetransformer/server.py
@@ -30,12 +30,64 @@ def __init__(
         A new servicer instance is created and attached to the server.
         The server instance is returned.
         Args:
-        source_transform_instance: The source transformer instance to be used for
-        Source Transformer UDF
-        sock_path: The UNIX socket path to be used for the server
-        max_message_size: The max message size in bytes the server can receive and send
-        max_threads: The max number of threads to be spawned;
-        defaults to number of processors x4
+            source_transform_instance: The source transformer instance to be used for
+                Source Transformer UDF
+            sock_path: The UNIX socket path to be used for the server
+            max_message_size: The max message size in bytes the server can receive and send
+            max_threads: The max number of threads to be spawned;
+                defaults to number of processors x4
+
+        Example invocation:
+
+        import datetime
+        import logging
+
+        from pynumaflow.sourcetransformer import Messages, Message, Datum, SourceTransformServer
+        # This is a simple User Defined Function example which receives a message,
+        # applies the following
+        # data transformation, and returns the message.
+        # If the message event time is before year 2022, drop the message with event time unchanged.
+        # If it's within year 2022, update the tag to "within_year_2022" and
+        # update the message event time to Jan 1st 2022.
+        # Otherwise (exclusively after year 2022), update the tag to
+        # "after_year_2022" and update the
+        # message event time to Jan 1st 2023.
+
+        january_first_2022 = datetime.datetime.fromtimestamp(1640995200)
+        january_first_2023 = datetime.datetime.fromtimestamp(1672531200)
+
+
+        def my_handler(keys: list[str], datum: Datum) -> Messages:
+            val = datum.value
+            event_time = datum.event_time
+            messages = Messages()
+
+            if event_time < january_first_2022:
+                logging.info("Got event time:%s, it is before 2022, so dropping", event_time)
+                messages.append(Message.to_drop(event_time))
+            elif event_time < january_first_2023:
+                logging.info(
+                    "Got event time:%s, it is within year 2022, so forwarding to within_year_2022",
+                    event_time,
+                )
+                messages.append(
+                    Message(value=val, event_time=january_first_2022, tags=["within_year_2022"])
+                )
+            else:
+                logging.info(
+                    "Got event time:%s, it is after year 2022, so forwarding to after_year_2022",
+                    event_time,
+                )
+                messages.append(
+                    Message(value=val, event_time=january_first_2023, tags=["after_year_2022"])
+                )
+
+            return messages
+
+
+        if __name__ == "__main__":
+            grpc_server = SourceTransformServer(my_handler)
+            grpc_server.start()
         """
         self.sock_path = f"unix://{sock_path}"
         self.max_threads = min(max_threads, int(os.getenv("MAX_THREADS", "4")))

From d8ce1ceda9f82ccf6c5ecec26015cb812d0676bd Mon Sep 17 00:00:00 2001
From: Sidhant Kohli
Date: Tue, 23 Jan 2024 18:00:40 -0800
Subject: [PATCH 77/78] comments

Signed-off-by: Sidhant Kohli
---
 examples/reduce/counter/Makefile              | 2 +-
 examples/reduce/counter/pipeline.yaml         | 2 +-
 examples/reduce/counter/pyproject.toml        | 3 +--
 examples/sink/async_log/example.py            | 7 ++++---
 examples/sink/log/example.py                  | 6 ++++--
 pynumaflow/reducer/_dtypes.py                 | 2 +-
 pynumaflow/reducer/async_server.py            | 4 ++--
 pynumaflow/reducer/servicer/async_servicer.py | 8 ++++----
 8 files changed, 18 insertions(+), 16 deletions(-)

diff --git a/examples/reduce/counter/Makefile b/examples/reduce/counter/Makefile
index cbdb5d33..f787dd3a 100644
--- a/examples/reduce/counter/Makefile
+++ b/examples/reduce/counter/Makefile
@@ -1,6 +1,6 @@
 .PHONY: image
 image:
-	docker build --no-cache -t "quay.io/kohlisid/numaflow-python/reduce:refactor1" .
+	docker build --no-cache -t "quay.io/numaio/numaflow-python/reduce-counter:v0.7.0" .

 # Github CI runner uses platform linux/amd64. If your local environment don't, the image built by command above might not work
 # under the CI E2E test environment.
# To build an image that supports multiple platforms(linux/amd64,linux/arm64) and push to quay.io, use the following command diff --git a/examples/reduce/counter/pipeline.yaml b/examples/reduce/counter/pipeline.yaml index 10d53704..1b7d6b34 100644 --- a/examples/reduce/counter/pipeline.yaml +++ b/examples/reduce/counter/pipeline.yaml @@ -18,7 +18,7 @@ spec: udf: container: # compute the sum - image: quay.io/kohlisid/numaflow-python/reduce:refactor1 + image: quay.io/numaio/numaflow-python/reduce-counter:latest imagePullPolicy: Always env: - name: PYTHONDEBUG diff --git a/examples/reduce/counter/pyproject.toml b/examples/reduce/counter/pyproject.toml index 345ec1d3..dc3cc41d 100644 --- a/examples/reduce/counter/pyproject.toml +++ b/examples/reduce/counter/pyproject.toml @@ -6,8 +6,7 @@ authors = ["Numaflow developers"] [tool.poetry.dependencies] python = "~3.10" -#pynumaflow = "~0.7.0" -pynumaflow = {git = "https://github.com/kohlisid/numaflow-python", rev = "class-refactor"} +pynumaflow = "~0.7.0" [tool.poetry.dev-dependencies] diff --git a/examples/sink/async_log/example.py b/examples/sink/async_log/example.py index d7968115..7e338c3e 100644 --- a/examples/sink/async_log/example.py +++ b/examples/sink/async_log/example.py @@ -1,10 +1,11 @@ import os from collections.abc import AsyncIterable - - from pynumaflow.sinker import Datum, Responses, Response, Sinker from pynumaflow.sinker import SinkAsyncServer -from pynumaflow._constants import _LOGGER +import logging + +logging.basicConfig(level=logging.DEBUG) +_LOGGER = logging.getLogger(__name__) class UserDefinedSink(Sinker): diff --git a/examples/sink/log/example.py b/examples/sink/log/example.py index 391529ad..2c960139 100644 --- a/examples/sink/log/example.py +++ b/examples/sink/log/example.py @@ -1,9 +1,11 @@ import os from collections.abc import Iterator - from pynumaflow.sinker import Datum, Responses, Response, SinkServer from pynumaflow.sinker import Sinker -from pynumaflow._constants import _LOGGER +import logging + +logging.basicConfig(level=logging.DEBUG) +_LOGGER = logging.getLogger(__name__) class UserDefinedSink(Sinker): diff --git a/pynumaflow/reducer/_dtypes.py b/pynumaflow/reducer/_dtypes.py index 8217a985..bc881e33 100644 --- a/pynumaflow/reducer/_dtypes.py +++ b/pynumaflow/reducer/_dtypes.py @@ -259,7 +259,7 @@ async def handler( pass -class ReduceBuilderClass: +class _ReduceBuilderClass: """ Class to build a Reducer class instance. 
Used Internally diff --git a/pynumaflow/reducer/async_server.py b/pynumaflow/reducer/async_server.py index 6e074f9f..e18ad3c3 100644 --- a/pynumaflow/reducer/async_server.py +++ b/pynumaflow/reducer/async_server.py @@ -16,7 +16,7 @@ from pynumaflow.reducer._dtypes import ( ReduceCallable, - ReduceBuilderClass, + _ReduceBuilderClass, Reducer, ) @@ -37,7 +37,7 @@ def get_handler(reducer_handler: ReduceCallable, init_args: tuple = (), init_kwa elif issubclass(reducer_handler, Reducer): # if handler is type of Class Reducer, create a new instance of # a ReducerBuilderClass - return ReduceBuilderClass(reducer_handler, init_args, init_kwargs) + return _ReduceBuilderClass(reducer_handler, init_args, init_kwargs) else: raise TypeError("Invalid type passed") diff --git a/pynumaflow/reducer/servicer/async_servicer.py b/pynumaflow/reducer/servicer/async_servicer.py index d80d2752..b8f0aef9 100644 --- a/pynumaflow/reducer/servicer/async_servicer.py +++ b/pynumaflow/reducer/servicer/async_servicer.py @@ -18,7 +18,7 @@ IntervalWindow, Metadata, ReduceAsyncCallable, - ReduceBuilderClass, + _ReduceBuilderClass, ) from pynumaflow.reducer._dtypes import ReduceResult from pynumaflow.reducer.servicer.asynciter import NonBlockingIterator @@ -49,14 +49,14 @@ class AsyncReduceServicer(reduce_pb2_grpc.ReduceServicer): def __init__( self, - handler: Union[ReduceAsyncCallable, ReduceBuilderClass], + handler: Union[ReduceAsyncCallable, _ReduceBuilderClass], ): # Collection for storing strong references to all running tasks. # Event loop only keeps a weak reference, which can cause it to # get lost during execution. self.background_tasks = set() # The reduce handler can be a function or a builder class instance. - self.__reduce_handler: Union[ReduceAsyncCallable, ReduceBuilderClass] = handler + self.__reduce_handler: Union[ReduceAsyncCallable, _ReduceBuilderClass] = handler async def ReduceFn( self, @@ -154,7 +154,7 @@ async def __invoke_reduce( # It is required for a new key to be processed by a # new instance of the reducer for a given window # Otherwise the function handler can be called directly - if isinstance(self.__reduce_handler, ReduceBuilderClass): + if isinstance(self.__reduce_handler, _ReduceBuilderClass): new_instance = self.__reduce_handler.create() try: msgs = await new_instance(keys, request_iterator, md) From 59d65508896eee84d612144c18616d07eda9002e Mon Sep 17 00:00:00 2001 From: Sidhant Kohli Date: Wed, 24 Jan 2024 11:55:48 -0800 Subject: [PATCH 78/78] comments Signed-off-by: Sidhant Kohli --- examples/reduce/counter/Makefile | 2 +- pynumaflow/reducer/async_server.py | 7 ++++--- pynumaflow/shared/server.py | 14 ++++++++++++++ tests/reduce/test_async_reduce.py | 10 ++++++++-- 4 files changed, 27 insertions(+), 6 deletions(-) diff --git a/examples/reduce/counter/Makefile b/examples/reduce/counter/Makefile index f787dd3a..363debad 100644 --- a/examples/reduce/counter/Makefile +++ b/examples/reduce/counter/Makefile @@ -1,6 +1,6 @@ .PHONY: image image: - docker build --no-cache -t "quay.io/numaio/numaflow-python/reduce-counter:v0.7.0" . + docker build -t "quay.io/numaio/numaflow-python/reduce-counter:v0.7.0" . # Github CI runner uses platform linux/amd64. If your local environment don't, the image built by command above might not work # under the CI E2E test environment. 
# To build an image that supports multiple platforms(linux/amd64,linux/arm64) and push to quay.io, use the following command
diff --git a/pynumaflow/reducer/async_server.py b/pynumaflow/reducer/async_server.py
index e18ad3c3..a42d7ee7 100644
--- a/pynumaflow/reducer/async_server.py
+++ b/pynumaflow/reducer/async_server.py
@@ -20,7 +20,7 @@
     Reducer,
 )

-from pynumaflow.shared.server import NumaflowServer, start_async_server
+from pynumaflow.shared.server import NumaflowServer, check_instance, start_async_server


 def get_handler(reducer_handler: ReduceCallable, init_args: tuple = (), init_kwargs: dict = None):
@@ -34,12 +34,13 @@ def get_handler(reducer_handler: ReduceCallable, init_args: tuple = (), init_kwa
             raise TypeError("Cannot pass function handler with init args or kwargs")
         # return the function handler
         return reducer_handler
-    elif issubclass(reducer_handler, Reducer):
+    elif not check_instance(reducer_handler, Reducer) and issubclass(reducer_handler, Reducer):
         # if handler is type of Class Reducer, create a new instance of
         # a ReducerBuilderClass
         return _ReduceBuilderClass(reducer_handler, init_args, init_kwargs)
     else:
-        raise TypeError("Invalid type passed")
+        _LOGGER.error("Invalid Type: please provide the handler or the class name")
+        raise TypeError("Invalid Type: please provide the handler or the class name")


 class ReduceAsyncServer(NumaflowServer):
diff --git a/pynumaflow/shared/server.py b/pynumaflow/shared/server.py
index 853cd4da..d58af987 100644
--- a/pynumaflow/shared/server.py
+++ b/pynumaflow/shared/server.py
@@ -235,3 +235,17 @@ def _reserve_port(port_num: int) -> Iterator[int]:
         yield sock.getsockname()[1]
     finally:
         sock.close()
+
+
+def check_instance(instance, callable_type) -> bool:
+    """
+    Check whether the given instance is an instance of the given callable_type.
+    """
+    try:
+        # isinstance can raise TypeError if callable_type is not a type;
+        # treat any failure as "not an instance"
+        return isinstance(instance, callable_type)
+    except Exception as e:
+        _LOGGER.error(e)
+        return False
diff --git a/tests/reduce/test_async_reduce.py b/tests/reduce/test_async_reduce.py
index e585c95a..c65f69ac 100644
--- a/tests/reduce/test_async_reduce.py
+++ b/tests/reduce/test_async_reduce.py
@@ -79,7 +79,9 @@ async def handler(
     return Messages(Message(str.encode(msg), keys=keys))


-async def err_handler(keys: list[str], datums: AsyncIterable[Datum], md: Metadata) -> Messages:
+async def reduce_handler_func(
+    keys: list[str], datums: AsyncIterable[Datum], md: Metadata
+) -> Messages:
     interval_window = md.interval_window
     counter = 0
     async for _ in datums:
@@ -238,7 +240,11 @@ def test_error_init(self):
         # Check that the init_args and init_kwargs are passed
         # only with a Reducer class
         with self.assertRaises(TypeError):
-            ReduceAsyncServer(err_handler, init_args=(0, 1))
+            ReduceAsyncServer(reduce_handler_func, init_args=(0, 1))
+        # Check that passing an already-constructed instance instead of
+        # the class itself raises a TypeError
+        with self.assertRaises(TypeError):
+            ReduceAsyncServer(ExampleClass(0))


 if __name__ == "__main__":
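
A minimal usage sketch of the handler contract that PATCH 78 enforces: a plain async function may be passed to ReduceAsyncServer as-is, while a Reducer must be passed as the class itself (optionally with init_args/init_kwargs) so a fresh instance can be built per key and window; an already-constructed instance is rejected by the new check_instance guard. The Counter class below is hypothetical, and the sketch assumes the public pynumaflow.reducer exports exercised in tests/reduce/test_async_reduce.py:

    from collections.abc import AsyncIterable

    from pynumaflow.reducer import Datum, Message, Messages, Metadata, Reducer, ReduceAsyncServer


    class Counter(Reducer):
        # A fresh Counter is constructed for every key/window combination.
        def __init__(self, start: int = 0):
            self.count = start

        async def handler(
            self, keys: list[str], datums: AsyncIterable[Datum], md: Metadata
        ) -> Messages:
            # Count the datums in this window and emit the total as one message.
            async for _ in datums:
                self.count += 1
            return Messages(Message(str(self.count).encode(), keys=keys))


    # Accepted: the Reducer subclass itself, plus constructor arguments.
    server = ReduceAsyncServer(Counter, init_args=(0,))

    # Rejected with a TypeError by the new check_instance guard:
    # server = ReduceAsyncServer(Counter(0))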