Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[DO NOT MERGE] JAX profiling on JetStream server #7

Closed
wants to merge 2 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
36 changes: 23 additions & 13 deletions jetstream/core/implementations/mock/server.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,9 +18,8 @@

from absl import app
from absl import flags

from jetstream.core.implementations.mock import config as mock_config
from jetstream.core import server_lib
from jetstream.core.implementations.mock import config as mock_config


# Command-line flag --port: the port the server listens on (defaults to 9000).
_PORT = flags.DEFINE_integer(
    name='port',
    default=9000,
    help='port to listen on',
)
Expand All @@ -30,19 +29,30 @@
'available servers',
)


def main(argv: Sequence[str]):
    """Start the mock JetStream server and block until it terminates.

    If a KeyboardInterrupt arrives, a notice is printed and ``stop()`` is
    called on the server, provided one had already been created.

    Args:
        argv: Command-line arguments handed over by ``absl.app``; unused.
    """
    del argv
    jetstream_server = None
    try:
        # No devices for local cpu test. A None for prefill and a None for
        # generate.
        devices = server_lib.get_devices()
        server_config = mock_config.get_server_config(_CONFIG.value)
        # We separate credential from run so that we can unit test it with
        # local credentials.
        # TODO: Add grpc credentials for OSS.
        jetstream_server = server_lib.run(
            port=_PORT.value,
            config=server_config,
            devices=devices,
        )
        jetstream_server.wait_for_termination()
    except KeyboardInterrupt:
        print('Stopping profiler and exiting...')
        print(
            'NOTE: DO NOT Interrupt again; the profiler is slowly collecting data'
            ' and exiting...'
        )
        # The interrupt may arrive before run() has returned a server handle.
        if jetstream_server is not None:
            jetstream_server.stop()


if __name__ == '__main__':
Expand Down
4 changes: 4 additions & 0 deletions jetstream/core/server_lib.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,13 +39,17 @@ def __init__(self, driver: orchestrator.Driver, server: grpc.Server):
self._server = server

def start(self, port, credentials) -> None:
    """Begin a JAX profiler trace, then bind the secure port and start the server.

    If binding the port or starting the server raises, the trace is ended
    before the exception propagates, so a later ``start_trace`` call is not
    blocked by a trace that was left running.

    Args:
        port: Port to listen on; combined with the module-level ``_HOST``.
        credentials: Server credentials passed to ``add_secure_port``.
    """
    # start jax profiler
    jax.profiler.start_trace("/tmp/tensorboard")
    try:
        self._server.add_secure_port(f'{_HOST}:{port}', credentials)
        self._server.start()
    except BaseException:
        # Do not leave a trace running when the server failed to come up.
        jax.profiler.stop_trace()
        raise

def stop(self) -> None:
    """Stop the driver and the gRPC server, then end the JAX profiler trace.

    The trace is ended in a ``finally`` clause so that it is still stopped
    when shutting down the driver or the server raises.
    """
    try:
        # Gracefully clean up threads in the orchestrator.
        self._driver.stop()
        self._server.stop(0)
    finally:
        # end jax profiler
        jax.profiler.stop_trace()

def wait_for_termination(self) -> None:
    """Delegate to the wrapped server's ``wait_for_termination()``."""
    grpc_server = self._server
    grpc_server.wait_for_termination()
Expand Down
Loading