Skip to content

Commit

Permalink
Dashboard (#68)
Browse files Browse the repository at this point in the history
* dashboard v0

* websocket logs

* logging

* update

* dockerfiles + docker-compose

* backend.default.svc.cluster.local:5001 (broken APIs)

* v0.5 dashboard

* dashboard 0.7

* dashboard v0.9

* DASHBOARD V1.0

* dashboard 1.01

* fixes v1

* fixes

* dashboard fixes minus socketio

* DASHBOARD V2

* dashboard logger

* small cleanup

* final?

* real final

* namespace route api fix?

* DASHBOARD FINAL
  • Loading branch information
ryanhayame authored Nov 11, 2024
1 parent 98247b1 commit 1fa2c4d
Show file tree
Hide file tree
Showing 45 changed files with 8,809 additions and 3 deletions.
57 changes: 57 additions & 0 deletions Dockerfile.backend
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
# Use Python 3.11 slim as the base image
FROM python:3.11-slim AS base

# Set environment variables for Python behavior
ENV PYTHONFAULTHANDLER=1 \
    PYTHONHASHSEED=random \
    PYTHONUNBUFFERED=1

# Set the working directory inside the container
WORKDIR /app

# Builder stage: Install dependencies and build the backend package
FROM base AS builder

# Set environment variables for pip and Poetry
ENV PIP_DEFAULT_TIMEOUT=100 \
    PIP_DISABLE_PIP_VERSION_CHECK=1 \
    PIP_NO_CACHE_DIR=1 \
    POETRY_VERSION=1.3.1

# Install Poetry
RUN pip install "poetry==$POETRY_VERSION"

# Copy only the dependency manifests first, so the dependency layer below is
# rebuilt only when pyproject.toml or poetry.lock change (not on every source edit)
COPY pyproject.toml poetry.lock ./

# Configure Poetry and install dependencies only for the dashboard group.
# --no-root skips installing the project itself, so the source is not needed yet.
RUN poetry config virtualenvs.in-project true && \
    poetry install --with dashboard --no-root

# Copy the entire konduktor directory to the container (after the dependency
# install so that source changes do not invalidate the cached dependency layer)
COPY konduktor ./konduktor

# List the contents of the konduktor directory to verify the copy
RUN ls -la ./konduktor

# Final stage for production
FROM base AS final

# Set the working directory
WORKDIR /app

# Copy the virtual environment from the builder stage
COPY --from=builder /app/.venv ./.venv

# Copy the konduktor directory from the builder stage
COPY --from=builder /app/konduktor ./konduktor

# Copy the startup script
COPY startup.sh /app/startup.sh
RUN chmod +x /app/startup.sh

# Expose the port the app runs on
EXPOSE 5001

# Set the startup command
CMD ["/app/startup.sh"]
23 changes: 23 additions & 0 deletions Dockerfile.frontend
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
# Use the official Node.js 18 slim image
FROM node:18-slim

# Set the working directory
WORKDIR /app

# Copy package.json and package-lock.json from the /frontend folder.
# Copied before the rest of the source so the install layer below can be
# reused from cache when only application code changes.
COPY konduktor/dashboard/frontend/package*.json ./

# Install dependencies
# NOTE(review): if package-lock.json is committed, `npm ci` would give a
# reproducible install from the lockfile — confirm before switching.
RUN npm install

# Copy the entire frontend source code
COPY konduktor/dashboard/frontend/ .

# Build the frontend for production
RUN npm run build

# Expose the frontend port
EXPOSE 5173

# Start the frontend app (runs the "start" script defined in package.json,
# which is not shown here)
CMD ["npm", "run", "start"]
35 changes: 34 additions & 1 deletion docs/source/admin/observability.rst
Original file line number Diff line number Diff line change
Expand Up @@ -75,4 +75,37 @@ As well as (S)Xid errors by following :code:`dmesg` on each node. You can also p
.. figure:: ../images/otel-loki.png
:width: 120%
:align: center
:alt: dashboard
:alt: dashboard

Dashboard
------------

Konduktor includes a user-friendly dashboard, served on localhost, for managing the workloads in a cluster from one place.

Features include:

- Grafana konduktor dashboard
- Loki logs (search + filtering by namespace)
- Table to view, delete, and modify priority of workloads in queue

To open the dashboard, run this inside the root konduktor directory:

.. code-block:: console

   $ ./start_dashboard.sh

If you run into a permission error, make the script executable first:

.. code-block:: console

   $ chmod +x start_dashboard.sh && ./start_dashboard.sh

.. figure:: ../images/dashboard-logs.png
   :width: 120%
   :align: center
   :alt: dashboard-logs

.. figure:: ../images/dashboard-jobs.png
   :width: 120%
   :align: center
   :alt: dashboard-jobs
Binary file added docs/source/images/dashboard-jobs.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added docs/source/images/dashboard-logs.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
4 changes: 2 additions & 2 deletions format.sh
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
#!/usr/bin/env bash
set -eo pipefail

RUFF_VERSION=$(ruff --version | head -n 1 | awk '{print $2}')
MYPY_VERSION=$(mypy --version | awk '{print $2}')
RUFF_VERSION=$(poetry run ruff --version | head -n 1 | awk '{print $2}')
MYPY_VERSION=$(poetry run mypy --version | awk '{print $2}')

echo "ruff ver $RUFF_VERSION"
echo "mypy ver $MYPY_VERSION"
Expand Down
30 changes: 30 additions & 0 deletions konduktor/dashboard/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
### Prerequisite: `kubectl` is configured to reach the remote machine/cluster

# OPTION 1 (Automated Setup)

To open the dashboard, run this inside the root konduktor directory:
```
./start_dashboard.sh
```

# OPTION 2 (Manual Setup)

## 1. Apply kubernetes manifest
From the manifests directory (the one containing `dashboard_deployment.yaml`), run:
```
kubectl apply -f dashboard_deployment.yaml
```
Then wait a minute or two for the pods to finish starting up.

## 2. Port forward frontend in a terminal
```
kubectl port-forward svc/frontend 5173:5173 -n konduktor-dashboard
```

## 3. Port forward grafana in a terminal
```
kubectl port-forward svc/kube-prometheus-stack-grafana 3000:80 -n prometheus
```

## 4. Open dashboard at http://localhost:5173/

169 changes: 169 additions & 0 deletions konduktor/dashboard/backend/main.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,169 @@
from typing import Any, Dict, List

import socketio
from fastapi import FastAPI, Request
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import JSONResponse
from kubernetes import client
from kubernetes.client.exceptions import ApiException

from konduktor import logging as konduktor_logging
from konduktor.kube_client import batch_api, core_api, crd_api

from .sockets import socketio as sio

# Module-level logger obtained from konduktor's logging helper.
logger = konduktor_logging.get_logger2(__name__)

# FastAPI app; the route decorators below register handlers on this object.
app = FastAPI()


# CORS Configuration
# NOTE(review): wildcard origins together with allow_credentials=True is a very
# permissive policy — confirm this is only intended for the local dashboard.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],  # Allow all origins
    allow_credentials=True,
    allow_methods=["*"],  # Allow all methods
    allow_headers=["*"],  # Allow all headers
)

# Use Kubernetes API clients
# Initialize BatchV1 and CoreV1 API (native kubernetes).
# These are created once at import time and shared by every request handler.
# NOTE(review): batch_client is not referenced by any handler visible in this
# section of the file — confirm whether it is still needed.
batch_client = batch_api()
core_client = core_api()
# Initialize Kueue API (custom-object client used for kueue.x-k8s.io workloads)
crd_client = crd_api()


@app.get("/")
async def home():
    """Serve the root route with a fixed JSON body naming the route."""
    payload = {"home": "/"}
    return JSONResponse(payload)


@app.delete("/deleteJob")
async def delete_job(request: Request):
    """Delete the Kueue Workload identified by the JSON request body.

    The body is read as JSON and may contain:
        name: name of the Workload to delete (required).
        namespace: namespace of the Workload; defaults to "default".

    Returns:
        ``{"success": True, "status": 200}`` when the delete call succeeds,
        ``{"error": ...}`` with status 400 when ``name`` is missing or empty,
        or ``{"error": ...}`` with the Kubernetes API status (500 if the
        exception carries none) when the API call fails.
    """
    data = await request.json()
    name = data.get("name", "")
    namespace = data.get("namespace", "default")

    # Reject a missing/empty name up front rather than issuing a delete
    # request that does not identify a single Workload.
    if not name:
        return JSONResponse(
            {"error": "Missing required field 'name'."}, status_code=400
        )

    delete_options = client.V1DeleteOptions(propagation_policy="Background")

    try:
        crd_client.delete_namespaced_custom_object(
            group="kueue.x-k8s.io",
            version="v1beta1",
            namespace=namespace,
            plural="workloads",
            name=name,
            body=delete_options,
        )
    except ApiException as e:
        logger.debug(f"Exception: {e}")
        # ApiException.status may be None (no HTTP response was received);
        # fall back to 500 so the response always has a valid status code.
        return JSONResponse({"error": str(e)}, status_code=e.status or 500)

    logger.debug(f"Kueue Workload '{name}' deleted successfully.")
    return JSONResponse({"success": True, "status": 200})


@app.get("/getJobs")
async def get_jobs():
    """Return the formatted list of Kueue workloads as JSON.

    Returns:
        The rows produced by ``fetch_jobs()`` on success, or
        ``{"error": ...}`` with the Kubernetes API status (500 if the
        exception carries none) when listing fails — the same error shape
        the other handlers in this module use.
    """
    try:
        rows = fetch_jobs()
    except ApiException as e:
        logger.debug(f"Exception: {e}")
        # ApiException.status may be None; fall back to a valid status code.
        return JSONResponse({"error": str(e)}, status_code=e.status or 500)
    return JSONResponse(rows)


@app.get("/getNamespaces")
async def get_namespaces():
    """Return the names of all namespaces in the cluster as a JSON list.

    Returns:
        A JSON array of namespace names on success, or ``{"error": ...}``
        with the Kubernetes API status (500 if the exception carries none)
        when the list call fails.
    """
    try:
        # Get the list of namespaces
        namespaces = core_client.list_namespace()
    except ApiException as e:
        logger.debug(f"Exception: {e}")
        # ApiException.status may be None; fall back to a valid status code.
        return JSONResponse({"error": str(e)}, status_code=e.status or 500)

    # Extract the namespace names from the response
    namespace_list = [ns.metadata.name for ns in namespaces.items]
    return JSONResponse(namespace_list)


@app.put("/updatePriority")
async def update_priority(request: Request):
    """Set ``spec.priority`` on the Kueue Workload named in the request body.

    The body is read as JSON and may contain:
        name: name of the Workload to update (required).
        namespace: namespace of the Workload; defaults to "default".
        priority: new priority value; defaults to 0.

    The Workload is fetched, its ``spec.priority`` is overwritten, and the
    modified object is sent back as the patch body.

    Returns:
        ``{"success": True, "status": 200}`` when the patch succeeds,
        ``{"error": ...}`` with status 400 when ``name`` is missing or empty,
        or ``{"error": ...}`` with the Kubernetes API status (500 if the
        exception carries none) when an API call fails.
    """
    data = await request.json()
    name = data.get("name", "")
    namespace = data.get("namespace", "default")
    priority = data.get("priority", 0)

    # Reject a missing/empty name up front: the lookup below would not
    # address a single Workload, and the result would lack the expected shape.
    if not name:
        return JSONResponse(
            {"error": "Missing required field 'name'."}, status_code=400
        )

    try:
        job = crd_client.get_namespaced_custom_object(
            group="kueue.x-k8s.io",
            version="v1beta1",
            namespace=namespace,
            plural="workloads",
            name=name,
        )

        # setdefault keeps this safe even if the fetched object has no spec.
        job.setdefault("spec", {})["priority"] = priority

        crd_client.patch_namespaced_custom_object(
            group="kueue.x-k8s.io",
            version="v1beta1",
            namespace=namespace,
            plural="workloads",
            name=name,
            body=job,
        )
    except ApiException as e:
        logger.debug(f"Exception: {e}")
        # ApiException.status may be None; fall back to a valid status code.
        return JSONResponse({"error": str(e)}, status_code=e.status or 500)

    return JSONResponse({"success": True, "status": 200})


# Get a listing of workloads in kueue
def fetch_jobs(namespace: str = "default") -> List[Dict[str, Any]]:
    """List Kueue workloads in ``namespace`` and return them as table rows.

    Args:
        namespace: namespace whose workloads are listed. Defaults to
            "default", which was previously the only namespace queried.

    Returns:
        The result of ``format_workloads`` applied to the raw listing.

    Raises:
        ApiException: propagated from the Kubernetes client if the list
            call fails.
    """
    listing = crd_client.list_namespaced_custom_object(
        group="kueue.x-k8s.io",
        version="v1beta1",
        namespace=namespace,
        plural="workloads",
    )

    return format_workloads(listing)


def format_workloads(listing: Dict[str, Any]) -> List[Dict[str, Any]]:
    """Flatten a workload listing into one plain dict per workload.

    Args:
        listing: mapping with an ``"items"`` list of workload objects, each
            carrying ``metadata`` (uid, name, creationTimestamp, namespace)
            and optionally ``spec`` and ``status``. A falsy listing, or one
            without items, yields an empty list.

    Returns:
        A list of dicts with the keys ``id``, ``name``, ``namespace``,
        ``localQueueName``, ``priority``, ``status``, ``active``,
        ``created_at`` and ``order``. ``status`` is "ADMITTED" when the
        workload's status contains an ``admission`` entry and "PENDING"
        otherwise; ``order`` is 10 for admitted workloads (0 otherwise) plus
        the priority.
    """
    if not listing:
        return []

    rows: List[Dict[str, Any]] = []

    for job in listing.get("items") or []:
        metadata = job["metadata"]
        # Tolerate a missing or null spec/status instead of raising.
        spec = job.get("spec") or {}
        admitted = "admission" in (job.get("status") or {})

        # Treat a missing or null priority as 0 so that a single workload
        # without one cannot fail the whole listing.
        priority = spec.get("priority") or 0

        rows.append(
            {
                "id": metadata["uid"],
                "name": metadata["name"],
                "namespace": metadata["namespace"],
                "localQueueName": spec.get("queueName", "Unknown"),
                "priority": priority,
                "status": "ADMITTED" if admitted else "PENDING",
                "active": spec.get("active", 0),
                "created_at": metadata["creationTimestamp"],
                # Admitted workloads get a +10 offset on top of their priority.
                "order": (10 if admitted else 0) + priority,
            }
        )

    return rows


# Wrap the FastAPI application together with the Socket.IO server imported
# from .sockets. This rebinds the module-level name `app`, so it must stay
# after every route decorator above: from here on `app` is the
# socketio.ASGIApp wrapper, not the FastAPI instance.
app = socketio.ASGIApp(sio, app)
Loading

0 comments on commit 1fa2c4d

Please sign in to comment.