Skip to content

Commit

Permalink
Merge pull request #192 from Datura-ai/main
Browse files Browse the repository at this point in the history
deploy executor
  • Loading branch information
pyon12 authored Jan 16, 2025
2 parents ca7e2e6 + 968f550 commit 2eadb9a
Show file tree
Hide file tree
Showing 18 changed files with 176 additions and 80 deletions.
2 changes: 1 addition & 1 deletion contrib/STYLE.md
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ Python's official style guide is PEP 8, which provides conventions for writing c

#### More details

Use `black` to format your python code before commiting for consistency across such a large pool of contributors. Black's code [style](https://black.readthedocs.io/en/stable/the_black_code_style/current_style.html#code-style) ensures consistent and opinionated code formatting. It automatically formats your Python code according to the Black style guide, enhancing code readability and maintainability.
Use `black` to format your python code before committing for consistency across such a large pool of contributors. Black's code [style](https://black.readthedocs.io/en/stable/the_black_code_style/current_style.html#code-style) ensures consistent and opinionated code formatting. It automatically formats your Python code according to the Black style guide, enhancing code readability and maintainability.

Key Features of Black:

Expand Down
7 changes: 4 additions & 3 deletions neurons/executor/.env.template
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
INTERNAL_PORT=8001
EXTERNAL_PORT=8001
INTERNAL_PORT=8001 # internal port of docker
EXTERNAL_PORT=8001 # external port of docker

SSH_PORT=2200
SSH_PORT=2200 # external ssh port of docker, mapped to port 22 inside the container
SSH_PUBLIC_PORT=2200 # Optional. In case you are using a proxy and the public port differs from the internal port on your server

# NOTE: please use either RENTING_PORT_RANGE or RENTING_PORT_MAPPINGS; using both is not allowed
# Note: If you are not using proxy and all ports are available publicly,
Expand Down
2 changes: 1 addition & 1 deletion neurons/executor/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,6 @@ RUN mkdir -p /etc/docker
RUN mkdir -p /etc/nvidia-container-runtime
RUN mkdir -p /root/.ssh

LABEL version="3.3.2"
LABEL version="3.3.3"

CMD ["bash", "run.sh"]
2 changes: 1 addition & 1 deletion neurons/executor/Dockerfile.runner
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,6 @@ COPY entrypoint.sh /entrypoint.sh

RUN chmod u+x /entrypoint.sh

LABEL version="3.3.2"
LABEL version="3.3.3"

ENTRYPOINT ["/entrypoint.sh"]
3 changes: 2 additions & 1 deletion neurons/executor/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,8 @@ You can change the ports for `INTERNAL_PORT`, `EXTERNAL_PORT`, `SSH_PORT` based

- **INTERNAL_PORT**: internal port of your executor docker container
- **EXTERNAL_PORT**: external expose port of your executor docker container
- **SSH_PORT**: ssh access port of your executor docker container
- **SSH_PORT**: ssh port mapped to port 22 of your executor docker container
- **SSH_PUBLIC_PORT**: [Optional] ssh public access port of your executor docker container. If `SSH_PUBLIC_PORT` is equal to `SSH_PORT` then you don't have to specify this port.
- **MINER_HOTKEY_SS58_ADDRESS**: the miner hotkey address
- **RENTING_PORT_RANGE**: The port range that is publicly accessible. This can be empty if all ports are open. Available formats are:
- Range Specification(`from-to`): Miners can specify a range of ports, such as 2000-2005. This means ports from 2000 to 2005 will be open for the validator to select.
Expand Down
3 changes: 2 additions & 1 deletion neurons/executor/src/core/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,8 @@ class Settings(BaseSettings):
PROJECT_NAME: str = "compute-subnet-executor"

INTERNAL_PORT: int = Field(env="INTERNAL_PORT", default=8001)
SSH_PORT: int = Field(env="SSH_PORT", default=22)
SSH_PORT: int = Field(env="SSH_PORT", default=2200)
SSH_PUBLIC_PORT: Optional[int] = Field(env="SSH_PUBLIC_PORT", default=None)

MINER_HOTKEY_SS58_ADDRESS: str = Field(env="MINER_HOTKEY_SS58_ADDRESS")

Expand Down
2 changes: 1 addition & 1 deletion neurons/executor/src/services/miner_service.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ async def upload_ssh_key(self, paylod: MinerAuthPayload):

return {
"ssh_username": self.ssh_service.get_current_os_user(),
"ssh_port": settings.SSH_PORT,
"ssh_port": settings.SSH_PUBLIC_PORT or settings.SSH_PORT,
"python_path": sys.executable,
"root_dir": str(Path(__file__).resolve().parents[2]),
"port_range": settings.RENTING_PORT_RANGE,
Expand Down
2 changes: 1 addition & 1 deletion neurons/miners/src/core/miner.py
Original file line number Diff line number Diff line change
Expand Up @@ -146,7 +146,7 @@ def announce(self):
except Exception as e:
logger.error(
_m(
'[announce] Annoucing miner error',
'[announce] Announcing miner error',
extra=get_extra_info({
**self.default_extra,
"error": str(e)
Expand Down
2 changes: 1 addition & 1 deletion neurons/validators/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,6 @@ RUN echo "export PYTHONPATH=$PYTHONPATH" >> ~/.bash_profile
COPY --from=base-image /root/app/ /root/app/
COPY --from=base-image /opt/pypackages/ /opt/pypackages/

LABEL version="3.3.14"
LABEL version="3.4.0"

CMD ["bash", "run.sh"]
2 changes: 1 addition & 1 deletion neurons/validators/Dockerfile.runner
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ WORKDIR /root/validator
COPY docker-compose.app.yml docker-compose.yml
COPY entrypoint.sh /entrypoint.sh

LABEL version="3.3.14"
LABEL version="3.4.0"

RUN chmod u+x /entrypoint.sh
ENTRYPOINT ["/entrypoint.sh"]
16 changes: 8 additions & 8 deletions neurons/validators/src/clients/compute_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
ContainerCreateRequest,
ContainerDeleteRequest,
FailedContainerRequest,
DuplicateContainersResponse,
DuplicateExecutorsResponse,
ContainerStartRequest,
ContainerStopRequest,
ContainerBaseRequest,
Expand All @@ -24,7 +24,7 @@
ExecutorSpecRequest,
LogStreamRequest,
RentedMachineRequest,
DuplicateContainersRequest,
DuplicateExecutorsRequest,
)
from pydantic import BaseModel
from websockets.asyncio.client import ClientConnection
Expand Down Expand Up @@ -407,7 +407,7 @@ async def poll_rented_machines(self):
extra=self.logging_extra,
)
)
await self.send_model(DuplicateContainersRequest())
await self.send_model(DuplicateExecutorsRequest())

await asyncio.sleep(10 * 60)
else:
Expand Down Expand Up @@ -460,26 +460,26 @@ async def handle_message(self, raw_msg: str | bytes):
return

try:
response = pydantic.TypeAdapter(DuplicateContainersResponse).validate_json(raw_msg)
response = pydantic.TypeAdapter(DuplicateExecutorsResponse).validate_json(raw_msg)
except pydantic.ValidationError as exc:
logger.error(
_m(
"could not parse raw message as DuplicateContainersResponse",
"could not parse raw message as DuplicateExecutorsResponse",
extra={**self.logging_extra, "error": str(exc), "raw_msg": raw_msg},
)
)
else:
logger.info(
_m(
"Duplicated containers",
extra={**self.logging_extra, "machines": response.containers},
"Duplicated executors",
extra={**self.logging_extra, "executors": len(response.executors)},
)
)

redis_service = self.miner_service.redis_service
await redis_service.delete(DUPLICATED_MACHINE_SET)

for container_id, details_list in response.containers.items():
for _, details_list in response.executors.items():
for detail in details_list:
executor_id = detail.get("executor_id")
miner_hotkey = detail.get("miner_hotkey")
Expand Down
2 changes: 1 addition & 1 deletion neurons/validators/src/core/validator.py
Original file line number Diff line number Diff line change
Expand Up @@ -107,7 +107,7 @@ async def initiate_services(self):
else:
self.miner_scores = json.loads(miner_scores_json)

await self.redis_service.clear_all_ssh_ports()
# await self.redis_service.clear_all_ssh_ports()
except Exception as e:
logger.error(
_m(
Expand Down
12 changes: 11 additions & 1 deletion neurons/validators/src/miner_jobs/machine_scrape.py
Original file line number Diff line number Diff line change
Expand Up @@ -370,6 +370,15 @@ def nvmlSystemGetDriverVersion():
return c_version.value


@convertStrBytes
def nvmlDeviceGetUUID(handle):
c_uuid = create_string_buffer(NVML_DEVICE_UUID_V2_BUFFER_SIZE)
fn = _nvmlGetFunctionPointer("nvmlDeviceGetUUID")
ret = fn(handle, c_uuid, c_uint(NVML_DEVICE_UUID_V2_BUFFER_SIZE))
_nvmlCheckReturn(ret)
return c_uuid.value


def nvmlSystemGetCudaDriverVersion():
c_cuda_version = c_int()
fn = _nvmlGetFunctionPointer("nvmlSystemGetCudaDriverVersion")
Expand Down Expand Up @@ -658,6 +667,7 @@ def get_machine_specs():
data["gpu"]["details"].append(
{
"name": nvmlDeviceGetName(handle),
"uuid": nvmlDeviceGetUUID(handle),
"capacity": nvmlDeviceGetMemoryInfo(handle).total / (1024 ** 2),
"cuda": f"{major}.{minor}",
"power_limit": nvmlDeviceGetPowerManagementLimit(handle) / 1000,
Expand Down Expand Up @@ -767,4 +777,4 @@ def _encrypt(key: str, payload: str) -> str:
key = 'encrypt_key'
machine_specs = get_machine_specs()
encoded_str = _encrypt(key, json.dumps(machine_specs))
print(encoded_str)
print(encoded_str)
8 changes: 4 additions & 4 deletions neurons/validators/src/payload_models/payloads.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ class ContainerRequestType(enum.Enum):
ContainerStartRequest = "ContainerStartRequest"
ContainerStopRequest = "ContainerStopRequest"
ContainerDeleteRequest = "ContainerDeleteRequest"
DuplicateContainersResponse = "DuplicateContainersResponse"
DuplicateExecutorsResponse = "DuplicateExecutorsResponse"


class ContainerBaseRequest(BaseRequest):
Expand Down Expand Up @@ -140,6 +140,6 @@ class FailedContainerRequest(ContainerBaseResponse):
error_code: FailedContainerErrorCodes | None = None


class DuplicateContainersResponse(BaseModel):
message_type: ContainerRequestType = ContainerRequestType.DuplicateContainersResponse
containers: dict[str, list]
class DuplicateExecutorsResponse(BaseModel):
message_type: ContainerRequestType = ContainerRequestType.DuplicateExecutorsResponse
executors: dict[str, list]
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ class RequestType(enum.Enum):
ExecutorSpecRequest = "ExecutorSpecRequest"
RentedMachineRequest = "RentedMachineRequest"
LogStreamRequest = "LogStreamRequest"
DuplicateContainersRequest = "DuplicateContainersRequest"
DuplicateExecutorsRequest = "DuplicateExecutorsRequest"


class BaseValidatorRequest(BaseRequest):
Expand Down Expand Up @@ -73,5 +73,5 @@ class LogStreamRequest(BaseValidatorRequest):
logs: list[dict]


class DuplicateContainersRequest(BaseValidatorRequest):
message_type: RequestType = RequestType.DuplicateContainersRequest
class DuplicateExecutorsRequest(BaseValidatorRequest):
message_type: RequestType = RequestType.DuplicateExecutorsRequest
17 changes: 16 additions & 1 deletion neurons/validators/src/services/const.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
"NVIDIA H100 NVL": 2.79,
"NVIDIA H100 PCIe": 2.69,
"NVIDIA GeForce RTX 4090": 0.69,
"NVIDIA GeForce RTX 4090 D": 0.62,
"NVIDIA RTX 4000 Ada Generation": 0.38,
"NVIDIA RTX 6000 Ada Generation": 1.03,
"NVIDIA L4": 0.43,
Expand Down Expand Up @@ -34,7 +35,7 @@

MAX_GPU_COUNT = 14

UNRENTED_MULTIPLIER = 0.25
UNRENTED_MULTIPLIER = 1

GPU_UTILIZATION_LIMIT = 1
GPU_MEMORY_UTILIZATION_LIMIT = 1
Expand Down Expand Up @@ -274,6 +275,19 @@
34.388632106781,
],
},
"NVIDIA GeForce RTX 4090 D": {
"digits": 11,
"average_time": [
12.535813426971435,
13.367040371894836,
14.397390270233155,
15.773727321624756,
16.52033654212952,
18.87070236206055,
20.572682762145995,
22.169760519266127,
],
},
"NVIDIA GeForce RTX 4090": {
"digits": 11,
"average_time": [
Expand Down Expand Up @@ -322,6 +336,7 @@
"560.35.05": "1eec299b50e33a6cfa5155ded53495ab",
"565.57.01": "c801dd3fc4660f3a8ddf977cfdffe113",
"550.127.08": "ac925f2cd192ad971c5466d55945a243",
"550.142": "e68b535a61be6434fc7f12450561a3d0"
}

DOCKER_DIGESTS = {
Expand Down
Loading

0 comments on commit 2eadb9a

Please sign in to comment.