Skip to content

Commit

Permalink
Merge pull request #192 from Datura-ai/main
Browse files Browse the repository at this point in the history
deploy executor
  • Loading branch information
pyon12 authored Jan 16, 2025
2 parents ca7e2e6 + 968f550 commit 2eadb9a
Show file tree
Hide file tree
Showing 18 changed files with 176 additions and 80 deletions.
2 changes: 1 addition & 1 deletion contrib/STYLE.md
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ Python's official style guide is PEP 8, which provides conventions for writing c

#### More details

Use `black` to format your python code before commiting for consistency across such a large pool of contributors. Black's code [style](https://black.readthedocs.io/en/stable/the_black_code_style/current_style.html#code-style) ensures consistent and opinionated code formatting. It automatically formats your Python code according to the Black style guide, enhancing code readability and maintainability.
Use `black` to format your python code before committing for consistency across such a large pool of contributors. Black's code [style](https://black.readthedocs.io/en/stable/the_black_code_style/current_style.html#code-style) ensures consistent and opinionated code formatting. It automatically formats your Python code according to the Black style guide, enhancing code readability and maintainability.

Key Features of Black:

Expand Down
7 changes: 4 additions & 3 deletions neurons/executor/.env.template
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
INTERNAL_PORT=8001
EXTERNAL_PORT=8001
INTERNAL_PORT=8001 # internal port of docker
EXTERNAL_PORT=8001 # external port of docker

SSH_PORT=2200
SSH_PORT=2200 # external ssh port of docker, mapped to port 22 inside the container
SSH_PUBLIC_PORT=2200 # Optional. In case you are using a proxy and the public port differs from the internal port on your server

# NOTE: please use either RENTING_PORT_RANGE or RENTING_PORT_MAPPINGS; using both is not allowed
# Note: If you are not using proxy and all ports are available publicly,
Expand Down
2 changes: 1 addition & 1 deletion neurons/executor/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,6 @@ RUN mkdir -p /etc/docker
RUN mkdir -p /etc/nvidia-container-runtime
RUN mkdir -p /root/.ssh

LABEL version="3.3.2"
LABEL version="3.3.3"

CMD ["bash", "run.sh"]
2 changes: 1 addition & 1 deletion neurons/executor/Dockerfile.runner
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,6 @@ COPY entrypoint.sh /entrypoint.sh

RUN chmod u+x /entrypoint.sh

LABEL version="3.3.2"
LABEL version="3.3.3"

ENTRYPOINT ["/entrypoint.sh"]
3 changes: 2 additions & 1 deletion neurons/executor/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,8 @@ You can change the ports for `INTERNAL_PORT`, `EXTERNAL_PORT`, `SSH_PORT` based

- **INTERNAL_PORT**: internal port of your executor docker container
- **EXTERNAL_PORT**: external expose port of your executor docker container
- **SSH_PORT**: ssh access port of your executor docker container
- **SSH_PORT**: ssh port mapped to port 22 of your executor docker container
- **SSH_PUBLIC_PORT**: [Optional] ssh public access port of your executor docker container. If `SSH_PUBLIC_PORT` is equal to `SSH_PORT` then you don't have to specify this port.
- **MINER_HOTKEY_SS58_ADDRESS**: the miner hotkey address
- **RENTING_PORT_RANGE**: The port range that is publicly accessible. This can be empty if all ports are open. Available formats are:
- Range Specification(`from-to`): Miners can specify a range of ports, such as 2000-2005. This means ports from 2000 to 2005 will be open for the validator to select.
Expand Down
3 changes: 2 additions & 1 deletion neurons/executor/src/core/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,8 @@ class Settings(BaseSettings):
PROJECT_NAME: str = "compute-subnet-executor"

INTERNAL_PORT: int = Field(env="INTERNAL_PORT", default=8001)
SSH_PORT: int = Field(env="SSH_PORT", default=22)
SSH_PORT: int = Field(env="SSH_PORT", default=2200)
SSH_PUBLIC_PORT: Optional[int] = Field(env="SSH_PUBLIC_PORT", default=None)

MINER_HOTKEY_SS58_ADDRESS: str = Field(env="MINER_HOTKEY_SS58_ADDRESS")

Expand Down
2 changes: 1 addition & 1 deletion neurons/executor/src/services/miner_service.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ async def upload_ssh_key(self, paylod: MinerAuthPayload):

return {
"ssh_username": self.ssh_service.get_current_os_user(),
"ssh_port": settings.SSH_PORT,
"ssh_port": settings.SSH_PUBLIC_PORT or settings.SSH_PORT,
"python_path": sys.executable,
"root_dir": str(Path(__file__).resolve().parents[2]),
"port_range": settings.RENTING_PORT_RANGE,
Expand Down
2 changes: 1 addition & 1 deletion neurons/miners/src/core/miner.py
Original file line number Diff line number Diff line change
Expand Up @@ -146,7 +146,7 @@ def announce(self):
except Exception as e:
logger.error(
_m(
'[announce] Annoucing miner error',
'[announce] Announcing miner error',
extra=get_extra_info({
**self.default_extra,
"error": str(e)
Expand Down
2 changes: 1 addition & 1 deletion neurons/validators/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,6 @@ RUN echo "export PYTHONPATH=$PYTHONPATH" >> ~/.bash_profile
COPY --from=base-image /root/app/ /root/app/
COPY --from=base-image /opt/pypackages/ /opt/pypackages/

LABEL version="3.3.14"
LABEL version="3.4.0"

CMD ["bash", "run.sh"]
2 changes: 1 addition & 1 deletion neurons/validators/Dockerfile.runner
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ WORKDIR /root/validator
COPY docker-compose.app.yml docker-compose.yml
COPY entrypoint.sh /entrypoint.sh

LABEL version="3.3.14"
LABEL version="3.4.0"

RUN chmod u+x /entrypoint.sh
ENTRYPOINT ["/entrypoint.sh"]
16 changes: 8 additions & 8 deletions neurons/validators/src/clients/compute_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
ContainerCreateRequest,
ContainerDeleteRequest,
FailedContainerRequest,
DuplicateContainersResponse,
DuplicateExecutorsResponse,
ContainerStartRequest,
ContainerStopRequest,
ContainerBaseRequest,
Expand All @@ -24,7 +24,7 @@
ExecutorSpecRequest,
LogStreamRequest,
RentedMachineRequest,
DuplicateContainersRequest,
DuplicateExecutorsRequest,
)
from pydantic import BaseModel
from websockets.asyncio.client import ClientConnection
Expand Down Expand Up @@ -407,7 +407,7 @@ async def poll_rented_machines(self):
extra=self.logging_extra,
)
)
await self.send_model(DuplicateContainersRequest())
await self.send_model(DuplicateExecutorsRequest())

await asyncio.sleep(10 * 60)
else:
Expand Down Expand Up @@ -460,26 +460,26 @@ async def handle_message(self, raw_msg: str | bytes):
return

try:
response = pydantic.TypeAdapter(DuplicateContainersResponse).validate_json(raw_msg)
response = pydantic.TypeAdapter(DuplicateExecutorsResponse).validate_json(raw_msg)
except pydantic.ValidationError as exc:
logger.error(
_m(
"could not parse raw message as DuplicateContainersResponse",
"could not parse raw message as DuplicateExecutorsResponse",
extra={**self.logging_extra, "error": str(exc), "raw_msg": raw_msg},
)
)
else:
logger.info(
_m(
"Duplicated containers",
extra={**self.logging_extra, "machines": response.containers},
"Duplicated executors",
extra={**self.logging_extra, "executors": len(response.executors)},
)
)

redis_service = self.miner_service.redis_service
await redis_service.delete(DUPLICATED_MACHINE_SET)

for container_id, details_list in response.containers.items():
for _, details_list in response.executors.items():
for detail in details_list:
executor_id = detail.get("executor_id")
miner_hotkey = detail.get("miner_hotkey")
Expand Down
2 changes: 1 addition & 1 deletion neurons/validators/src/core/validator.py
Original file line number Diff line number Diff line change
Expand Up @@ -107,7 +107,7 @@ async def initiate_services(self):
else:
self.miner_scores = json.loads(miner_scores_json)

await self.redis_service.clear_all_ssh_ports()
# await self.redis_service.clear_all_ssh_ports()
except Exception as e:
logger.error(
_m(
Expand Down
12 changes: 11 additions & 1 deletion neurons/validators/src/miner_jobs/machine_scrape.py
Original file line number Diff line number Diff line change
Expand Up @@ -370,6 +370,15 @@ def nvmlSystemGetDriverVersion():
return c_version.value


@convertStrBytes
def nvmlDeviceGetUUID(handle):
c_uuid = create_string_buffer(NVML_DEVICE_UUID_V2_BUFFER_SIZE)
fn = _nvmlGetFunctionPointer("nvmlDeviceGetUUID")
ret = fn(handle, c_uuid, c_uint(NVML_DEVICE_UUID_V2_BUFFER_SIZE))
_nvmlCheckReturn(ret)
return c_uuid.value


def nvmlSystemGetCudaDriverVersion():
c_cuda_version = c_int()
fn = _nvmlGetFunctionPointer("nvmlSystemGetCudaDriverVersion")
Expand Down Expand Up @@ -658,6 +667,7 @@ def get_machine_specs():
data["gpu"]["details"].append(
{
"name": nvmlDeviceGetName(handle),
"uuid": nvmlDeviceGetUUID(handle),
"capacity": nvmlDeviceGetMemoryInfo(handle).total / (1024 ** 2),
"cuda": f"{major}.{minor}",
"power_limit": nvmlDeviceGetPowerManagementLimit(handle) / 1000,
Expand Down Expand Up @@ -767,4 +777,4 @@ def _encrypt(key: str, payload: str) -> str:
key = 'encrypt_key'
machine_specs = get_machine_specs()
encoded_str = _encrypt(key, json.dumps(machine_specs))
print(encoded_str)
print(encoded_str)
8 changes: 4 additions & 4 deletions neurons/validators/src/payload_models/payloads.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ class ContainerRequestType(enum.Enum):
ContainerStartRequest = "ContainerStartRequest"
ContainerStopRequest = "ContainerStopRequest"
ContainerDeleteRequest = "ContainerDeleteRequest"
DuplicateContainersResponse = "DuplicateContainersResponse"
DuplicateExecutorsResponse = "DuplicateExecutorsResponse"


class ContainerBaseRequest(BaseRequest):
Expand Down Expand Up @@ -140,6 +140,6 @@ class FailedContainerRequest(ContainerBaseResponse):
error_code: FailedContainerErrorCodes | None = None


class DuplicateContainersResponse(BaseModel):
message_type: ContainerRequestType = ContainerRequestType.DuplicateContainersResponse
containers: dict[str, list]
class DuplicateExecutorsResponse(BaseModel):
message_type: ContainerRequestType = ContainerRequestType.DuplicateExecutorsResponse
executors: dict[str, list]
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ class RequestType(enum.Enum):
ExecutorSpecRequest = "ExecutorSpecRequest"
RentedMachineRequest = "RentedMachineRequest"
LogStreamRequest = "LogStreamRequest"
DuplicateContainersRequest = "DuplicateContainersRequest"
DuplicateExecutorsRequest = "DuplicateExecutorsRequest"


class BaseValidatorRequest(BaseRequest):
Expand Down Expand Up @@ -73,5 +73,5 @@ class LogStreamRequest(BaseValidatorRequest):
logs: list[dict]


class DuplicateContainersRequest(BaseValidatorRequest):
message_type: RequestType = RequestType.DuplicateContainersRequest
class DuplicateExecutorsRequest(BaseValidatorRequest):
message_type: RequestType = RequestType.DuplicateExecutorsRequest
17 changes: 16 additions & 1 deletion neurons/validators/src/services/const.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
"NVIDIA H100 NVL": 2.79,
"NVIDIA H100 PCIe": 2.69,
"NVIDIA GeForce RTX 4090": 0.69,
"NVIDIA GeForce RTX 4090 D": 0.62,
"NVIDIA RTX 4000 Ada Generation": 0.38,
"NVIDIA RTX 6000 Ada Generation": 1.03,
"NVIDIA L4": 0.43,
Expand Down Expand Up @@ -34,7 +35,7 @@

MAX_GPU_COUNT = 14

UNRENTED_MULTIPLIER = 0.25
UNRENTED_MULTIPLIER = 1

GPU_UTILIZATION_LIMIT = 1
GPU_MEMORY_UTILIZATION_LIMIT = 1
Expand Down Expand Up @@ -274,6 +275,19 @@
34.388632106781,
],
},
"NVIDIA GeForce RTX 4090 D": {
"digits": 11,
"average_time": [
12.535813426971435,
13.367040371894836,
14.397390270233155,
15.773727321624756,
16.52033654212952,
18.87070236206055,
20.572682762145995,
22.169760519266127,
],
},
"NVIDIA GeForce RTX 4090": {
"digits": 11,
"average_time": [
Expand Down Expand Up @@ -322,6 +336,7 @@
"560.35.05": "1eec299b50e33a6cfa5155ded53495ab",
"565.57.01": "c801dd3fc4660f3a8ddf977cfdffe113",
"550.127.08": "ac925f2cd192ad971c5466d55945a243",
"550.142": "e68b535a61be6434fc7f12450561a3d0"
}

DOCKER_DIGESTS = {
Expand Down
Loading

0 comments on commit 2eadb9a

Please sign in to comment.