Merge pull request #201 from Datura-ai/main

deploy validator
Datura-ai · Jan 21, 2025 · 34bef99 · 34bef99
2 parents 117dd40 + e330bb2
commit 34bef99
Show file tree

Hide file tree

Showing 13 changed files with 114 additions and 12 deletions.
diff --git a/neurons/executor/Dockerfile b/neurons/executor/Dockerfile
@@ -53,6 +53,4 @@ RUN mkdir -p /etc/docker
 RUN mkdir -p /etc/nvidia-container-runtime
 RUN mkdir -p /root/.ssh
 
-LABEL version="3.3.3"
-
 CMD ["bash", "run.sh"]
diff --git a/neurons/executor/Dockerfile.runner b/neurons/executor/Dockerfile.runner
@@ -3,9 +3,8 @@ WORKDIR /root/executor
 
 COPY docker-compose.app.yml docker-compose.yml
 COPY entrypoint.sh /entrypoint.sh
+COPY version.txt .
 
 RUN chmod u+x /entrypoint.sh
 
-LABEL version="3.3.3"
-
 ENTRYPOINT ["/entrypoint.sh"]
diff --git a/neurons/executor/docker-compose.app.yml b/neurons/executor/docker-compose.app.yml
@@ -11,6 +11,7 @@ services:
       - /var/run/docker.sock:/var/run/docker.sock
       - /etc/docker/daemon.json:/etc/docker/daemon.json
       - /etc/nvidia-container-runtime/config.toml:/etc/nvidia-container-runtime/config.toml
+    pid: host
     deploy:
       resources:
         reservations:

diff --git a/neurons/executor/version.txt b/neurons/executor/version.txt
@@ -0,0 +1 @@
+3.4.0
diff --git a/neurons/miners/Dockerfile b/neurons/miners/Dockerfile
@@ -46,6 +46,4 @@ RUN echo "export PYTHONPATH=$PYTHONPATH" >> ~/.bash_profile
 COPY --from=base-image /root/app/ /root/app/
 COPY --from=base-image /opt/pypackages/ /opt/pypackages/
 
-LABEL version="3.3.2"
-
 CMD ["bash", "run.sh"]
diff --git a/neurons/miners/Dockerfile.runner b/neurons/miners/Dockerfile.runner
@@ -2,8 +2,7 @@ FROM docker:26-cli
 WORKDIR /root/miner
 COPY docker-compose.app.yml docker-compose.yml
 COPY entrypoint.sh /entrypoint.sh
-
-LABEL version="3.3.2"
+COPY version.txt .
 
 RUN chmod u+x /entrypoint.sh
 ENTRYPOINT ["/entrypoint.sh"]
diff --git a/neurons/miners/version.txt b/neurons/miners/version.txt
@@ -0,0 +1 @@
+3.4.0
diff --git a/neurons/validators/Dockerfile b/neurons/validators/Dockerfile
@@ -47,6 +47,4 @@ RUN echo "export PYTHONPATH=$PYTHONPATH" >> ~/.bash_profile
 COPY --from=base-image /root/app/ /root/app/
 COPY --from=base-image /opt/pypackages/ /opt/pypackages/
 
-LABEL version="3.4.1"
-
 CMD ["bash", "run.sh"]
diff --git a/neurons/validators/Dockerfile.runner b/neurons/validators/Dockerfile.runner
@@ -2,8 +2,7 @@ FROM docker:26-cli
 WORKDIR /root/validator
 COPY docker-compose.app.yml docker-compose.yml
 COPY entrypoint.sh /entrypoint.sh
-
-LABEL version="3.4.1"
+COPY version.txt .
 
 RUN chmod u+x /entrypoint.sh
 ENTRYPOINT ["/entrypoint.sh"]
diff --git a/neurons/validators/src/core/config.py b/neurons/validators/src/core/config.py
@@ -42,6 +42,9 @@ class Settings(BaseSettings):
     )
 
     ENV: str = Field(env="ENV", default="dev")
+
+    # Read version from version.txt
+    VERSION: str = (pathlib.Path(__file__).parent / ".." / ".." / "version.txt").read_text().strip()
 
     def get_bittensor_wallet(self) -> "Wallet":
         if not self.BITTENSOR_WALLET_NAME or not self.BITTENSOR_WALLET_HOTKEY_NAME:

diff --git a/neurons/validators/src/miner_jobs/machine_scrape.py b/neurons/validators/src/miner_jobs/machine_scrape.py
@@ -79,6 +79,8 @@
 NVML_CLOCK_VIDEO = 3
 NVML_CLOCK_COUNT = 4
 
+# Sentinel for "value not available": c_ulonglong(-1) wraps to the all-ones
+# 64-bit value. A usedGpuMemory field equal to this value is reported as None
+# by nvmlDeviceGetComputeRunningProcesses_v2.
+NVML_VALUE_NOT_AVAILABLE_ulonglong = c_ulonglong(-1)
+
 
 class struct_c_nvmlDevice_t(Structure):
     pass  # opaque handle
@@ -222,6 +224,21 @@ def __eq__(self, other):
         return self.value == other.value
 
 
+class c_nvmlProcessInfo_v2_t(_PrintableStructure):
+    """ctypes layout of one per-process record returned by NVML.
+
+    An array of these is allocated and filled in by
+    ``nvmlDeviceGetComputeRunningProcesses_v2``. Field order and types are
+    part of the binary layout and must not be changed.
+    """
+    _fields_ = [
+        # ID of the process using the GPU.
+        ('pid', c_uint),
+        # GPU memory used by the process; formatted as bytes via _fmt_ below.
+        ('usedGpuMemory', c_ulonglong),
+        # NOTE(review): presumably the MIG GPU/compute instance IDs -- confirm
+        # against the NVML headers.
+        ('gpuInstanceId', c_uint),
+        ('computeInstanceId', c_uint),
+    ]
+    # Per-field format override; presumably consumed by _PrintableStructure
+    # when the structure is printed (that class is defined outside this hunk).
+    _fmt_ = {'usedGpuMemory': "%d B"}
+
+
+# The v3 name and the unversioned name are plain aliases of the v2 layout.
+c_nvmlProcessInfo_v3_t = c_nvmlProcessInfo_v2_t
+
+c_nvmlProcessInfo_t = c_nvmlProcessInfo_v3_t
+
+
 def convertStrBytes(func):
     '''
     In python 3, strings are unicode instead of bytes, and need to be converted for ctypes
@@ -520,6 +537,57 @@ def nvmlDeviceGetUtilizationRates(handle):
     return c_util
 
 
+class nvmlFriendlyObject(object):
+    """Plain attribute bag: every key of the given dictionary becomes an attribute."""
+
+    def __init__(self, dictionary):
+        for name, value in dictionary.items():
+            setattr(self, name, value)
+
+    def __str__(self):
+        # Render exactly like the underlying attribute dictionary.
+        return str(vars(self))
+
+
+def nvmlStructToFriendlyObject(struct):
+    """Copy every field of a ctypes structure onto an ``nvmlFriendlyObject``.
+
+    Field names are taken from ``struct._fields_``. ``bytes`` values are
+    decoded to ``str``; every other value is copied unchanged.
+    """
+    def _as_text(raw):
+        # Only bytes need converting, whatever the Python version.
+        return raw.decode() if isinstance(raw, bytes) else raw
+
+    # Index with [0] rather than unpacking: only the field name is needed.
+    names = [field[0] for field in struct._fields_]
+    return nvmlFriendlyObject(
+        {name: _as_text(getattr(struct, name)) for name in names}
+    )
+
+
+def nvmlDeviceGetComputeRunningProcesses_v2(handle):
+    """Return the compute processes currently running on a device.
+
+    Args:
+        handle: NVML device handle to query.
+
+    Returns:
+        A list of ``nvmlFriendlyObject`` instances, one per process, carrying
+        the fields of ``c_nvmlProcessInfo_v2_t``. ``usedGpuMemory`` is ``None``
+        when NVML reports it as not available. The list is empty when no
+        process is running.
+
+    Raises:
+        NVMLError: if the size query returns anything other than success or
+            "insufficient size". Failures of the second call are handled by
+            ``_nvmlCheckReturn``.
+    """
+    fn = _nvmlGetFunctionPointer("nvmlDeviceGetComputeRunningProcesses_v2")
+
+    # Size query: call with no buffer so NVML reports how many entries exist.
+    c_count = c_uint(0)
+    ret = fn(handle, byref(c_count), None)
+    if ret == NVML_SUCCESS:
+        # Success with an empty buffer means there are no running processes.
+        return []
+    if ret != NVML_ERROR_INSUFFICIENT_SIZE:
+        raise NVMLError(ret)
+
+    # Typical case. Oversize the buffer in case more processes are created
+    # between the size query and the real call.
+    c_count.value = c_count.value * 2 + 5
+    c_procs = (c_nvmlProcessInfo_v2_t * c_count.value)()
+    ret = fn(handle, byref(c_count), c_procs)
+    _nvmlCheckReturn(ret)
+
+    procs = []
+    for index in range(c_count.value):
+        # Hand back a plain Python object instead of the raw ctypes struct.
+        friendly = nvmlStructToFriendlyObject(c_procs[index])
+        if friendly.usedGpuMemory == NVML_VALUE_NOT_AVAILABLE_ulonglong.value:
+            # NVML uses the all-ones sentinel when memory usage is not
+            # available (e.g. under WDDM on Windows); expose that as None.
+            friendly.usedGpuMemory = None
+        procs.append(friendly)
+    return procs
+
+
 def run_cmd(cmd):
     proc = subprocess.run(cmd, shell=True, capture_output=True, check=False, text=True)
     if proc.returncode != 0:
@@ -623,6 +691,28 @@ def get_file_content(path: str):
     return content
 
 
+def get_gpu_processes(pids: set):
+    """Describe each GPU process by the contents of its ``/proc/<pid>/cgroup``.
+
+    Args:
+        pids: process IDs to look up. ``None`` or an empty set yields ``[]``.
+
+    Returns:
+        A list with one ``{"pid": ..., "info": ...}`` dict per PID. ``info`` is
+        the stripped output of ``cat /proc/<pid>/cgroup``, or ``None`` when the
+        lookup failed (for example because the process has already exited).
+    """
+    if not pids:
+        return []
+
+    processes = []
+    for pid in pids:
+        try:
+            info = run_cmd(f'cat /proc/{pid}/cgroup').strip()
+        except Exception:
+            # Best effort: a failed lookup is recorded as None rather than
+            # aborting the scrape. Catch Exception, not a bare except, so
+            # KeyboardInterrupt and SystemExit still propagate.
+            info = None
+        processes.append({"pid": pid, "info": info})
+
+    return processes
+
+
 def get_machine_specs():
     """Get Specs of miner machine."""
     data = {}
@@ -631,6 +721,8 @@ def get_machine_specs():
         return data
 
     data["gpu"] = {"count": 0, "details": []}
+    gpu_process_ids = set()
+
     try:
         libnvidia_path = get_libnvidia_ml_path()
         if not libnvidia_path:
@@ -680,6 +772,12 @@ def get_machine_specs():
                 }
             )
 
+            processes = nvmlDeviceGetComputeRunningProcesses_v2(handle)
+
+            # Collect process IDs
+            for proc in processes:
+                gpu_process_ids.add(proc.pid)
+
         nvmlShutdown()
     except Exception as exc:
         # print(f'Error getting os specs: {exc}', flush=True)
@@ -699,6 +797,8 @@ def get_machine_specs():
         except Exception as exc:
             data["docker_cfg_scrape_error"] = repr(exc)
 
+    data['gpu_processes'] = get_gpu_processes(gpu_process_ids)
+
     data["cpu"] = {"count": 0, "model": "", "clocks": []}
     try:
         lscpu_output = run_cmd("lscpu")

diff --git a/neurons/validators/src/services/task_service.py b/neurons/validators/src/services/task_service.py
@@ -226,6 +226,7 @@ async def docker_connection_check(
                         "executor_port": executor_info.port,
                         "ssh_username": executor_info.ssh_username,
                         "ssh_port": executor_info.ssh_port,
+                        "version": settings.VERSION
                     }
                 ),
             )
@@ -246,6 +247,7 @@ async def docker_connection_check(
             "ssh_port": executor_info.ssh_port,
             "internal_port": internal_port,
             "external_port": external_port,
+            "version": settings.VERSION,
         }
         context.set(f"[_docker_connection_check][{executor_name}]")
 
@@ -358,6 +360,7 @@ async def create_task(
             "executor_port": executor_info.port,
             "executor_ssh_username": executor_info.ssh_username,
             "executor_ssh_port": executor_info.ssh_port,
+            "version": settings.VERSION,
         }
         try:
             logger.info(_m("Start job on an executor", extra=get_extra_info(default_extra)))
@@ -956,6 +959,7 @@ async def _run_task(
                 "executor_port": executor_info.port,
                 "miner_hotkey": miner_hotkey,
                 "command": command[:100] + ("..." if len(command) > 100 else ""),
+                "version": settings.VERSION,
             }
             context.set(f"[_run_task][{executor_name}]")
             logger.info(

diff --git a/neurons/validators/version.txt b/neurons/validators/version.txt
@@ -0,0 +1 @@
+3.4.2