Skip to content

Commit

Permalink
Merge pull request #201 from Datura-ai/main
Browse files Browse the repository at this point in the history
deploy validator
  • Loading branch information
pyon12 authored Jan 21, 2025
2 parents 117dd40 + e330bb2 commit 34bef99
Show file tree
Hide file tree
Showing 13 changed files with 114 additions and 12 deletions.
2 changes: 0 additions & 2 deletions neurons/executor/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,4 @@ RUN mkdir -p /etc/docker
RUN mkdir -p /etc/nvidia-container-runtime
RUN mkdir -p /root/.ssh

LABEL version="3.3.3"

CMD ["bash", "run.sh"]
3 changes: 1 addition & 2 deletions neurons/executor/Dockerfile.runner
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,8 @@ WORKDIR /root/executor

COPY docker-compose.app.yml docker-compose.yml
COPY entrypoint.sh /entrypoint.sh
COPY version.txt .

RUN chmod u+x /entrypoint.sh

LABEL version="3.3.3"

ENTRYPOINT ["/entrypoint.sh"]
1 change: 1 addition & 0 deletions neurons/executor/docker-compose.app.yml
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ services:
- /var/run/docker.sock:/var/run/docker.sock
- /etc/docker/daemon.json:/etc/docker/daemon.json
- /etc/nvidia-container-runtime/config.toml:/etc/nvidia-container-runtime/config.toml
pid: host
deploy:
resources:
reservations:
Expand Down
1 change: 1 addition & 0 deletions neurons/executor/version.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
3.4.0
2 changes: 0 additions & 2 deletions neurons/miners/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,4 @@ RUN echo "export PYTHONPATH=$PYTHONPATH" >> ~/.bash_profile
COPY --from=base-image /root/app/ /root/app/
COPY --from=base-image /opt/pypackages/ /opt/pypackages/

LABEL version="3.3.2"

CMD ["bash", "run.sh"]
3 changes: 1 addition & 2 deletions neurons/miners/Dockerfile.runner
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,7 @@ FROM docker:26-cli
WORKDIR /root/miner
COPY docker-compose.app.yml docker-compose.yml
COPY entrypoint.sh /entrypoint.sh

LABEL version="3.3.2"
COPY version.txt .

RUN chmod u+x /entrypoint.sh
ENTRYPOINT ["/entrypoint.sh"]
1 change: 1 addition & 0 deletions neurons/miners/version.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
3.4.0
2 changes: 0 additions & 2 deletions neurons/validators/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,4 @@ RUN echo "export PYTHONPATH=$PYTHONPATH" >> ~/.bash_profile
COPY --from=base-image /root/app/ /root/app/
COPY --from=base-image /opt/pypackages/ /opt/pypackages/

LABEL version="3.4.1"

CMD ["bash", "run.sh"]
3 changes: 1 addition & 2 deletions neurons/validators/Dockerfile.runner
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,7 @@ FROM docker:26-cli
WORKDIR /root/validator
COPY docker-compose.app.yml docker-compose.yml
COPY entrypoint.sh /entrypoint.sh

LABEL version="3.4.1"
COPY version.txt .

RUN chmod u+x /entrypoint.sh
ENTRYPOINT ["/entrypoint.sh"]
3 changes: 3 additions & 0 deletions neurons/validators/src/core/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,9 @@ class Settings(BaseSettings):
)

ENV: str = Field(env="ENV", default="dev")

# Read version from version.txt
VERSION: str = (pathlib.Path(__file__).parent / ".." / ".." / "version.txt").read_text().strip()

def get_bittensor_wallet(self) -> "Wallet":
if not self.BITTENSOR_WALLET_NAME or not self.BITTENSOR_WALLET_HOTKEY_NAME:
Expand Down
100 changes: 100 additions & 0 deletions neurons/validators/src/miner_jobs/machine_scrape.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,8 @@
NVML_CLOCK_VIDEO = 3
NVML_CLOCK_COUNT = 4

# Sentinel meaning "value not available" for unsigned 64-bit NVML fields; it is
# compared against usedGpuMemory in nvmlDeviceGetComputeRunningProcesses_v2.
# ctypes integer types do no overflow checking, so -1 wraps to 2**64 - 1.
NVML_VALUE_NOT_AVAILABLE_ulonglong = c_ulonglong(-1)


class struct_c_nvmlDevice_t(Structure):
pass # opaque handle
Expand Down Expand Up @@ -222,6 +224,21 @@ def __eq__(self, other):
return self.value == other.value


# ctypes record describing one process entry; an array of these is passed to
# nvmlDeviceGetComputeRunningProcesses_v2 to be filled in by the NVML library.
class c_nvmlProcessInfo_v2_t(_PrintableStructure):
    # Field order and C types define the binary layout - do not reorder.
    _fields_ = [
        ('pid', c_uint),
        ('usedGpuMemory', c_ulonglong),
        ('gpuInstanceId', c_uint),
        ('computeInstanceId', c_uint),
    ]
    # Per-field format string; presumably consumed by _PrintableStructure when
    # rendering the struct as text - confirm against its definition.
    _fmt_ = {'usedGpuMemory': "%d B"}


# The v3 name is a plain alias of the v2 layout.
c_nvmlProcessInfo_v3_t = c_nvmlProcessInfo_v2_t

# The unversioned name is a plain alias of the v3 name (and therefore of v2).
c_nvmlProcessInfo_t = c_nvmlProcessInfo_v3_t


def convertStrBytes(func):
'''
In python 3, strings are unicode instead of bytes, and need to be converted for ctypes
Expand Down Expand Up @@ -520,6 +537,57 @@ def nvmlDeviceGetUtilizationRates(handle):
return c_util


class nvmlFriendlyObject(object):
    """Plain attribute bag built from a mapping.

    Each key of the mapping given to the constructor becomes an instance
    attribute holding the corresponding value.
    """

    def __init__(self, dictionary):
        for name in dictionary:
            value = dictionary[name]
            setattr(self, name, value)

    def __str__(self):
        # Render exactly like the underlying attribute dictionary.
        return str(self.__dict__)


def nvmlStructToFriendlyObject(struct):
    """Copy the fields of a ctypes structure onto an nvmlFriendlyObject.

    Every entry of ``struct._fields_`` is read from ``struct`` by name.
    Values that are ``bytes`` are decoded to ``str``; all other values are
    passed through unchanged.
    """
    def _as_text(raw):
        # Only bytes need converting; everything else is kept as-is.
        return raw.decode() if isinstance(raw, bytes) else raw

    attributes = {}
    for field in struct._fields_:
        name = field[0]
        attributes[name] = _as_text(getattr(struct, name))
    return nvmlFriendlyObject(attributes)


def nvmlDeviceGetComputeRunningProcesses_v2(handle):
    """Return the compute processes NVML reports as running on a device.

    Args:
        handle: NVML device handle passed straight through to the library.

    Returns:
        A list of nvmlFriendlyObject instances, one per process, carrying the
        fields of c_nvmlProcessInfo_v2_t. ``usedGpuMemory`` is replaced by
        ``None`` when NVML returns the "not available" sentinel. The list is
        empty when the size probe succeeds outright.

    Raises:
        NVMLError: if the size probe returns anything other than success or
            "insufficient size". Errors from the second call are handled by
            _nvmlCheckReturn.
    """
    query = _nvmlGetFunctionPointer("nvmlDeviceGetComputeRunningProcesses_v2")

    # Probe with a zero-length buffer so NVML reports how many entries exist.
    c_count = c_uint(0)
    status = query(handle, byref(c_count), None)

    if status == NVML_SUCCESS:
        # A zero-length buffer was enough: no running processes.
        return []
    if status != NVML_ERROR_INSUFFICIENT_SIZE:
        raise NVMLError(status)

    # Typical case. Oversize the buffer in case more processes are created
    # between the probe and the real call.
    c_count.value = c_count.value * 2 + 5
    c_procs = (c_nvmlProcessInfo_v2_t * c_count.value)()

    status = query(handle, byref(c_count), c_procs)
    _nvmlCheckReturn(status)

    not_available = NVML_VALUE_NOT_AVAILABLE_ulonglong.value
    running = []
    for index in range(c_count.value):
        # Hand back a friendly object rather than the raw ctypes struct.
        entry = nvmlStructToFriendlyObject(c_procs[index])
        if entry.usedGpuMemory == not_available:
            # NVML could not report memory usage for this process (the
            # original note attributes this to WDDM on Windows).
            entry.usedGpuMemory = None
        running.append(entry)
    return running


def run_cmd(cmd):
proc = subprocess.run(cmd, shell=True, capture_output=True, check=False, text=True)
if proc.returncode != 0:
Expand Down Expand Up @@ -623,6 +691,28 @@ def get_file_content(path: str):
return content


def get_gpu_processes(pids: set):
    """Collect the cgroup description of each given process id.

    For every pid the contents of ``/proc/<pid>/cgroup`` are read through
    ``run_cmd``. This is best-effort: if reading fails for a pid, that pid is
    still reported, with ``info`` set to ``None``.

    Args:
        pids: process ids to look up; an empty or falsy value yields ``[]``.

    Returns:
        A list of ``{"pid": <pid>, "info": <str or None>}`` dictionaries, one
        per pid, in the iteration order of ``pids``.
    """
    if not pids:
        return []

    processes = []
    for pid in pids:
        try:
            info = run_cmd(f'cat /proc/{pid}/cgroup').strip()
        except Exception:
            # Only ordinary errors are absorbed. A bare ``except:`` would
            # also swallow KeyboardInterrupt and SystemExit.
            info = None
        processes.append({
            "pid": pid,
            "info": info
        })

    return processes


def get_machine_specs():
"""Get Specs of miner machine."""
data = {}
Expand All @@ -631,6 +721,8 @@ def get_machine_specs():
return data

data["gpu"] = {"count": 0, "details": []}
gpu_process_ids = set()

try:
libnvidia_path = get_libnvidia_ml_path()
if not libnvidia_path:
Expand Down Expand Up @@ -680,6 +772,12 @@ def get_machine_specs():
}
)

processes = nvmlDeviceGetComputeRunningProcesses_v2(handle)

# Collect process IDs
for proc in processes:
gpu_process_ids.add(proc.pid)

nvmlShutdown()
except Exception as exc:
# print(f'Error getting os specs: {exc}', flush=True)
Expand All @@ -699,6 +797,8 @@ def get_machine_specs():
except Exception as exc:
data["docker_cfg_scrape_error"] = repr(exc)

data['gpu_processes'] = get_gpu_processes(gpu_process_ids)

data["cpu"] = {"count": 0, "model": "", "clocks": []}
try:
lscpu_output = run_cmd("lscpu")
Expand Down
4 changes: 4 additions & 0 deletions neurons/validators/src/services/task_service.py
Original file line number Diff line number Diff line change
Expand Up @@ -226,6 +226,7 @@ async def docker_connection_check(
"executor_port": executor_info.port,
"ssh_username": executor_info.ssh_username,
"ssh_port": executor_info.ssh_port,
"version": settings.VERSION
}
),
)
Expand All @@ -246,6 +247,7 @@ async def docker_connection_check(
"ssh_port": executor_info.ssh_port,
"internal_port": internal_port,
"external_port": external_port,
"version": settings.VERSION,
}
context.set(f"[_docker_connection_check][{executor_name}]")

Expand Down Expand Up @@ -358,6 +360,7 @@ async def create_task(
"executor_port": executor_info.port,
"executor_ssh_username": executor_info.ssh_username,
"executor_ssh_port": executor_info.ssh_port,
"version": settings.VERSION,
}
try:
logger.info(_m("Start job on an executor", extra=get_extra_info(default_extra)))
Expand Down Expand Up @@ -956,6 +959,7 @@ async def _run_task(
"executor_port": executor_info.port,
"miner_hotkey": miner_hotkey,
"command": command[:100] + ("..." if len(command) > 100 else ""),
"version": settings.VERSION,
}
context.set(f"[_run_task][{executor_name}]")
logger.info(
Expand Down
1 change: 1 addition & 0 deletions neurons/validators/version.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
3.4.2

0 comments on commit 34bef99

Please sign in to comment.