-
Notifications
You must be signed in to change notification settings - Fork 29
/
Copy pathgpumon.py
111 lines (91 loc) · 3.16 KB
/
gpumon.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
import argparse
import os
import signal
import subprocess
import time
from queue import Empty, Queue
from threading import Thread
import numpy as np
import wandb
# Command-line interface: the W&B run name and project are mandatory,
# the run group is optional.
parser = argparse.ArgumentParser()
for flag, extra in (
    ("--wbname", {"required": True}),
    ("--wbproj", {"required": True}),
    ("--wbgroup", {"default": None}),
):
    parser.add_argument(flag, type=str, **extra)
args = parser.parse_args()

# Start the W&B run with wandb's own system-stats collection switched off.
# `group` is only forwarded when it was given on the command line, so the
# call made is the same as before in both cases.
init_kwargs = {"project": args.wbproj}
if args.wbgroup is not None:
    init_kwargs["group"] = args.wbgroup
init_kwargs["name"] = args.wbname
init_kwargs["settings"] = wandb.Settings(_disable_stats=True)
wandb.init(**init_kwargs)
def enqueue_output(out, queue):
    """Forward every line read from *out* onto *queue*, then close *out*.

    Lines are read with ``out.readline()`` until it returns ``b""`` (end of
    a binary stream) and are put on the queue unmodified, trailing newline
    included. Intended to run in a background thread so that a blocking
    pipe read never stalls the consumer of the queue.

    Args:
        out: Binary file-like object providing ``readline()`` and ``close()``.
        queue: Object providing ``put(item)``, e.g. ``queue.Queue``.

    The stream is closed even when reading or enqueuing raises, so the
    underlying pipe is not leaked; the exception itself still propagates.
    """
    try:
        for line in iter(out.readline, b""):
            queue.put(line)
    finally:
        out.close()
# Make this script the leader of a new process group before any child is
# started.
os.setpgrp()

# Clock monitor: each non-header output line is consumed further down as
# whitespace-separated fields, with the GPU index first and the clock
# reading last.
#
# The command is passed as an argument list (no shell) so that
# `clock_proc.pid` is nvidia-smi's own pid; with `shell=True` the pid is the
# shell's, and a signal sent to it may never reach nvidia-smi. `bufsize=1`
# was dropped because line buffering is only supported for text-mode pipes
# and triggers a RuntimeWarning on a binary pipe.
clock_proc = subprocess.Popen(
    ["nvidia-smi", "dmon", "-s", "c"],
    stdout=subprocess.PIPE,
)
# NOTE: the original also set `clock_proc.daemon = True`; `Popen` objects have
# no daemon concept, so that assignment had no effect and was removed.
time.sleep(0.5)

# Throttle-reason fields queried from nvidia-smi; the order here is the
# column order of the CSV output after the leading GPU index.
throttle_reasons = [
    "clocks_throttle_reasons.gpu_idle",
    "clocks_throttle_reasons.applications_clocks_setting",
    "clocks_throttle_reasons.sw_power_cap",
    "clocks_throttle_reasons.sw_thermal_slowdown",
    "clocks_throttle_reasons.hw_slowdown",
    "clocks_throttle_reasons.hw_thermal_slowdown",
    "clocks_throttle_reasons.hw_power_brake_slowdown",
    "clocks_throttle_reasons.sync_boost",
]

# Throttle monitor: one headerless CSV row per GPU, repeated every second.
throttle_proc = subprocess.Popen(
    [
        "nvidia-smi",
        f"--query-gpu=index,{','.join(throttle_reasons)}",
        "--format=csv,noheader",
        "--loop=1",
    ],
    stdout=subprocess.PIPE,
)

# Single queue that receives the raw output lines of both processes.
q = Queue()

# One daemon reader thread per pipe, so a blocked read neither stalls the
# main loop nor keeps the interpreter alive at exit.
clock_thread = Thread(target=enqueue_output, args=(clock_proc.stdout, q), daemon=True)
throttle_thread = Thread(target=enqueue_output, args=(throttle_proc.stdout, q), daemon=True)
clock_thread.start()
throttle_thread.start()
# A window is averaged and logged once some GPU has collected more than this
# many clock samples.
SAMPLES_PER_LOG = 30

# Per-GPU sample buffers for the current window, keyed by GPU index so that
# any number of GPUs is handled (previously hard-coded to exactly two).
throttles = {}  # gpu index -> list of 0/1 flag rows, one flag per throttle reason
clocks = {}  # gpu index -> list of integer clock readings

try:
    # Run while the clock monitor is alive, then drain what is still queued.
    while clock_proc.poll() is None or not q.empty():
        try:
            # Wait briefly for a line instead of spinning on an empty queue.
            line = q.get(timeout=0.1)
        except Empty:
            continue

        line = line.decode("utf-8").strip()
        # Skip blank lines and the '#'-prefixed header lines.
        if not line or "#" in line:
            continue

        try:
            if "," in line:
                # CSV row from the throttle query: GPU index, then one field
                # per throttle reason. A field containing "Not" counts as 0,
                # anything else as 1.
                raw = line.split(",")
                gpu = int(raw[0])
                bits = [0 if "Not" in a else 1 for a in raw[1:]]
                if len(bits) != len(throttle_reasons):
                    continue  # malformed row; would corrupt the per-reason means
                throttles.setdefault(gpu, []).append(bits)
            else:
                # Whitespace-separated row from the clock monitor: GPU index
                # first, clock reading last.
                raw = line.split()
                gpu = int(raw[0])
                clock = int(raw[-1])
                clocks.setdefault(gpu, []).append(clock)
        except ValueError:
            # Non-numeric field (e.g. a placeholder for an unsupported
            # value): drop this line rather than abort monitoring.
            continue

        if any(len(samples) > SAMPLES_PER_LOG for samples in clocks.values()):
            # Average each GPU on its own. GPUs rarely hold the same number
            # of samples at flush time, and stacking unequal-length lists
            # into one array fails on current NumPy.
            log_dict = {}
            for gpu in sorted(clocks):
                log_dict[f"gpu.{gpu}.clock.speed"] = np.mean(clocks[gpu])
                gpu_throttles = throttles.get(gpu)
                if gpu_throttles:
                    reason_means = np.mean(np.array(gpu_throttles), axis=0)
                    for reason, value in zip(throttle_reasons, reason_means):
                        log_dict[f"gpu.{gpu}.{reason}"] = value

            print("\n".join([k.ljust(80) + str(v) for k, v in log_dict.items()]))
            try:
                wandb.log(log_dict)
            except Exception as err:
                # Logging is best-effort: report the failure and keep going.
                print(f"wandb.log failed: {err!r}")

            # Start a fresh window.
            throttles = {}
            clocks = {}
finally:
    # Stop both monitors on normal exit, on error and on Ctrl-C alike.
    # Only processes that are still running are signalled, so no signal is
    # sent to a pid that has already been reaped.
    for proc in (throttle_proc, clock_proc):
        if proc.poll() is None:
            proc.send_signal(signal.SIGINT)