From a03ac88d0d36d5f550bab17e02633a5e370ed83e Mon Sep 17 00:00:00 2001 From: root Date: Wed, 18 Oct 2023 15:09:53 +0200 Subject: [PATCH] adding a bit of documentation --- .../power_measure/experiment.py | 36 ++++++++++++++----- .../power_measure/prometheus_client.py | 8 +++-- .../power_measure/rapl_power.py | 10 ++++-- 3 files changed, 40 insertions(+), 14 deletions(-) diff --git a/deep_learning_power_measure/power_measure/experiment.py b/deep_learning_power_measure/power_measure/experiment.py index 8326487..1290c86 100644 --- a/deep_learning_power_measure/power_measure/experiment.py +++ b/deep_learning_power_measure/power_measure/experiment.py @@ -110,6 +110,19 @@ def get_pid_list(current_pid, parent_pid=None): pid_list.remove(queue_pid) return pid_list +def collect_all(p,d): + for c in p.children(): + name = c.username()+"_"+c.name() + if name not in d: + d[name]= [] + d[name].append(c.pid) + collect_all(c,d) + +def get_pid_list_all(): + p = psutil.Process(1) + collect_all(p,d) + return d + def interpolate(metric1, metric2): """ return two new metrics so that metric1 and metric2 have the same range of dates @@ -362,13 +375,14 @@ def monitor_machine(self, pid_args=None, parent_pid = 1, period=1, measurement_p metrics_gpu = gpu_power.get_nvidia_gpu_power(pid_list) self.log_usage(metrics_gpu, pid_list) if self.rapl_available: - metrics['cpu'] = rapl_power.get_metrics(pid_list, memory_usage=True, rapl=True, cpu_usage=True, period=period) + metrics['cpu'] = rapl_power.get_metrics(pid_list, memory_usage=True, measure_rapl=True, measure_cpu_usage=True, period=period) else: - metrics['cpu'] = rapl_power.get_metrics(pid_list, memory_usage=True, rapl=False, cpu_usage=True, period=period) + metrics['cpu'] = rapl_power.get_metrics(pid_list, memory_usage=True, measure_rapl=False, measure_cpu_usage=True, period=period) if self.nvidia_available: per_gpu_attributable_power, _ = self.allocate_gpu_power(metrics_gpu['per_gpu_power_draw']) metrics_gpu['per_gpu_attributable_power'] = 
per_gpu_attributable_power metrics['gpu'] = metrics_gpu + summarize_metrics() self.db_driver.save_power_metrics(metrics) def measure(self, queue, pid_args, current_pid = None, period=1, measurement_period=2): @@ -400,6 +414,7 @@ def measure(self, queue, pid_args, current_pid = None, period=1, measurement_per # launch in separate threads because they won't have the same frequency metrics_gpu = gpu_power.get_nvidia_gpu_power(pid_list) self.log_usage(metrics_gpu, pid_list) + metrics['temperature'] = psutil.sensors_temperatures() if self.rapl_available: metrics['cpu'] = rapl_power.get_metrics(pid_list, memory_usage=True, measure_rapl=True, measure_cpu_usage=True, period=period) else: @@ -613,7 +628,7 @@ def total_power_draw(self): abs_nvidia_power = self.total_('nvidia_draw_absolute') return total_intel_power + abs_nvidia_power - def display_curves(self, metric_names): + def display_curves(self, metric_names, saveto=None): """ Input: metric_names : list of metric names : [metric_name1, metric_name2,...] 
@@ -638,8 +653,10 @@ def display_curves(self, metric_names): ax.format_xdata = mdates.DateFormatter('%H:%M:%S') plt.xticks(rotation=45) plt.legend() - plt.show() - + if saveto: + plt.savefig(saveto) + else: + plt.show() def display_2_curves(self, metric_name1, metric_name2): """ @@ -649,14 +666,14 @@ def display_2_curves(self, metric_name1, metric_name2): #if curve is None: # raise Exception('invalide metric name') if isinstance(curve,list): - df = pd.DataFrame(curve) + df = pd.DataFrame( [item for sublist in curve for item in sublist] ) df['date_datetime'] = [ datetime.datetime.fromtimestamp(d) for d in df['date'] ] df['date_datetime'] = pd.to_datetime(df['date_datetime']) ax.plot(df['date_datetime'], df['value'], label=metric_name1) ax.set_ylabel(metric_name1, color="blue",fontsize=14) else: for device_id, metric in curve.items(): - df = pd.DataFrame(metric) + df = pd.DataFrame( [item for sublist in metric for item in sublist] ) df['date_datetime'] = [ datetime.datetime.fromtimestamp(d) for d in df['date'] ] df['date_datetime'] = pd.to_datetime(df['date_datetime']) ax.plot(df['date_datetime'], df['value'],label=metric_name1+":"+device_id) @@ -666,14 +683,14 @@ def display_2_curves(self, metric_name1, metric_name2): ax2 = ax.twinx() curve = self.get_curve(metric_name2) if isinstance(curve,list): - df = pd.DataFrame(curve) + df = pd.DataFrame( [item for sublist in curve for item in sublist] ) df['date_datetime'] = [ datetime.datetime.fromtimestamp(d) for d in df['date'] ] df['date_datetime'] = pd.to_datetime(df['date_datetime']) ax2.plot(df['date_datetime'], df['value'], label=metric_name2, color="red") ax2.set_ylabel(metric_name2, color="red",fontsize=14) else: for device_id, metric in curve.items(): - df = pd.DataFrame(metric) + df = pd.DataFrame( [item for sublist in metric for item in sublist] ) df['date_datetime'] = [ datetime.datetime.fromtimestamp(d) for d in df['date'] ] df['date_datetime'] = pd.to_datetime(df['date_datetime']) ax2.plot(df['date_datetime'], 
df['value'],label=metric_name2+":"+device_id, color='red') @@ -723,6 +740,7 @@ def get_summary(self, start=None, end=None): summary['cpu']['mem_use_abs'] = self.average_('per_process_mem_use_abs',start=start, end=end) summary['cpu']['mem_use_uss'] = self.average_('per_process_mem_use_uss',start=start, end=end) summary['cpu']['absolute_cpu_time_per_pid'] = self.total_('absolute_cpu_time_per_pid',start=start, end=end) + summary['cpu']['average_cpu_use'] = self.average_('per_process_cpu_uses',start=start, end=end) if self.gpu_metrics is not None: summary['gpu'] = {} summary['gpu']['abs_nvidia_power'] = self.total_('nvidia_draw_absolute',start=start, end=end) diff --git a/deep_learning_power_measure/power_measure/prometheus_client.py b/deep_learning_power_measure/power_measure/prometheus_client.py index 112734f..7c9c9a6 100644 --- a/deep_learning_power_measure/power_measure/prometheus_client.py +++ b/deep_learning_power_measure/power_measure/prometheus_client.py @@ -11,8 +11,12 @@ class PrometheusClient(): it serves the data with a flask app on the /metrics route Experiment will call this class """ - def __init__(self): + def __init__(self, port=None): #self.app = Flask(__name__) + if port == None: + self.port = 5001 + else: + self.port = port self.wattemeter_exec = None self.gauges = {} for metric, description in metric_metadata.items(): @@ -60,4 +64,4 @@ def save_wattmeter_metrics(self): logging.warning('You are trying to log wattmeter metric to prometheus, but this has not been implemented. Skipping.') def run(self): - self.app.run(host = 'localhost', port=5001) \ No newline at end of file + self.app.run(host = 'localhost', port=self.port) \ No newline at end of file diff --git a/deep_learning_power_measure/power_measure/rapl_power.py b/deep_learning_power_measure/power_measure/rapl_power.py index 701fff4..255742d 100644 --- a/deep_learning_power_measure/power_measure/rapl_power.py +++ b/deep_learning_power_measure/power_measure/rapl_power.py @@ -7,9 +7,8 @@ from . 
import rapl def is_rapl_compatible(): - """ - Check if rapl logs are available on this machine. - """ + """Check if rapl logs are available on this machine.""" + if not os.path.isdir(rapl.rapl_dir): return (False, "cannot find rapl directory in "+rapl.rapl_dir) if not (os.path.isfile('/sys/class/powercap/intel-rapl/intel-rapl:0/energy_uj') and os.access('/sys/class/powercap/intel-rapl/intel-rapl:0/energy_uj', os.R_OK)): @@ -188,6 +187,11 @@ def get_percent_uses(infos1, infos2, zombies, process_list): def get_cpu_uses(process_list, period=2.0): """Extracts the relative number of cpu clock attributed to each process + + Compute, for each process p in process_list, over the period: + - relative cpu usage : ( cpu time of p ) / (cpu time of the whole system) + - absolute cpu usage : cpu time of p + Args: process_list : list of process [pid1, pid2,...] for which the cpu use will be measured