Add aggregate resource usage graphs #198

Merged · 3 commits · Apr 2, 2024
do.sh: 10 changes (8 additions, 2 deletions)
@@ -406,13 +406,19 @@ function mine_data() {
     resource_usage_logs=$(find ${out_dir}/logs -name process-stats.json \
                           | grep -E 'ovn-tester|ovn-central-az[0-2]-')
     python3 ${topdir}/utils/process-stats.py \
-            resource-usage-report-central.html ${resource_usage_logs}
+            -o resource-usage-report-central.html ${resource_usage_logs}
 
     # Collecting stats only for 3 workers to avoid bloating the report.
     resource_usage_logs=$(find ${out_dir}/logs -name process-stats.json \
                           | grep ovn-scale | head -3)
     python3 ${topdir}/utils/process-stats.py \
-            resource-usage-report-worker.html ${resource_usage_logs}
+            -o resource-usage-report-worker.html ${resource_usage_logs}
+
+    # Preparing reports for aggregate resource usage.
+    resource_usage_logs=$(find ${out_dir}/logs -name process-stats.json)
+    python3 ${topdir}/utils/process-stats.py --aggregate \
+            -o resource-usage-report-aggregate.html ${resource_usage_logs}
 
     deactivate
 
     popd
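
For reference, the aggregate report can also be generated by hand with the same flags do.sh now passes; a hypothetical standalone invocation (the test_log directory is made up):

    python3 utils/process-stats.py --aggregate \
        -o resource-usage-report-aggregate.html \
        $(find test_log/logs -name process-stats.json)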
ovn-fake-multinode-utils/process-monitor.py: 11 changes (9 additions, 2 deletions)
@@ -22,7 +22,7 @@ def monitor(suffix: str, out_file: str, exit_file: str) -> None:
         for p in psutil.process_iter():
             if any(name in p.name() for name in process_names):
                 processes.add(p)
-            elif any(
+            elif p.name() != 'monitor' and any(
                 name in part
                 for part in p.cmdline()
                 for name in process_names
@@ -36,7 +36,14 @@ def monitor(suffix: str, out_file: str, exit_file: str) -> None:
         tme = time.time()
         for p in processes:
             try:
-                name = p.name() + "-" + suffix + "-" + str(p.pid)
+                name = p.name()
+                for arg in p.cmdline():
+                    if arg.endswith('.pid') or arg.endswith('.py'):
+                        name = arg.split('/')[-1].split('.')[0]
+                        break
+
+                name = name + "|" + suffix + "|" + str(p.pid)
+
                 # cpu_percent(seconds) call will block
                 # for the amount of seconds specified.
                 cpu = p.cpu_percent(0.5)
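
The hunk above keys each series by a name derived from the pidfile or script path, plus the host suffix and pid, joined with '|' so the aggregation step in process-stats.py can later split on that character. This also separates processes that share an executable name, such as the NB and SB ovsdb-server instances, which differ only in their pidfiles. A minimal sketch of the derivation; the command line, suffix, and pid below are made up for illustration:

    # Sketch of the naming logic from the hunk above; all inputs are
    # illustrative, not taken from a real ovn-fake-multinode run.
    def derive_name(proc_name, cmdline, suffix, pid):
        name = proc_name
        for arg in cmdline:
            if arg.endswith('.pid') or arg.endswith('.py'):
                # e.g. --pidfile=/var/run/ovn/ovnnb_db.pid -> ovnnb_db
                name = arg.split('/')[-1].split('.')[0]
                break
        return name + '|' + suffix + '|' + str(pid)

    print(derive_name(
        'ovsdb-server',
        ['ovsdb-server', '--pidfile=/var/run/ovn/ovnnb_db.pid'],
        'ovn-central-az0-1',
        123,
    ))
    # -> ovnnb_db|ovn-central-az0-1|123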
utils/process-stats.py: 116 changes (104 additions, 12 deletions)
@@ -1,45 +1,118 @@
+import argparse
 import json
+import logging
 import os
 import pandas as pd
 import plotly.express as px
 import sys
 
-from datetime import datetime
 from typing import Dict, List
 
 
+FORMAT = '%(asctime)s |%(levelname)s| %(message)s'
+logging.basicConfig(stream=sys.stdout, level=logging.INFO, format=FORMAT)
+log = logging.getLogger(__name__)
+
+
 def read_file(filename: str) -> Dict:
     with open(filename, "r") as file:
         return json.load(file)
 
 
-def resource_stats_generate(filename: str, data: Dict) -> None:
+def aggregated(df: pd.DataFrame) -> (pd.DataFrame, int):
+    column_names = list(df.columns)
+    value_name = column_names[2]
+
+    log.info(f'Pivot and interpolate {value_name} ...')
+    df = df.pivot_table(
+        index='Time', columns='Process', values=value_name, aggfunc='mean'
+    ).interpolate(method='time', limit_direction='both')
+
+    result = pd.DataFrame(index=df.index)
+    processes = {p.split('|')[0] for p in df.columns}
+
+    log.info(f'Aggregating {value_name} ...')
+    for p in processes:
+        df_filtered = df.filter(regex='^' + p)
+        result[p + '|sum'] = df_filtered.sum(axis=1)
+        result[p + '|mean'] = df_filtered.mean(axis=1)
+        result[p + '|max'] = df_filtered.max(axis=1)
+        result[p + '|min'] = df_filtered.min(axis=1)
+
+    result['ovn|sum'] = df.filter(regex=r'^ovn.*\|ovn-(central|scale).*').sum(
+        axis=1
+    )
+    ovn_max = result['ovn|sum'].astype('int').max()
+
+    result['ovs|sum'] = df.filter(regex=r'^ovs.*\|ovn-(central|scale).*').sum(
+        axis=1
+    )
+
+    result = result.astype('int').reset_index().melt(id_vars=['Time'])
+    result.columns = column_names
+    result = result.sort_values(['Process', 'Time'])
+
+    return result, ovn_max
+
+
+def resource_stats_generate(
+    filename: str, data: Dict, aggregate: bool
+) -> None:
     rss: List[List] = []
     cpu: List[List] = []
 
+    log.info('Preprocessing ...')
     for ts, time_slice in sorted(data.items()):
+        tme = pd.Timestamp.fromtimestamp(float(ts)).round('1s')
         for name, res in time_slice.items():
-            tme = datetime.fromtimestamp(float(ts))
             rss_mb = int(res['rss']) >> 20
             rss.append([tme, name, rss_mb])
             cpu.append([tme, name, float(res['cpu'])])
 
+    log.info('Creating DataFrame ...')
     df_rss = pd.DataFrame(rss, columns=['Time', 'Process', 'RSS (MB)'])
     df_cpu = pd.DataFrame(cpu, columns=['Time', 'Process', 'CPU (%)'])
 
+    if aggregate:
+        df_rss, max_sum_rss = aggregated(df_rss)
+        df_cpu, max_sum_cpu = aggregated(df_cpu)
+
+    log.info('Creating charts ...')
     rss_chart = px.line(
         df_rss,
         x='Time',
         y='RSS (MB)',
         color='Process',
-        title='Resident Set Size',
+        title=('Aggregate ' if aggregate else '') + 'Resident Set Size',
     )
     cpu_chart = px.line(
-        df_cpu, x='Time', y='CPU (%)', color='Process', title='CPU usage'
+        df_cpu,
+        x='Time',
+        y='CPU (%)',
+        color='Process',
+        title=('Aggregate ' if aggregate else '') + 'CPU usage',
     )
 
+    log.info(f'Writing HTML to {filename} ...')
     with open(filename, 'w') as report_file:
         report_file.write('<html>')
+        if aggregate:
+            report_file.write(
+                f'''
+                <table border="1" class="dataframe">
+                  <tbody>
+                    <tr>
+                      <td>Max(Sum(OVN RSS))</td>
+                      <td> {max_sum_rss} MB</td>
+                    </tr>
+                    <tr>
+                      <td>Max(Sum(OVN CPU))</td>
+                      <td> {max_sum_cpu} %</td>
+                    </tr>
+                  </tbody>
+                </table>
+                '''
+            )
         report_file.write(
             rss_chart.to_html(
                 full_html=False,
@@ -60,17 +133,36 @@ def resource_stats_generate(filename: str, data: Dict) -> None:
 
 
 if __name__ == '__main__':
-    if len(sys.argv) < 3:
-        print(f'Usage: {sys.argv[0]} output-file input-file [input-file ...]')
-        sys.exit(1)
+    parser = argparse.ArgumentParser(
+        description='Generate resource usage charts.'
+    )
+    parser.add_argument(
+        '--aggregate', action='store_true', help='generate aggregate charts'
+    )
+    parser.add_argument(
+        '-o', '--output', required=True, help='file to write an HTML result'
+    )
+    parser.add_argument(
+        'input_files',
+        metavar='input-file',
+        type=str,
+        nargs='+',
+        help='JSON file with recorded process statistics',
+    )
+
+    args = parser.parse_args()
 
-    if os.path.isfile(sys.argv[1]):
-        print(f'Output file {sys.argv[1]} already exists')
+    if os.path.isfile(args.output):
+        log.fatal(f'Output file {args.output} already exists')
         sys.exit(2)
 
+    log.info(f'Processing stats from {len(args.input_files)} files.')
+
+    log.info('Reading ...')
     data: Dict = {}
-    for f in sys.argv[2:]:
+    for f in args.input_files:
         d = read_file(f)
         data.update(d)
 
-    resource_stats_generate(sys.argv[1], data)
+    resource_stats_generate(args.output, data, args.aggregate)
+    log.info('Done.')
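
The heart of the new --aggregate mode is the pivot-and-interpolate step in aggregated(): samples from different hosts rarely land on exactly the same timestamp, so the long-format table is pivoted to Time x Process and the resulting gaps are filled by time-based interpolation before summing across columns. A toy illustration with made-up samples and process labels:

    import pandas as pd

    # Two processes sampled at slightly different times; pivoting
    # leaves NaNs, which time-based interpolation fills so that a
    # row-wise sum is meaningful.
    df = pd.DataFrame(
        [
            [pd.Timestamp('2024-04-02 10:00:00'), 'ovn-northd|h1|1', 100],
            [pd.Timestamp('2024-04-02 10:00:02'), 'ovn-northd|h1|1', 120],
            [pd.Timestamp('2024-04-02 10:00:01'), 'ovsdb-server|h1|2', 50],
        ],
        columns=['Time', 'Process', 'RSS (MB)'],
    )
    wide = df.pivot_table(
        index='Time', columns='Process', values='RSS (MB)', aggfunc='mean'
    ).interpolate(method='time', limit_direction='both')
    # ovn-northd is interpolated to 110 at 10:00:01 and ovsdb-server's
    # edge gaps are padded, so the sums come out as 150, 160, 170.
    print(wide.sum(axis=1))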