diff --git a/Makefile b/Makefile
index 1ba0b43..d402199 100644
--- a/Makefile
+++ b/Makefile
@@ -10,7 +10,7 @@ BINARY_COLLECT := perf-collect
BINARY_POSTPROCESS := perf-postprocess
default: dist
-.PHONY: test default dist format format_check style_error_check check dist/version_file dist/$(SOURCE_PACKAGE)
+.PHONY: test default dist format format_check style_error_check pytype check dist/version_file dist/$(SOURCE_PACKAGE)
clean_dir:
rm -rf build/*
@@ -78,6 +78,9 @@ style_error_check:
# ignore long lines and conflicts with black, i.e., black wins
flake8 *.py src --ignore=E501,W503,E203
-check: format_check style_error_check
+pytype: *.py src/*.py
+ pytype ./*.py
+
+check: format_check style_error_check pytype
dist: check dist/$(PACKAGE_EXTERNAL)
diff --git a/_version.txt b/_version.txt
index 0b1f1ed..fd9d1a5 100644
--- a/_version.txt
+++ b/_version.txt
@@ -1 +1 @@
-1.2.13
+1.2.14
diff --git a/perf-collect.py b/perf-collect.py
index 2c552bf..bd949ec 100644
--- a/perf-collect.py
+++ b/perf-collect.py
@@ -5,6 +5,7 @@
# SPDX-License-Identifier: BSD-3-Clause
###########################################################################################################
+import json
import logging
import os
import platform
@@ -39,7 +40,7 @@ def write_metadata(
muxinterval,
thread,
socket,
- metadata_only=False,
+ psi,
):
tsc_freq = str(perf_helpers.get_tsc_freq())
data = ""
@@ -47,11 +48,6 @@ def write_metadata(
validate_file(outcsv)
with open(outcsv, "r") as original:
time_stamp = original.readline()
- if metadata_only and time_stamp.startswith("### META DATA ###"):
- logging.warning(
- "Not prepending metadata, already present in %s " % (outcsv)
- )
- return
data = original.read()
with open(outcsv, "w") as modified:
modified.write("### META DATA ###,\n")
@@ -120,6 +116,7 @@ def write_metadata(
modified.write("cpusets" + cpusets + ",\n")
modified.write("Percore mode," + threadmode + ",\n")
modified.write("Persocket mode," + socketmode + ",\n")
+ modified.write("PSI," + json.dumps(psi) + "\n")
modified.write("PerfSpect version," + perf_helpers.get_tool_version() + ",\n")
modified.write("### PERF EVENTS ###" + ",\n")
for e in collection_events:
@@ -137,6 +134,30 @@ def write_metadata(
modified.write(data)
+def get_psi():
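+    # parse the cumulative "total" stall time (us) from the "some" line of each /proc/pressure file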
+ psi = []
+ for resource in ["cpu", "memory", "io"]:
+ with open("/proc/pressure/" + resource) as f:
+ psi.append(f.readline().split()[4].split("=")[1])
+ return psi
+
+
+def supports_psi():
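+    # PSI counts as supported only if all three /proc/pressure files can be opened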
+ psi = []
+ for resource in ["cpu", "memory", "io"]:
+ try:
+ with open("/proc/pressure/" + resource) as _:
+ psi.append(resource)
+ except Exception:
+ pass
+ if len(psi) == 3:
+ logging.info("PSI metrics supported")
+ return True
+ else:
+ logging.info("PSI metrics not supported")
+ return False
+
+
def resource_path(relative_path):
"""Get absolute path to resource, works for dev and for PyInstaller"""
base_path = getattr(sys, "_MEIPASS", os.path.dirname(os.path.abspath(__file__)))
@@ -229,17 +250,21 @@ def validate_file(fname):
nmi_watchdog = perf_helpers.disable_nmi_watchdog()
initial_pmus = perf_helpers.pmu_contention_detect()
interval = 5000
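+    # PSI sampling is enabled only in system, thread, and socket run modes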
+ collect_psi = False
if args.thread:
logging.info("Run mode: thread")
+ collect_psi = supports_psi()
elif args.socket:
logging.info("Run mode: socket")
+ collect_psi = supports_psi()
elif args.pid is not None:
logging.info("Run mode: pid")
elif args.cid is not None:
logging.info("Run mode: cid")
else:
logging.info("Run mode: system")
+ collect_psi = supports_psi()
if args.muxinterval > 1000:
crash("Input argument muxinterval is too large, max is [1s or 1000ms]")
@@ -279,6 +304,25 @@ def validate_file(fname):
else:
crash("Unknown application type")
+ # get perf events to collect
+ sys_devs = perf_helpers.get_sys_devices()
+ if (
+ "uncore_cha" not in sys_devs
+ and "uncore_cbox" not in sys_devs
+ and "uncore_upi" not in sys_devs
+ and "uncore_qpi" not in sys_devs
+ and "uncore_imc" not in sys_devs
+ ):
+ logging.info("disabling uncore (possibly in a vm?)")
+ have_uncore = False
+ if arch == "icelake":
+ logging.warning(
+ "Due to lack of vPMU support, TMA L1 events will not be collected"
+ )
+ if arch == "sapphirerapids" or arch == "emeraldrapids":
+ logging.warning(
+ "Due to lack of vPMU support, TMA L1 & L2 events will not be collected"
+ )
events, collection_events = prep_events.prepare_perf_events(
eventfile,
(
@@ -305,26 +349,6 @@ def validate_file(fname):
if args.cid is not None:
cgroups = perf_helpers.get_cgroups(args.cid)
- # get perf events to collect
- sys_devs = perf_helpers.get_sys_devices()
- if (
- "uncore_cha" not in sys_devs
- and "uncore_cbox" not in sys_devs
- and "uncore_upi" not in sys_devs
- and "uncore_qpi" not in sys_devs
- and "uncore_imc" not in sys_devs
- ):
- logging.info("disabling uncore (possibly in a vm?)")
- have_uncore = False
- if arch == "icelake":
- logging.warning(
- "Due to lack of vPMU support, TMA L1 events will not be collected"
- )
- if arch == "sapphirerapids" or arch == "emeraldrapids":
- logging.warning(
- "Due to lack of vPMU support, TMA L1 & L2 events will not be collected"
- )
-
if args.thread or args.socket or args.pid is not None or args.cid is not None:
logging.info("Not collecting uncore events in this run mode")
@@ -367,20 +391,33 @@ def validate_file(fname):
if args.verbose:
logging.info(cmd)
try:
+ psi = []
start = time.time()
- subprocess.call(perfargs) # nosec
+ perf = subprocess.Popen(perfargs) # nosec
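+        # poll perf and take a PSI sample every interval milliseconds until it exits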
+ while perf.poll() is None:
+ if collect_psi:
+ psi.append(get_psi())
+ time.sleep(interval / 1000)
end = time.time()
if end - start < 7:
logging.warning(
"PerfSpect was run for a short duration, some events might be zero or blank because they never got scheduled"
)
- logging.info("Collection complete! Calculating TSC frequency now")
+
+ except subprocess.SubprocessError as e:
+ perf.kill() # type: ignore
+ crash("Failed to start perf\n" + str(e))
except KeyboardInterrupt:
- logging.info("Collection stopped! Caculating TSC frequency now")
- except Exception:
- crash("perf encountered errors")
+ perf.kill() # type: ignore
+ except Exception as e:
+ perf.kill() # type: ignore
+ crash(str(e) + "\nperf encountered errors")
+
+ logging.info("Collection complete!")
cpuid_info = perf_helpers.get_cpuid_info(procinfo)
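+    # take one final PSI sample so postprocessing can compute per-interval deltas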
+ if collect_psi:
+ psi.append(get_psi())
write_metadata(
args.outcsv,
collection_events,
@@ -390,7 +427,7 @@ def validate_file(fname):
args.muxinterval,
args.thread,
args.socket,
- False,
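+        # transpose per-sample [cpu, memory, io] readings into three per-resource series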
+ list(map(list, zip(*psi))),
)
os.chmod(args.outcsv, 0o666) # nosec
diff --git a/perf-postprocess.py b/perf-postprocess.py
index b2eb3cc..0208829 100644
--- a/perf-postprocess.py
+++ b/perf-postprocess.py
@@ -309,6 +309,8 @@ def get_metadata_as_dict(meta_data_lines):
meta_data["SOCKET_CORES"] = []
cores = ((line.split("\n")[0]).split(",")[1]).split(";")[:-1]
meta_data["SOCKET_CORES"].append(cores)
+ elif line.startswith("PSI"):
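+            # the PSI row holds a JSON list of three cumulative stall series (cpu, memory, io), or []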
+ meta_data["PSI"] = json.loads(line.split("PSI,")[1])
return meta_data
@@ -623,8 +625,14 @@ def write_html(time_series_df, perf_mode, out_file_path):
["MEMORY", "metric_TMA_..Memory_Bound(%)"],
["BADSPECULATION", "metric_TMA_Bad_Speculation(%)"],
["RETIRING", "metric_TMA_Retiring(%)"],
+ ["PSI_CPU", "cpu stall (us)"],
+ ["PSI_MEM", "memory stall (us)"],
+ ["PSI_IO", "io stall (us)"],
]:
- html = html.replace(number[0], str(avg.loc[number[1], 0]))
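+        # some rows (e.g., the PSI stalls) may be absent; fall back to 0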
+ try:
+ html = html.replace(number[0], str(avg.loc[number[1], 0]))
+ except Exception:
+ html = html.replace(number[0], "0")
with open(
os.path.splitext(out_file_path)[0] + ".html", "w", encoding="utf-8"
@@ -827,6 +835,23 @@ def generate_metrics(
len(errors["MISSING EVENTS"]) > 0 or len(errors["ZERO DIVISION"]) > 0
):
crash("Failing due to postprocessing errors")
+
+ # add psi
+    if "PSI" in meta_data and len(meta_data["PSI"]) > 0 and perf_mode == Mode.System:
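+        # PSI samples are cumulative totals; convert them to per-interval stall deltas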
+ psi_len = range(len(time_series_df.columns))
+ time_series_df.loc["cpu stall (us)"] = [
+ int(meta_data["PSI"][0][x + 1]) - int(meta_data["PSI"][0][x])
+ for x in psi_len
+ ]
+ time_series_df.loc["memory stall (us)"] = [
+ int(meta_data["PSI"][1][x + 1]) - int(meta_data["PSI"][1][x])
+ for x in psi_len
+ ]
+ time_series_df.loc["io stall (us)"] = [
+ int(meta_data["PSI"][2][x + 1]) - int(meta_data["PSI"][2][x])
+ for x in psi_len
+ ]
+
generate_metrics_time_series(time_series_df, perf_mode, out_file_path)
generate_metrics_averages(time_series_df, perf_mode, out_file_path)
if perf_mode == Mode.System:
diff --git a/requirements.txt b/requirements.txt
index d4ee382..9afde2e 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,5 +1,6 @@
black
flake8
+pytype
simpleeval
pandas
plotly
diff --git a/src/base.html b/src/base.html
index 8db599a..981b766 100644
--- a/src/base.html
+++ b/src/base.html
@@ -276,6 +276,7 @@
variant="scrollable"
>
+
@@ -380,6 +381,84 @@
+
+
+
+ Pressure Stall Information (PSI)
+
+
+ Your workload spent an average of {(PSI_MEM + PSI_CPU + PSI_IO) / 50000}% of its time stalled waiting on a hardware resource.
+
+
+ -
+
+ Stall is the amount of time (in microseconds) that any task was waiting on a given resource.
+
+
+ -
+
+ When CPU, memory, or IO devices are contended, workloads experience latency spikes and throughput losses, and run the risk of OOM kills.
+
+
+ -
+
+ Low or zero PSI means you are paying for capacity you don't use. You might be able to downsize your server.
+
+
+ -
+
+ High PSI means you should optimize your code or upgrade your server, starting with whichever resource shows the largest stall.
+
+
+ -
+
+ Learn more here
+
+
+
+
+
+
+
+
+
+
@@ -423,7 +502,7 @@
@@ -467,7 +546,7 @@
diff --git a/src/perf_helpers.py b/src/perf_helpers.py
index 57581de..326912d 100644
--- a/src/perf_helpers.py
+++ b/src/perf_helpers.py
@@ -167,8 +167,10 @@ def disable_nmi_watchdog():
else:
logging.info("nmi_watchdog disabled!")
return nmi_watchdog_status
- except (subprocess.CalledProcessError, ValueError) as e:
+ except subprocess.CalledProcessError as e:
crash(e.output + "\nFailed to disable nmi_watchdog.")
+ except ValueError as e:
+        crash(f"Failed to disable nmi_watchdog: {e}")
# enable nmi watchdog
@@ -182,8 +184,10 @@ def enable_nmi_watchdog():
logging.warning("Failed to re-enable nmi_watchdog!")
else:
logging.info("nmi_watchdog enabled!")
- except (subprocess.CalledProcessError, ValueError) as e:
+ except subprocess.CalledProcessError as e:
logging.warning(e.output + "\nFailed to re-enable nmi_watchdog!")
+ except ValueError as e:
+ logging.warning(f"Failed to re-enable nmi_watchdog: {e}")
# set/reset perf event mux interval for pmu events