From bb54833668f47ded4167020e46b518ab493a6b01 Mon Sep 17 00:00:00 2001 From: hilldani Date: Thu, 13 Apr 2023 14:13:17 -0700 Subject: [PATCH] improve logging and change output file path and ownership (#26) --- .gitignore | 3 +- README.md | 4 +- _version.txt | 2 +- events/spr.txt | 4 +- perf-collect.py | 90 ++++++++++++++++---------------------- perf-postprocess.py | 55 ++++++++++++++--------- src/common.py | 24 ++++++++-- src/perf_helpers.py | 36 +++------------ src/prepare_perf_events.py | 25 ++++------- src/report.py | 4 +- 10 files changed, 114 insertions(+), 133 deletions(-) diff --git a/.gitignore b/.gitignore index 1d112fd..182c049 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,5 @@ build/* dist/* src/libtsc.so -__pycache__ \ No newline at end of file +__pycache__ +*.log \ No newline at end of file diff --git a/README.md b/README.md index fe9aa4a..9f6811d 100644 --- a/README.md +++ b/README.md @@ -19,7 +19,7 @@ perf-collect: Collects harware events perf-postprocess: Calculates high level metrics from hardware events -- `perf-postprocess -r results/perfstat.csv` +- `./perf-postprocess` ## Quick start (requires perf installed) @@ -27,7 +27,7 @@ perf-postprocess: Calculates high level metrics from hardware events wget -qO- https://github.com/intel/PerfSpect/releases/latest/download/perfspect.tgz | tar xvz cd perfspect sudo ./perf-collect --timeout 10 -sudo ./perf-postprocess -r results/perfstat.csv --html perfstat.html +./perf-postprocess ``` ## Deploy in Kubernetes diff --git a/_version.txt b/_version.txt index 3c43790..c04c650 100644 --- a/_version.txt +++ b/_version.txt @@ -1 +1 @@ -1.2.6 +1.2.7 diff --git a/events/spr.txt b/events/spr.txt index 7eb3a04..4c18970 100644 --- a/events/spr.txt +++ b/events/spr.txt @@ -58,7 +58,7 @@ cpu/event=0x00,umask=0x85,period=10000003,name='PERF_METRICS.BRANCH_MISPREDICTS' cpu/event=0x00,umask=0x84,period=10000003,name='PERF_METRICS.HEAVY_OPERATIONS'/; cpu/event=0xad,umask=0x10,period=1000003,name='INT_MISC.UOP_DROPPING'/, -cpu/event=0xad,umask=0x40,frontend=0x7,period=1000003,name='INT_MISC.UNKNOWN_BRANCH_CYCLES'/, +cpu/event=0xad,umask=0x40,period=1000003,name='INT_MISC.UNKNOWN_BRANCH_CYCLES'/, cpu/event=0xa6,umask=0x21,cmask=0x05,period=2000003,name='EXE_ACTIVITY.BOUND_ON_LOADS'/, cpu-cycles; @@ -89,7 +89,7 @@ cpu/event=0x48,umask=0x01,period=1000003,name='L1D_PEND_MISS.PENDING'/, cpu/event=0x03,umask=0x88,period=100003,name='LD_BLOCKS.NO_SR'/, cpu-cycles; -cpu/event=0xc2,umask=0x04,frontend=0x8,period=2000003,name='UOPS_RETIRED.MS'/, +cpu/event=0xc2,umask=0x04,period=2000003,name='UOPS_RETIRED.MS'/, cpu/event=0xec,umask=0x02,period=2000003,name='CPU_CLK_UNHALTED.DISTRIBUTED'/, cpu-cycles; diff --git a/perf-collect.py b/perf-collect.py index 69fe6e0..5ce4e36 100644 --- a/perf-collect.py +++ b/perf-collect.py @@ -5,11 +5,9 @@ # SPDX-License-Identifier: BSD-3-Clause ########################################################################################################### -from __future__ import print_function import logging import os import platform -import re import sys import subprocess # nosec import shlex # nosec @@ -17,14 +15,8 @@ from src import perf_helpers from src import prepare_perf_events as prep_events from src.common import crash +from src import common -logging.basicConfig( - format="%(asctime)s %(levelname)s: %(message)s", - datefmt="%H:%M:%S", - level=logging.NOTSET, - handlers=[logging.FileHandler("debug.log"), logging.StreamHandler(sys.stdout)], -) -log = logging.getLogger(__name__) SUPPORTED_ARCHITECTURES = [ "Broadwell", @@ -54,7 +46,9 @@ def write_metadata( with open(outcsv, "r") as original: time_stamp = original.readline() if metadata_only and time_stamp.startswith("### META DATA ###"): - log.warning("Not prepending metadata, already present in %s " % (outcsv)) + logging.warning( + "Not prepending metadata, already present in %s " % (outcsv) + ) return data = original.read() with open(outcsv, "w") as modified: @@ -164,13 +158,8 @@ def validate_file(fname): crash(str(fname) + " not accessible") -def is_safe_file(fname, substr): - """verify if file name/format is accurate""" - if not fname.endswith(substr): - crash(str(fname) + " isn't appropriate format") - - if __name__ == "__main__": + common.configure_logging(".") if platform.system() != "Linux": crash("PerfSpect currently supports Linux only") @@ -178,8 +167,7 @@ def is_safe_file(fname, substr): script_path = os.path.dirname(os.path.realpath(__file__)) if "_MEI" in script_path: script_path = script_path.rsplit("/", 1)[0] - result_dir = script_path + "/results" - default_output_file = result_dir + "/perfstat.csv" + default_output_file = "perfstat.csv" parser = ArgumentParser(description="perf-collect: Time series dump of PMUs") duration = parser.add_mutually_exclusive_group() @@ -225,7 +213,7 @@ def is_safe_file(fname, substr): "--outcsv", type=str, default=default_output_file, - help="perf stat output in csv format, default=results/perfstat.csv", + help="perf stat output in csv format, default=perfstat.csv", ) parser.add_argument( "-v", @@ -237,7 +225,7 @@ def is_safe_file(fname, substr): if args.version: print(perf_helpers.get_tool_version()) - sys.exit(0) + sys.exit() if os.geteuid() != 0: crash("Must run PerfSpect as root, please re-run") @@ -249,7 +237,7 @@ def is_safe_file(fname, substr): nmi_watchdog = f_nmi.read() if int(nmi_watchdog) != 0: f_nmi.write("0") - log.info("nmi_watchdog disabled") + logging.info("nmi_watchdog disabled") except FileNotFoundError: pass @@ -278,11 +266,13 @@ def is_safe_file(fname, substr): elif arch == "sapphirerapids": eventfile = "spr.txt" + if eventfile is None: + crash(f"failed to match architecture ({arch}) to event file name.") + # Convert path of event file to relative path if being packaged by pyInstaller into a binary if getattr(sys, "frozen", False): basepath = getattr(sys, "_MEIPASS", os.path.dirname(os.path.abspath(__file__))) eventfilename = eventfile - is_safe_file(eventfile, ".txt") eventfile = os.path.join(basepath, eventfile) elif __file__: eventfile = script_path + "/events/" + eventfile @@ -290,16 +280,10 @@ def is_safe_file(fname, substr): else: crash("Unknown application type") - if args.outcsv == default_output_file: - # create results dir - if not os.path.exists(result_dir): - os.mkdir(result_dir) - perf_helpers.fix_path_ownership(result_dir) - else: - if not perf_helpers.validate_outfile(args.outcsv): - crash( - "Output filename not accepted. Filename should be a .csv without special characters" - ) + if not perf_helpers.validate_outfile(args.outcsv): + crash( + "Output filename not accepted. Filename should be a .csv without special characters" + ) mux_intervals = perf_helpers.get_perf_event_mux_interval() if args.muxinterval > 0: @@ -311,22 +295,21 @@ def is_safe_file(fname, substr): cgroups = perf_helpers.get_cgroups_from_cids(args.cid.split(",")) num_cgroups = len(cgroups) - try: - reg = r"^[0-9]*\.[0-9][0-9]*" - kernel = perf_helpers.get_version().split("Linux version")[1].lstrip() - significant_kernel_version = re.match(reg, kernel).group(0) - full_kernel_version = kernel - except Exception as e: - log.exception(e) - crash("Unable to get kernel version") - # get perf events to collect collection_events = [] imc, cha, upi = perf_helpers.get_imc_cacheagent_count() have_uncore = True if imc == 0 and cha == 0 and upi == 0: - log.info("disabling uncore (possibly in a vm?)") + logging.info("disabling uncore (possibly in a vm?)") have_uncore = False + if arch == "icelake": + logging.warning( + "Due to lack of vPMU support, TMA L1 events will not be collected" + ) + if arch == "sapphirerapids": + logging.warning( + "Due to lack of vPMU support, TMA L1 & L2 events will not be collected" + ) events, collection_events = prep_events.prepare_perf_events( eventfile, ( @@ -341,7 +324,7 @@ def is_safe_file(fname, substr): collection_type = "-a" if not args.thread and not args.socket else "-a -A" # start perf stat if args.pid and args.timeout: - log.info("Only CPU/core events will be enabled with pid option") + logging.info("Only CPU/core events will be enabled with pid option") cmd = "perf stat -I %d -x , --pid %s -e %s -o %s sleep %d" % ( interval, args.pid, @@ -351,7 +334,7 @@ def is_safe_file(fname, substr): ) elif args.pid: - log.info("Only CPU/core events will be enabled with pid option") + logging.info("Only CPU/core events will be enabled with pid option") cmd = "perf stat -I %d -x , --pid %s -e %s -o %s" % ( interval, args.pid, @@ -359,7 +342,7 @@ def is_safe_file(fname, substr): args.outcsv, ) elif args.cid and args.timeout: - log.info("Only CPU/core events will be enabled with cid option") + logging.info("Only CPU/core events will be enabled with cid option") perf_format = prep_events.get_cgroup_events_format( cgroups, events, len(collection_events) ) @@ -370,7 +353,7 @@ def is_safe_file(fname, substr): args.timeout, ) elif args.cid: - log.info("Only CPU/core events will be enabled with cid option") + logging.info("Only CPU/core events will be enabled with cid option") perf_format = prep_events.get_cgroup_events_format( cgroups, events, len(collection_events) ) @@ -402,13 +385,13 @@ def is_safe_file(fname, substr): validate_perfargs(perfargs) perf_helpers.pmu_contention_detect(msrs=initial_pmus, detect=True) if args.verbose: - log.info(cmd) + logging.info(cmd) try: - log.info("Collecting perf stat for events in : %s" % eventfilename) + logging.info("Collecting perf stat for events in : %s" % eventfilename) subprocess.call(perfargs) # nosec - log.info("Collection complete! Calculating TSC frequency now") + logging.info("Collection complete! Calculating TSC frequency now") except KeyboardInterrupt: - log.info("Collection stopped! Caculating TSC frequency now") + logging.info("Collection stopped! Caculating TSC frequency now") except Exception: crash("perf encountered errors") @@ -425,13 +408,14 @@ def is_safe_file(fname, substr): False, ) + os.chmod(args.outcsv, 0o666) # nosec + # reset nmi_watchdog to what it was before running perfspect with open("/proc/sys/kernel/nmi_watchdog", "w") as f_nmi: if int(nmi_watchdog) != 0: f_nmi.write(nmi_watchdog) - log.info("nmi_watchdog re-enabled") + logging.info("nmi_watchdog re-enabled") perf_helpers.set_perf_event_mux_interval(True, 1, mux_intervals) - log.info("perf stat dumped to %s" % args.outcsv) - perf_helpers.fix_path_ownership(result_dir, True) + logging.info("perf stat dumped to %s" % args.outcsv) diff --git a/perf-postprocess.py b/perf-postprocess.py index 63af1a0..c8c4f39 100644 --- a/perf-postprocess.py +++ b/perf-postprocess.py @@ -6,8 +6,8 @@ ########################################################################################################### import json -import logging import numpy as np +import logging import os import pandas as pd import re @@ -16,17 +16,10 @@ from enum import Enum from simpleeval import simple_eval from src.common import crash +from src import common from src import perf_helpers from src import report -logging.basicConfig( - format="%(asctime)s %(levelname)s: %(message)s", - datefmt="%H:%M:%S", - level=logging.NOTSET, - handlers=[logging.FileHandler("debug.log"), logging.StreamHandler(sys.stdout)], -) -log = logging.getLogger(__name__) - class Mode(Enum): System = 1 @@ -73,8 +66,8 @@ def get_args(script_path): "-r", "--rawfile", type=str, - default=None, - help="Raw CSV output from perf-collect", + default="perfstat.csv", + help="Raw CSV output from perf-collect, default=perfstat.csv", ) parser.add_argument( "--version", "-V", help="display version information", action="store_true" @@ -83,8 +76,8 @@ def get_args(script_path): "-o", "--outfile", type=str, - default=script_path + "/results/metric_out.csv", - help="perf stat outputs in csv format, default=results/metric_out.csv", + default=script_path + "/metric_out.csv", + help="perf stat outputs in csv format, default=metric_out.csv", ) parser.add_argument( "-v", @@ -560,7 +553,7 @@ def generate_metrics_averages(time_series_df, perf_mode, out_file_path): def log_skip_metric(metric, instance, msg): - log.warning( + logging.warning( msg + ': metric "' + metric["name"] @@ -583,6 +576,13 @@ def generate_metrics( ): time_slice_groups = perf_data_df.groupby("ts", sort=False) time_metrics_result = {} + errors = { + "MISSING DATA": 0, + "ZERO DIVISION": 0, + "MISSING EVENTS": 0, + "MULTIPLE GROUPS": 0, + } + missing_events = set() for time_slice, item in time_slice_groups: time_slice_df = time_slice_groups.get_group(time_slice) current_group_indx = 0 @@ -640,6 +640,7 @@ def generate_metrics( or "]" in expressions_to_evaluate[instance] ): if verbose: + errors["MISSING DATA"] += 1 log_skip_metric( m, expressions_to_evaluate[instance], @@ -657,6 +658,7 @@ def generate_metrics( ) except ZeroDivisionError: if verbose: + errors["ZERO DIVISION"] += 1 log_skip_metric( m, expressions_to_evaluate[instance], @@ -668,7 +670,8 @@ def generate_metrics( break # no need to check other groups if not single_group: if verbose: - log.warning('MULTIPLE GROUPS: metric "' + m["name"] + '"') + errors["MULTIPLE GROUPS"] += 1 + logging.warning('MULTIPLE GROUPS: metric "' + m["name"] + '"') # get events from multiple groups remaining_events_to_find = list(non_constant_mertics) expressions_to_evaluate = {} @@ -697,6 +700,7 @@ def generate_metrics( or "]" in expressions_to_evaluate[instance] ): if verbose: + errors["MISSING DATA"] += 1 log_skip_metric( m, expressions_to_evaluate[instance], @@ -714,6 +718,7 @@ def generate_metrics( ) except ZeroDivisionError: if verbose: + errors["ZERO DIVISION"] += 1 log_skip_metric( m, expressions_to_evaluate[instance], @@ -724,17 +729,22 @@ def generate_metrics( metrics_results[m["name"] + sub_txt] = float(result) else: # some events are missing if verbose: - log.warning( + errors["MISSING EVENTS"] += 1 + logging.warning( 'MISSING EVENTS: metric "' + m["name"] + '" events "' + str(remaining_events_to_find) + '"' ) + missing_events.update(remaining_events_to_find) continue # skip metric time_metrics_result[time_slice] = metrics_results time_series_df = pd.DataFrame(time_metrics_result) - + if verbose: + for error in errors: + logging.warning("Total " + error + ": " + str(errors[error])) + logging.warning("Missing events: " + str(missing_events)) generate_metrics_time_series(time_series_df, perf_mode, out_file_path) generate_metrics_averages(time_series_df, perf_mode, out_file_path) return @@ -812,7 +822,7 @@ def generate_raw_events_percore(perf_data_df, out_file_path): to_drop.append(metric) metric_per_cpu_frame.drop(to_drop, inplace=True) - core_raw_file_name = get_extra_out_file(out_file_path, "cr", excelsheet=False) + core_raw_file_name = get_extra_out_file(out_file_path, "cr") metric_per_cpu_frame.to_csv(core_raw_file_name) return @@ -828,6 +838,7 @@ def generate_raw_events(perf_data_df, out_file_path, perf_mode): if __name__ == "__main__": + common.configure_logging(".") script_path = os.path.dirname(os.path.realpath(__file__)) if "_MEI" in script_path: script_path = script_path.rsplit("/", 1)[0] @@ -884,7 +895,9 @@ def generate_raw_events(perf_data_df, out_file_path, perf_mode): perf_mode, args.verbose, ) - log.info("Generated results file(s) in: " + out_file_path.rsplit("/", 1)[0]) + logging.info( + "Generated results file(s) in: " + out_file_path.rsplit("/", 1)[0] + ) if args.html: report.write_html( cid_out_file_path, @@ -905,7 +918,7 @@ def generate_raw_events(perf_data_df, out_file_path, perf_mode): perf_mode, args.verbose, ) - log.info("Generated results file(s) in: " + out_file_path.rsplit("/", 1)[0]) + logging.info("Generated results file(s) in: " + out_file_path.rsplit("/", 1)[0]) if args.html: report.write_html( out_file_path, @@ -913,4 +926,4 @@ def generate_raw_events(perf_data_df, out_file_path, perf_mode): meta_data["constants"]["CONST_ARCH"], args.html, ) - log.info("Done!") + logging.info("Done!") diff --git a/src/common.py b/src/common.py index 3b3baaa..1eae35d 100644 --- a/src/common.py +++ b/src/common.py @@ -1,9 +1,27 @@ import logging import sys - -log = logging.getLogger(__name__) +import os def crash(msg): - log.error(msg) + logging.error(msg) sys.exit(1) + + +def configure_logging(output_dir): + run_file = os.path.split(sys.argv[0])[1] + program_name = os.path.splitext(run_file)[0] + logfile = f"{os.path.join(output_dir, program_name)}.log" + # create the log file if it doesn't already exist so that we can allow + # writes from any user in case program is run under sudo and then later + # without sudo + if not os.path.exists(logfile): + with open(logfile, "w+"): + pass + os.chmod(logfile, 0o666) # nosec + logging.basicConfig( + level=logging.NOTSET, + format="%(asctime)s %(levelname)s: %(message)s", + handlers=[logging.FileHandler(logfile), logging.StreamHandler(sys.stdout)], + ) + return logfile diff --git a/src/perf_helpers.py b/src/perf_helpers.py index 5619258..81849ab 100644 --- a/src/perf_helpers.py +++ b/src/perf_helpers.py @@ -22,7 +22,6 @@ version = "PerfSpect_DEV_VERSION" -log = logging.getLogger(__name__) # get tool version info @@ -153,15 +152,6 @@ def set_perf_event_mux_interval(reset, interval_ms, mux_interval): f_mux.write(str(val)) -# extend uncore events to all cores -def enumerate_uncore(event, n): - event_list = [] - for i in range(n): - tmp = event + "_" + str(i) - event_list.append(tmp) - return event_list - - # read the MSR register and return the value in dec format def readmsr(msr, cpu=0): f = os.open("/dev/cpu/%d/msr" % (cpu,), os.O_RDONLY) @@ -194,16 +184,16 @@ def pmu_contention_detect( try: value = readmsr(int(r, 16)) if msrs[r]["value"] is not None and value != msrs[r]["value"]: - log.info("PMU in use: " + msrs[r]["name"]) + logging.warning("PMU in use: " + msrs[r]["name"]) warn = True msrs[r]["value"] = value except IOError: pass if detect: if warn: - log.info("output could be inaccurate") + logging.warning("output could be inaccurate") else: - log.info("PMUs not in use") + logging.info("PMUs not in use") return msrs @@ -215,7 +205,7 @@ def get_version(): version = f.read() version = version.split("#")[0] except EnvironmentError as e: - log.warning(str(e), UserWarning) + logging.warning(str(e), UserWarning) return version @@ -226,7 +216,7 @@ def get_cpuinfo(): try: fo = open("/proc/cpuinfo", "r") except EnvironmentError as e: - log.warning(str(e), UserWarning) + logging.warning(str(e), UserWarning) else: for line in fo: try: @@ -389,7 +379,7 @@ def get_cgroups_from_cids(cids): p.stdout.close() except subprocess.SubprocessError as e: - crash("failed to open ps subprocess: " + e.output) + crash("failed to open ps subprocess: " + str(e)) out, err = p2.communicate() if err: crash(f"error reading cgroups: {err}") @@ -413,17 +403,3 @@ def get_comm_from_cid(cids, cgroups): for index, cid in enumerate(cids): cnamelist += cgroups[index] + "=" + cid + "," return cnamelist - - -def fix_path_ownership(path, recursive=False): - """change the ownership of the results folder when executed with sudo previleges""" - if not recursive: - uid = os.environ.get("SUDO_UID") - gid = os.environ.get("SUDO_GID") - if uid: - os.chown(path, int(uid), int(gid)) - else: - for dirpath, _, filenames in os.walk(path): - fix_path_ownership(dirpath) - for filename in filenames: - fix_path_ownership(os.path.join(dirpath, filename)) diff --git a/src/prepare_perf_events.py b/src/prepare_perf_events.py index 1f13cc3..10dd7c1 100644 --- a/src/prepare_perf_events.py +++ b/src/prepare_perf_events.py @@ -8,15 +8,12 @@ import logging import os import re -import sys import subprocess # nosec import src.perf_helpers as helper - -log = logging.getLogger(__name__) +from src.common import crash # test if the event can be collected, check supported events in perf list -# def filter_func(event, imc_count, cha_count, cbox_count, upi_count, perf_list): def filter_func(event, perf_list): tmp_list = event.split("/") name = helper.get_dev_name(tmp_list[0]) @@ -124,8 +121,7 @@ def get_cgroup_events_format(cgroups, events, num_events): def prepare_perf_events(event_file, cpu_only): if not os.path.isfile(event_file): - log.error("event file not found") - sys.exit(1) + crash("event file not found") start_group = "'{" end_group = "}'" @@ -142,12 +138,10 @@ def prepare_perf_events(event_file, cpu_only): ["perf", "list"], universal_newlines=True ) except FileNotFoundError: - log.error("Please install Linux perf and re-run") - sys.exit(1) + crash("Please install Linux perf and re-run") except subprocess.CalledProcessError as e: - log.error(f"Error calling Linux perf, error code: {e.returncode}") - sys.exit(1) + crash(f"Error calling Linux perf, error code: {e.returncode}") unsupported_events = [] for line in fin: @@ -163,10 +157,9 @@ def prepare_perf_events(event_file, cpu_only): else: collection_events.append(line) if any("cpu-cycles" in event for event in unsupported_events): - log.error("PMU's not available. Run in a full socket VM or baremetal") - sys.exit(1) + crash("PMU's not available. Run in a full socket VM or baremetal") if len(unsupported_events) > 0: - log.warning( + logging.warning( str("Perf unsupported events not counted: " + str(unsupported_events)) ) @@ -208,12 +201,10 @@ def prepare_perf_events(event_file, cpu_only): fin.close() group = group[:-1] if len(event_names) == 0: - log.error("No supported events found on this platform.") - sys.exit(1) + crash("No supported events found on this platform.") # being conservative not letting the collection to proceed if fixed counters aren't suported on the platform if len(unsupported_events) >= len(core_event): - log.error( + crash( "Most core counters aren't supported on this platform, unable to collect PMUs" ) - sys.exit(1) return group, event_names diff --git a/src/report.py b/src/report.py index ad48daf..838cd64 100644 --- a/src/report.py +++ b/src/report.py @@ -10,8 +10,6 @@ from src import icicle from yattag import Doc, indent -log = logging.getLogger(__name__) - def write_html(tma_inp, perf_mode, arch, html_report_out, data_type="both"): if data_type not in ("tma", "basic", "both"): @@ -51,4 +49,4 @@ def write_html(tma_inp, perf_mode, arch, html_report_out, data_type="both"): result = indent(doc.getvalue()) with open(html_report_out, "w") as file: file.write(result) - log.info(f"static HTML file written at {html_report_out}") + logging.info(f"static HTML file written at {html_report_out}")