From 110fe4f6e3137f244d807c97d4c1ab409cfdf41e Mon Sep 17 00:00:00 2001 From: Daniel Hill Date: Thu, 15 Jun 2023 15:41:24 -0700 Subject: [PATCH] updated html, tma detection, graphql (#43) * Create codeql.yml * Update codeql.yml * updated html and tma detection --- .github/workflows/codeql.yml | 52 ++++++++++ LICENSE | 2 +- _version.txt | 2 +- events/metric_bdx.json | 8 +- events/metric_skx_clx.json | 10 +- events/metric_spr.json | 4 +- perf-collect.py | 61 +++++++++--- perf-postprocess.py | 31 +++++- src/base.html | 178 ++++++++++++++++++++++++++++++++--- src/perf_helpers.py | 72 ++------------ src/prepare_perf_events.py | 2 - 11 files changed, 315 insertions(+), 107 deletions(-) create mode 100644 .github/workflows/codeql.yml diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml new file mode 100644 index 0000000..93c923c --- /dev/null +++ b/.github/workflows/codeql.yml @@ -0,0 +1,52 @@ +# For most projects, this workflow file will not need changing; you simply need +# to commit it to your repository. +# +# You may wish to alter this file to override the set of languages analyzed, +# or to provide custom queries or build logic. +# +# ******** NOTE ******** +# We have attempted to detect the languages in your repository. Please check +# the `language` matrix defined below to confirm you have the correct set of +# supported CodeQL languages. +# +name: "CodeQL" + +on: + push: + branches: [ "master" ] + pull_request: + # The branches below must be a subset of the branches above + branches: [ "master" ] + schedule: + - cron: '26 9 * * 3' + +jobs: + analyze: + name: Analyze + runs-on: 'ubuntu-latest' + timeout-minutes: 360 + permissions: + actions: read + contents: read + security-events: write + + strategy: + fail-fast: false + matrix: + language: [ 'cpp', 'python', 'javascript' ] + + steps: + - name: Checkout repository + uses: actions/checkout@v3 + - name: Initialize CodeQL + uses: github/codeql-action/init@v2 + with: + languages: ${{ matrix.language }} + - run: | + pip3 install -r requirements.txt + make + + - name: Perform CodeQL Analysis + uses: github/codeql-action/analyze@v2 + with: + category: "/language:${{matrix.language}}" diff --git a/LICENSE b/LICENSE index ebc07cb..9885fdd 100644 --- a/LICENSE +++ b/LICENSE @@ -1,6 +1,6 @@ BSD 3-Clause License -Copyright (C) 2021 Intel Corporation +Copyright (C) 2021-2023 Intel Corporation Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: diff --git a/_version.txt b/_version.txt index f0bb29e..3a3cd8c 100644 --- a/_version.txt +++ b/_version.txt @@ -1 +1 @@ -1.3.0 +1.3.1 diff --git a/events/metric_bdx.json b/events/metric_bdx.json index f50dbcc..df3176e 100644 --- a/events/metric_bdx.json +++ b/events/metric_bdx.json @@ -298,22 +298,22 @@ "expression": "100 * ( ( ( ( [CYCLE_ACTIVITY.STALLS_TOTAL] + [UOPS_EXECUTED.CYCLES_GE_1_UOP_EXEC] - ( [UOPS_EXECUTED.CYCLES_GE_3_UOPS_EXEC] if ( ( [instructions] / ( [cpu-cycles] ) ) > 1.8 ) else [UOPS_EXECUTED.CYCLES_GE_2_UOPS_EXEC] ) - ( [RS_EVENTS.EMPTY_CYCLES] if ( ( ( 4 ) * [IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE] / ( ( 4 ) * ( ( [CPU_CLK_UNHALTED.THREAD_ANY] / 2 ) if [HYPERTHREADING_ON] else ( [cpu-cycles] ) ) ) ) > 0.1 ) else 0 ) + [RESOURCE_STALLS.SB] ) ) - [RESOURCE_STALLS.SB] - [CYCLE_ACTIVITY.STALLS_MEM_ANY] ) / ( [cpu-cycles] ) )" }, { - "name": "metric_TMA_......0_Port_Utilized(%)", + "name": "metric_TMA_......Ports_Utilized_0(%)", "expression": "100 * (([UOPS_EXECUTED.CORE_i1_c1] / [const_thread_count]) if ([const_thread_count] > 1) else ([RS_EVENTS.EMPTY_CYCLES] if ([CYCLE_ACTIVITY.STALLS_TOTAL] - ([IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE] / ([CPU_CLK_UNHALTED.THREAD_ANY] / [const_thread_count])) ) > 0.1 else 0)) / ([CPU_CLK_UNHALTED.THREAD_ANY] / [const_thread_count]) ", "origin": "perfspect" }, { - "name": "metric_TMA_......1_Port_Utilized(%)", + "name": "metric_TMA_......Ports_Utilized_1(%)", "expression": "100 * (([UOPS_EXECUTED.CORE_c1] - [UOPS_EXECUTED.CORE_c2]) / [const_thread_count]) / ([CPU_CLK_UNHALTED.THREAD_ANY] / [const_thread_count])", "origin": "perfspect" }, { - "name": "metric_TMA_......2_Port_Utilized(%)", + "name": "metric_TMA_......Ports_Utilized_2(%)", "expression": "100 * (([UOPS_EXECUTED.CORE_c2] - [UOPS_EXECUTED.CORE_c3]) / [const_thread_count]) / ([CPU_CLK_UNHALTED.THREAD_ANY] / [const_thread_count])", "origin": "perfspect" }, { - "name": "metric_TMA_......3m_Ports_Utilized(%)", + "name": "metric_TMA_......Ports_Utilized_3m(%)", "expression": "100 * ([UOPS_EXECUTED.CORE_c3] / [const_thread_count]) / ([CPU_CLK_UNHALTED.THREAD_ANY] / [const_thread_count])", "origin": "perfspect" }, diff --git a/events/metric_skx_clx.json b/events/metric_skx_clx.json index 98d4112..e56f777 100644 --- a/events/metric_skx_clx.json +++ b/events/metric_skx_clx.json @@ -185,12 +185,10 @@ }, { "name": "metric_% Uops delivered from decoded Icache (DSB)", - "expression1": "1", "expression": "100 * ([IDQ.DSB_UOPS] / ([IDQ.DSB_UOPS] + [IDQ.MITE_UOPS] + [IDQ.MS_UOPS] + [LSD.UOPS]) )" }, { "name": "metric_% Uops delivered from legacy decode pipeline (MITE)", - "expression1": "1", "expression": "100 * ([IDQ.MITE_UOPS] / ([IDQ.DSB_UOPS] + [IDQ.MITE_UOPS] + [IDQ.MS_UOPS] + [LSD.UOPS]) )" }, { @@ -371,22 +369,22 @@ "expression": "100 * ( ( [EXE_ACTIVITY.EXE_BOUND_0_PORTS] + ( [EXE_ACTIVITY.1_PORTS_UTIL] + ( ( [UOPS_RETIRED.RETIRE_SLOTS] ) / ( ( 4 ) * ( ( [CPU_CLK_UNHALTED.THREAD_ANY] / 2 ) if [HYPERTHREADING_ON] else ( [cpu-cycles] ) ) ) ) * [EXE_ACTIVITY.2_PORTS_UTIL] ) ) / ( [cpu-cycles] ) if ( [ARITH.DIVIDER_ACTIVE] < ( [CYCLE_ACTIVITY.STALLS_TOTAL] - [CYCLE_ACTIVITY.STALLS_MEM_ANY] ) ) else ( [EXE_ACTIVITY.1_PORTS_UTIL] + ( ( [UOPS_RETIRED.RETIRE_SLOTS] ) / ( ( 4 ) * ( ( [CPU_CLK_UNHALTED.THREAD_ANY] / 2 ) if [HYPERTHREADING_ON] else ( [cpu-cycles] ) ) ) ) * [EXE_ACTIVITY.2_PORTS_UTIL] ) / ( [cpu-cycles] ) )" }, { - "name": "metric_TMA_......0_Port_Utilized(%)", + "name": "metric_TMA_......Ports_Utilized_0(%)", "expression": "100 * (([UOPS_EXECUTED.CORE_CYCLES_NONE] / 2) if ([const_thread_count] > 1) else [EXE_ACTIVITY.EXE_BOUND_0_PORTS]) / ([CPU_CLK_UNHALTED.THREAD_ANY] / [const_thread_count])", "origin": "perfspect" }, { - "name": "metric_TMA_......1_Port_Utilized(%)", + "name": "metric_TMA_......Ports_Utilized_1(%)", "expression": "100 * ((([UOPS_EXECUTED.CORE_CYCLES_GE_1] - [UOPS_EXECUTED.CORE_CYCLES_GE_2]) / 2) if ([const_thread_count] > 1) else [EXE_ACTIVITY.1_PORTS_UTIL]) / ([CPU_CLK_UNHALTED.THREAD_ANY] / [const_thread_count])", "origin": "perfspect" }, { - "name": "metric_TMA_......2_Port_Utilized(%)", + "name": "metric_TMA_......Ports_Utilized_2(%)", "expression": "100 * ((([UOPS_EXECUTED.CORE_CYCLES_GE_2] - [UOPS_EXECUTED.CORE_CYCLES_GE_3]) / 2) if ([const_thread_count] > 1) else [EXE_ACTIVITY.2_PORTS_UTIL]) / ([CPU_CLK_UNHALTED.THREAD_ANY] / [const_thread_count])", "origin": "perfspect" }, { - "name": "metric_TMA_......3m_Ports_Utilized(%)", + "name": "metric_TMA_......Ports_Utilized_3m(%)", "expression": "100 * [UOPS_EXECUTED.CORE_CYCLES_GE_3] / [CPU_CLK_UNHALTED.THREAD_ANY]", "origin": "perfspect" }, diff --git a/events/metric_spr.json b/events/metric_spr.json index 7f01e68..a1d1908 100644 --- a/events/metric_spr.json +++ b/events/metric_spr.json @@ -88,14 +88,12 @@ "expression": "100 * [cstate_pkg/c6-residency/] * [CORES_PER_SOCKET] / [TSC]", "origin": "perfspect" }, - { + { "name": "metric_% Uops delivered from decoded Icache (DSB)", - "expression1": "1", "expression": "100 * ([IDQ.DSB_UOPS] / ([IDQ.DSB_UOPS] + [IDQ.MITE_UOPS] + [IDQ.MS_UOPS] + [LSD.UOPS]) )" }, { "name": "metric_% Uops delivered from legacy decode pipeline (MITE)", - "expression1": "1", "expression": "100 * ([IDQ.MITE_UOPS] / ([IDQ.DSB_UOPS] + [IDQ.MITE_UOPS] + [IDQ.MS_UOPS] + [LSD.UOPS]) )" }, { diff --git a/perf-collect.py b/perf-collect.py index 00ad101..3a37f3f 100644 --- a/perf-collect.py +++ b/perf-collect.py @@ -55,8 +55,7 @@ def write_metadata( modified.write("CORES_PER_SOCKET," + str(perf_helpers.get_cpu_count()) + ",\n") modified.write("SOCKET_COUNT," + str(perf_helpers.get_socket_count()) + ",\n") modified.write("HYPERTHREADING_ON," + str(perf_helpers.get_ht_status()) + ",\n") - imc, upi = perf_helpers.get_imc_upi_count() - cha = perf_helpers.get_cha_count() + imc, cha, upi = perf_helpers.get_imc_cha_upi_count() modified.write("IMC count," + str(imc) + ",\n") modified.write("CHAS_PER_SOCKET," + str(cha) + ",\n") modified.write("UPI count," + str(upi) + ",\n") @@ -158,6 +157,38 @@ def supports_psi(): return False +def tma_supported(): + perf_out = "" + try: + perf = subprocess.Popen( + shlex.split( + "perf stat -a -e '{cpu/event=0x00,umask=0x04,period=10000003,name='TOPDOWN.SLOTS'/,cpu/event=0x00,umask=0x81,period=10000003,name='PERF_METRICS.BAD_SPECULATION'/}' sleep .1" + ), + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + ) + perf_out = perf.communicate()[0].decode() + except subprocess.CalledProcessError: + return False + + try: + events = { + a.split()[1]: int(a.split()[0].replace(",", "")) + for a in filter( + lambda x: "TOPDOWN.SLOTS" in x or "PERF_METRICS.BAD_SPECULATION" in x, + perf_out.split("\n"), + ) + } + except Exception: + return False + + # This is a perf artifact of no vPMU support + if events["TOPDOWN.SLOTS"] == events["PERF_METRICS.BAD_SPECULATION"]: + return False + + return True + + def resource_path(relative_path): """Get absolute path to resource, works for dev and for PyInstaller""" base_path = getattr(sys, "_MEIPASS", os.path.dirname(os.path.abspath(__file__))) @@ -247,7 +278,6 @@ def validate_file(fname): # disable nmi watchdog before collecting perf nmi_watchdog = perf_helpers.disable_nmi_watchdog() - initial_pmus = perf_helpers.pmu_contention_detect() interval = 5000 collect_psi = False @@ -303,7 +333,14 @@ def validate_file(fname): else: crash("Unknown application type") + # check if pmu available + if "cpu-cycles" not in perf_helpers.get_perf_list(): + crash( + "PMU's not available. Run baremetal or in a VM which exposes PMUs (sometimes full socket)" + ) + # get perf events to collect + include_tma = True sys_devs = perf_helpers.get_sys_devices() if ( "uncore_cha" not in sys_devs @@ -314,13 +351,16 @@ def validate_file(fname): ): logging.info("disabling uncore (possibly in a vm?)") have_uncore = False - if arch == "icelake" and initial_pmus["0x30c"]["value"] is None: + + if arch == "icelake": + include_tma = tma_supported() + if not include_tma: logging.warning( "Due to lack of vPMU support, TMA L1 events will not be collected" ) - if (arch == "sapphirerapids" or arch == "emeraldrapids") and initial_pmus[ - "0x30c" - ]["value"] is None: + if arch == "sapphirerapids" or arch == "emeraldrapids": + include_tma = tma_supported() + if not include_tma: logging.warning( "Due to lack of vPMU support, TMA L1 & L2 events will not be collected" ) @@ -334,7 +374,7 @@ def validate_file(fname): or not have_uncore ), args.pid is not None or args.cid is not None, - initial_pmus["0x30c"]["value"] is not None, + include_tma, ) if not perf_helpers.validate_outfile(args.outcsv): @@ -361,9 +401,9 @@ def validate_file(fname): logging.info("Cores per socket: " + str(perf_helpers.get_cpu_count())) logging.info("Socket: " + str(perf_helpers.get_socket_count())) logging.info("Hyperthreading on: " + str(perf_helpers.get_ht_status())) - imc, upi = perf_helpers.get_imc_upi_count() + imc, cha, upi = perf_helpers.get_imc_cha_upi_count() logging.info("IMC count: " + str(imc)) - logging.info("CHA per socket: " + str(perf_helpers.get_cha_count())) + logging.info("CHA per socket: " + str(cha)) logging.info("UPI count: " + str(upi)) logging.info("PerfSpect version: " + perf_helpers.get_tool_version()) if args.verbose: @@ -390,7 +430,6 @@ def validate_file(fname): perfargs = shlex.split(cmd) validate_perfargs(perfargs) - perf_helpers.pmu_contention_detect(msrs=initial_pmus, detect=True) if args.verbose: logging.info(cmd) psi = [] diff --git a/perf-postprocess.py b/perf-postprocess.py index bc3bca8..8a00e2b 100644 --- a/perf-postprocess.py +++ b/perf-postprocess.py @@ -231,6 +231,7 @@ def get_all_data_lines(input_file_path): def get_metadata_as_dict(meta_data_lines): meta_data = {} meta_data["constants"] = {} + meta_data["metadata"] = {} for line in meta_data_lines: if line.startswith("SYSTEM_TSC_FREQ"): meta_data["constants"]["SYSTEM_TSC_FREQ"] = ( @@ -311,6 +312,26 @@ def get_metadata_as_dict(meta_data_lines): meta_data["SOCKET_CORES"].append(CPUs) elif line.startswith("PSI"): meta_data["PSI"] = json.loads(line.split("PSI,")[1]) + + for line in meta_data_lines: + for info in [ + "SYSTEM_TSC_FREQ (MHz)", + "CORES_PER_SOCKET", + "SOCKET_COUNT", + "HYPERTHREADING_ON", + "IMC count", + "CHAS_PER_SOCKET", + "UPI count", + "Architecture", + "Model", + "kernel version", + "PerfSpect version", + ]: + if info in line: + meta_data["metadata"][info] = line.split(",", 1)[1] + if meta_data["metadata"][info][-1] == ",": + meta_data["metadata"][info] = meta_data["metadata"][info][:-1] + return meta_data @@ -577,7 +598,7 @@ def row(df, name): return "[]" -def write_html(time_series_df, perf_mode, out_file_path): +def write_html(time_series_df, perf_mode, out_file_path, meta_data): html_file = "base.html" if getattr(sys, "frozen", False): basepath = getattr(sys, "_MEIPASS", os.path.dirname(os.path.abspath(__file__))) @@ -612,10 +633,14 @@ def write_html(time_series_df, perf_mode, out_file_path): html = html.replace(metric[0], row(time_series_df, metric[1])) avg = time_series_df.mean(numeric_only=True, axis=1).to_frame() + html = html.replace( + "ALLMETRICS", json.dumps(avg.reset_index().to_dict("records")) + ) + html = html.replace("METADATA", json.dumps(list(meta_data["metadata"].items()))) for number in [ ["FRONTEND", "metric_TMA_Frontend_Bound(%)"], ["BACKEND", "metric_TMA_Backend_Bound(%)"], - ["CORE", "metric_TMA_..Core_Bound(%)"], + ["COREDATA", "metric_TMA_..Core_Bound(%)"], ["MEMORY", "metric_TMA_..Memory_Bound(%)"], ["BADSPECULATION", "metric_TMA_Bad_Speculation(%)"], ["RETIRING", "metric_TMA_Retiring(%)"], @@ -945,7 +970,7 @@ def generate_metrics( generate_metrics_time_series(time_series_df, perf_mode, out_file_path) generate_metrics_averages(time_series_df, perf_mode, out_file_path) if perf_mode == Mode.System: - write_html(time_series_df, perf_mode, out_file_path) + write_html(time_series_df, perf_mode, out_file_path, meta_data) return diff --git a/src/base.html b/src/base.html index e98af55..58880b4 100644 --- a/src/base.html +++ b/src/base.html @@ -1,3 +1,7 @@ + @@ -41,6 +45,14 @@ Link, Tabs, Icon, + Table, + TableBody, + TableCell, + TableContainer, + TableHead, + TableRow, + Tooltip, + Paper, } = MaterialUI; // Create a theme instance. @@ -145,20 +157,100 @@ function App() { const [systemTabs, setSystemTabs] = React.useState(0); - const [open, setOpen] = React.useState(true); - const [openlink, setOpenlink] = React.useState(false); + const [openlink, setOpenlink] = React.useState(true); const handleChange = (event, newSystemTabs) => { setSystemTabs(newSystemTabs); }; - const handleClose = () => { - setOpen(false) - setOpenlink(true) - }; + const handleCloselink = () => { setOpenlink(false) }; + const all_metrics = ALLMETRICS + const meta_data = METADATA + const description = { + "metric_CPU operating frequency (in GHz)": "CPU operating frequency (in GHz)", + "metric_CPU utilization %": "Percentage of time spent in the active CPU power state C0", + "metric_CPI": "Cycles per instruction retired; indicating how much time each executed instruction took; in units of cycles.", + "metric_L1D MPI (includes data+rfo w/ prefetches)": "Ratio of number of requests missing L1 data cache (includes data+rfo w/ prefetches) to the total number of completed instructions", + "metric_L1D demand data read hits per instr": "Ratio of number of demand load requests hitting in L1 data cache to the total number of completed instructions ", + "metric_L1-I code read misses (w/ prefetches) per instr": "Ratio of number of code read requests missing in L1 instruction cache (includes prefetches) to the total number of completed instructions", + "metric_L2 demand data read hits per instr": "Ratio of number of completed demand load requests hitting in L2 cache to the total number of completed instructions ", + "metric_L2 MPI (includes code+data+rfo w/ prefetches)": "Ratio of number of requests missing L2 cache (includes code+data+rfo w/ prefetches) to the total number of completed instructions", + "metric_L2 demand data read MPI": "Ratio of number of completed data read request missing L2 cache to the total number of completed instructions", + "metric_L2 demand code MPI": "Ratio of number of code read request missing L2 cache to the total number of completed instructions", + "metric_LLC code read MPI (demand+prefetch)": "Ratio of number of code read requests missing last level core cache (includes demand w/ prefetches) to the total number of completed instructions", + "metric_LLC data read MPI (demand+prefetch)": "Ratio of number of data read requests missing last level core cache (includes demand w/ prefetches) to the total number of completed instructions", + "metric_NUMA %_Reads addressed to local DRAM": "Memory read that miss the last level cache (LLC) addressed to local DRAM as a percentage of total memory read accesses, does not include LLC prefetches.", + "metric_NUMA %_Reads addressed to remote DRAM": "Memory reads that miss the last level cache (LLC) addressed to remote DRAM as a percentage of total memory read accesses, does not include LLC prefetches.", + "metric_uncore frequency GHz": "Uncore operating frequency in GHz", + "metric_% Uops delivered from decoded Icache (DSB)": "Uops delivered from decoded instruction cache (decoded stream buffer or DSB) as a percent of total uops delivered to Instruction Decode Queue", + "metric_% Uops delivered from legacy decode pipeline (MITE)": "Uops delivered from legacy decode pipeline (Micro-instruction Translation Engine or MITE) as a percent of total uops delivered to Instruction Decode Queue", + "metric_memory bandwidth read (MB/sec)": "DDR memory read bandwidth (MB/sec)", + "metric_memory bandwidth write (MB/sec)": "DDR memory write bandwidth (MB/sec)", + "metric_memory bandwidth total (MB/sec)": "DDR memory bandwidth (MB/sec)", + "metric_TMA_Frontend_Bound(%)": "This category represents fraction of slots where the processor's Frontend undersupplies its Backend. Frontend denotes the first part of the processor core responsible to fetch operations that are executed later on by the Backend part. Within the Frontend; a branch predictor predicts the next address to fetch; cache-lines are fetched from the memory subsystem; parsed into instructions; and lastly decoded into micro-operations (uops). Ideally the Frontend can issue Pipeline_Width uops every cycle to the Backend. Frontend Bound denotes unutilized issue-slots when there is no Backend stall; i.e. bubbles where Frontend delivered no uops while Backend could have accepted them. For example; stalls due to instruction-cache misses would be categorized under Frontend Bound.", + "metric_TMA_....ICache_Misses(%)": "This metric represents fraction of cycles the CPU was stalled due to instruction cache misses.", + "metric_TMA_....ITLB_Misses(%)": "This metric represents fraction of cycles the CPU was stalled due to Instruction TLB (ITLB) misses.", + "metric_TMA_....Branch_Resteers(%)": "This metric represents fraction of cycles the CPU was stalled due to Branch Resteers. Branch Resteers estimates the Frontend delay in fetching operations from corrected path; following all sorts of miss-predicted branches. For example; branchy code with lots of miss-predictions might get categorized under Branch Resteers. Note the value of this node may overlap with its siblings.", + "metric_TMA_......Mispredicts_Resteers(%)": "This metric represents fraction of cycles the CPU was stalled due to Branch Resteers as a result of Branch Misprediction at execution stage. ", + "metric_TMA_......Clears_Resteers(%)": "This metric represents fraction of cycles the CPU was stalled due to Branch Resteers as a result of Machine Clears. ", + "metric_TMA_....MITE(%)": "This metric represents Core fraction of cycles in which CPU was likely limited due to the MITE pipeline (the legacy decode pipeline). This pipeline is used for code that was not pre-cached in the DSB or LSD. For example; inefficiencies due to asymmetric decoders; use of long immediate or LCP can manifest as MITE fetch bandwidth bottleneck.", + "metric_TMA_....DSB(%)": "This metric represents Core fraction of cycles in which CPU was likely limited due to DSB (decoded uop cache) fetch pipeline. For example; inefficient utilization of the DSB cache structure or bank conflict when reading from it; are categorized here.", + "metric_TMA_Bad_Speculation(%)": "This category represents fraction of slots wasted due to incorrect speculations. This include slots used to issue uops that do not eventually get retired and slots for which the issue-pipeline was blocked due to recovery from earlier incorrect speculation. For example; wasted work due to miss-predicted branches are categorized under Bad Speculation category. Incorrect data speculation followed by Memory Ordering Nukes is another example.", + "metric_TMA_..Branch_Mispredicts(%)": "This metric represents fraction of slots the CPU has wasted due to Branch Misprediction. These slots are either wasted by uops fetched from an incorrectly speculated program path; or stalls when the out-of-order part of the machine needs to recover its state from a speculative path.", + "metric_TMA_..Machine_Clears(%)": "This metric represents fraction of slots the CPU has wasted due to Machine Clears. These slots are either wasted by uops fetched prior to the clear; or stalls the out-of-order portion of the machine needs to recover its state after the clear. For example; this can happen due to memory ordering Nukes (e.g. Memory Disambiguation) or Self-Modifying-Code (SMC) nukes.", + "metric_TMA_Backend_Bound(%)": "This category represents fraction of slots where no uops are being delivered due to a lack of required resources for accepting new uops in the Backend. Backend is the portion of the processor core where the out-of-order scheduler dispatches ready uops into their respective execution units; and once completed these uops get retired according to program order. For example; stalls due to data-cache misses or stalls due to the divider unit being overloaded are both categorized under Backend Bound. Backend Bound is further divided into two main categories: Memory Bound and Core Bound.", + "metric_TMA_..Memory_Bound(%)": "This metric represents fraction of slots the Memory subsystem within the Backend was a bottleneck. Memory Bound estimates fraction of slots where pipeline is likely stalled due to demand load or store instructions. This accounts mainly for (1) non-completed in-flight memory demand loads which coincides with execution units starvation; in addition to (2) cases where stores could impose backpressure on the pipeline when many of them get buffered at the same time (less common out of the two).", + "metric_TMA_....L1_Bound(%)": "This metric estimates how often the CPU was stalled without loads missing the L1 data cache. The L1 data cache typically has the shortest latency. However; in certain cases like loads blocked on older stores; a load might suffer due to high latency even though it is being satisfied by the L1. Another example is loads who miss in the TLB. These cases are characterized by execution unit stalls; while some non-completed demand load lives in the machine without having that demand load missing the L1 cache.", + "metric_TMA_......DTLB_Load(%)": "This metric roughly estimates the fraction of cycles where the Data TLB (DTLB) was missed by load accesses. TLBs (Translation Look-aside Buffers) are processor caches for recently used entries out of the Page Tables that are used to map virtual- to physical-addresses by the operating system. This metric approximates the potential delay of demand loads missing the first-level data TLB (assuming worst case scenario with back to back misses to different pages). This includes hitting in the second-level TLB (STLB) as well as performing a hardware page walk on an STLB miss.", + "metric_TMA_....L2_Bound(%)": "This metric estimates how often the CPU was stalled due to L2 cache accesses by loads. Avoiding cache misses (i.e. L1 misses/L2 hits) can improve the latency and increase performance.", + "metric_TMA_....L3_Bound(%)": "This metric estimates how often the CPU was stalled due to loads accesses to L3 cache or contended with a sibling Core. Avoiding cache misses (i.e. L2 misses/L3 hits) can improve the latency and increase performance.", + "metric_TMA_......Data_Sharing(%)": "This metric estimates fraction of cycles while the memory subsystem was handling synchronizations due to data-sharing accesses. Data shared by multiple Logical Processors (even just read shared) may cause increased access latency due to cache coherency. Excessive data sharing can drastically harm multithreaded performance.", + "metric_TMA_......MEM_Bandwidth(%)": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory (DRAM). The underlying heuristic assumes that a similar off-core traffic is generated by all IA cores. This metric does not aggregate non-data-read requests by this logical processor; requests from other IA Logical Processors/Physical Cores/sockets; or other non-IA devices like GPU; hence the maximum external memory bandwidth limits may or may not be approached when this metric is flagged (see Uncore counters for that).", + "metric_TMA_......MEM_Latency(%)": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory (DRAM). This metric does not aggregate requests from other Logical Processors/Physical Cores/sockets (see Uncore counters for that).", + "metric_TMA_....Store_Bound(%)": "This metric estimates how often CPU was stalled due to RFO store memory accesses; RFO store issue a read-for-ownership request before the write. Even though store accesses do not typically stall out-of-order CPUs; there are few cases where stores can lead to actual stalls. This metric will be flagged should RFO stores be a bottleneck.", + "metric_TMA_..Core_Bound(%)": "This metric represents fraction of slots where Core non-memory issues were of a bottleneck. Shortage in hardware compute resources; or dependencies in software's instructions are both categorized under Core Bound. Hence it may indicate the machine ran out of an out-of-order resource; certain execution units are overloaded or dependencies in program's data- or instruction-flow are limiting the performance (e.g. FP-chained long-latency arithmetic operations).", + "metric_TMA_....Ports_Utilization(%)": "This metric estimates fraction of cycles the CPU performance was potentially limited due to Core computation issues (non divider-related). Two distinct categories can be attributed into this metric: (1) heavy data-dependency among contiguous instructions would manifest in this metric - such cases are often referred to as low Instruction Level Parallelism (ILP). (2) Contention on some hardware execution unit other than Divider. For example; when there are too many multiply operations.", + "metric_TMA_Retiring(%)": "This category represents fraction of slots utilized by useful work i.e. issued uops that eventually get retired. Ideally; all pipeline slots would be attributed to the Retiring category. Retiring of 100% would indicate the maximum Pipeline_Width throughput was achieved. Maximizing Retiring typically increases the Instructions-per-cycle (see IPC metric). Note that a high Retiring value does not necessary mean there is no room for more performance. For example; Heavy-operations or Microcode Assists are categorized under Retiring. They often indicate suboptimal performance and can often be optimized or avoided. ", + "metric_TMA_..Light_Operations(%)": "This metric represents fraction of slots where the CPU was retiring light-weight operations -- instructions that require no more than one uop (micro-operation). This correlates with total number of instructions used by the program. A uops-per-instruction (see UopPI metric) ratio of 1 or less should be expected for decently optimized software running on Intel Core/Xeon products. While this often indicates efficient X86 instructions were executed; high value does not necessarily mean better performance cannot be achieved.", + "metric_TMA_....FP_Arith(%)": "This metric represents overall arithmetic floating-point (FP) operations fraction the CPU has executed (retired). Note this metric's value may exceed its parent due to use of \"Uops\" CountDomain and FMA double-counting.", + "metric_TMA_..Heavy_Operations(%)": "This metric represents fraction of slots where the CPU was retiring heavy-weight operations -- instructions that require two or more uops or micro-coded sequences. This highly-correlates with the uop length of these instructions/sequences.", + "metric_IO_bandwidth_disk_or_network_writes (MB/sec)": "Bandwidth of IO reads that are initiated by end device controllers that are requesting memory from the CPU.", + "metric_IO_bandwidth_disk_or_network_reads (MB/sec)": "Bandwidth of IO writes that are initiated by end device controllers that are writing memory to the CPU.", + "metric_TMA_..Fetch_Latency(%)": "This metric represents fraction of slots the CPU was stalled due to Frontend latency issues. For example; instruction-cache misses; iTLB misses or fetch stalls after a branch misprediction are categorized under Frontend Latency. In such cases; the Frontend eventually delivers no uops for some period.", + "metric_TMA_......Unknown_Branches(%)": "This metric represents fraction of cycles the CPU was stalled due to new branch address clears. These are fetched branches the Branch Prediction Unit was unable to recognize (e.g. first time the branch is fetched or hitting BTB capacity limit).", + "metric_TMA_..Fetch_Bandwidth(%)": "This metric represents fraction of slots the CPU was stalled due to Frontend bandwidth issues. For example; inefficiencies at the instruction decoders; or restrictions for caching in the DSB (decoded uops cache) are categorized under Fetch Bandwidth. In such cases; the Frontend typically delivers suboptimal amount of uops to the Backend.", + "metric_TMA_......FP_Scalar(%)": "This metric approximates arithmetic floating-point (FP) scalar uops fraction the CPU has retired. May overcount due to FMA double counting.", + "metric_TMA_......FP_Vector(%)": "This metric approximates arithmetic floating-point (FP) vector uops fraction the CPU has retired aggregated across all vector widths. May overcount due to FMA double counting.", + "metric_UPI Data transmit BW (MB/sec) (only data)": "Intel(R) Ultra Path Interconnect (UPI) data transmit bandwidth (MB/sec)", + "metric_TMA_......Lock_Latency(%)": "This metric represents fraction of cycles the CPU spent handling cache misses due to lock operations. Due to the microarchitecture handling of locks; they are classified as L1_Bound regardless of what memory source satisfied them.", + "metric_TMA_......False_Sharing(%)": "This metric roughly estimates how often CPU was handling synchronizations due to False Sharing. False Sharing is a multithreading hiccup; where multiple Logical Processors contend on different data-elements mapped into the same cache line. ", + "metric_Average LLC data read miss latency (in ns)": "Average latency of a last level cache (LLC) demand and prefetch data read miss (read memory access) in nano seconds", + "metric_Average LLC data read miss latency for LOCAL requests (in ns)": "Average latency of a last level cache (LLC) demand and prefetch data read miss (read memory access) addressed to local memory in nano seconds", + "metric_Average LLC data read miss latency for REMOTE requests (in ns)": "Average latency of a last level cache (LLC) demand and prefetch data read miss (read memory access) addressed to remote memory in nano seconds", + "metric_ITLB MPI": "Ratio of number of completed page walks (for all page sizes) caused by a code fetch to the total number of completed instructions. This implies it missed in the ITLB (Instruction TLB) and further levels of TLB.", + "metric_ITLB large page MPI": "Ratio of number of completed page walks (for 2 megabyte and 4 megabyte page sizes) caused by a code fetch to the total number of completed instructions. This implies it missed in the Instruction Translation Lookaside Buffer (ITLB) and further levels of TLB.", + "metric_DTLB load MPI": "Ratio of number of completed page walks (for all page sizes) caused by demand data loads to the total number of completed instructions. This implies it missed in the DTLB and further levels of TLB.", + "metric_DTLB 2MB large page load MPI": "Ratio of number of completed page walks (for 2 megabyte page sizes) caused by demand data loads to the total number of completed instructions. This implies it missed in the Data Translation Lookaside Buffer (DTLB) and further levels of TLB.", + "metric_DTLB store MPI": "Ratio of number of completed page walks (for all page sizes) caused by demand data stores to the total number of completed instructions. This implies it missed in the DTLB and further levels of TLB.", + "metric_Average LLC demand data read miss latency (in ns)": "Average latency of a last level cache (LLC) demand data read miss (read memory access) in nano seconds", + "metric_Average LLC demand data read miss latency for LOCAL requests (in ns)": "Average latency of a last level cache (LLC) demand data read miss (read memory access) addressed to local memory in nano seconds", + "metric_Average LLC demand data read miss latency for REMOTE requests (in ns)": "Average latency of a last level cache (LLC) demand data read miss (read memory access) addressed to remote memory in nano seconds", + "metric_ITLB (2nd level) MPI": "Ratio of number of completed page walks (for all page sizes) caused by a code fetch to the total number of completed instructions. This implies it missed in the ITLB (Instruction TLB) and further levels of TLB.", + "metric_DTLB (2nd level) load MPI": "Ratio of number of completed page walks (for all page sizes) caused by demand data loads to the total number of completed instructions. This implies it missed in the DTLB and further levels of TLB.", + "metric_DTLB (2nd level) 2MB large page load MPI": "Ratio of number of completed page walks (for 2 megabyte page sizes) caused by demand data loads to the total number of completed instructions. This implies it missed in the Data Translation Lookaside Buffer (DTLB) and further levels of TLB.", + "metric_DTLB (2nd level) store MPI": "Ratio of number of completed page walks (for all page sizes) caused by demand data stores to the total number of completed instructions. This implies it missed in the DTLB and further levels of TLB.", + "metric_TMA_....DRAM_Bound(%)": "This metric estimates how often the CPU was stalled on accesses to external memory (DRAM) by loads. Better caching can improve the latency and increase performance.", + "metric_TMA_......Ports_Utilized_0(%)": "This metric represents fraction of cycles CPU executed no uops on any execution port (Logical Processor cycles since ICL, Physical Core cycles otherwise). Long-latency instructions like divides may contribute to this metric.", + "metric_TMA_......Ports_Utilized_1(%)": "This metric represents fraction of cycles where the CPU executed total of 1 uop per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise). This can be due to heavy data-dependency among software instructions; or over oversubscribing a particular hardware resource. In some other cases with high 1_Port_Utilized and L1_Bound; this metric can point to L1 data-cache latency bottleneck that may not necessarily manifest with complete execution starvation (due to the short L1 latency e.g. walking a linked list) - looking at the assembly can be helpful.", + "metric_TMA_......Ports_Utilized_2(%)": "This metric represents fraction of cycles CPU executed total of 2 uops per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise). Loop Vectorization -most compilers feature auto-Vectorization options today- reduces pressure on the execution ports as multiple elements are calculated with same uop.", + "metric_TMA_......Ports_Utilized_3m(%)": "This metric represents fraction of cycles CPU executed total of 3 or more uops per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise).", + "metric_TMA_....Microcode_Sequencer(%)": "This metric represents fraction of slots the CPU was retiring uops fetched by the Microcode Sequencer (MS) unit. The MS is used for CISC instructions not supported by the default decoders (like repeat move strings; or CPUID); or by microcode assists used to address some operation modes (like in Floating Point assists). These cases can often be avoided.", + "metric_TMA_Info_System_SMT_2T_Utilization": "Fraction of cycles where both hardware Logical Processors were active", + } + const base_line = { xAxis: { name: "time (s)" @@ -274,9 +366,6 @@ return (
- - For the full list of metrics please see the csv output files. This contains explanations of a few key metrics to help get you started. - star }}> @@ -293,9 +382,11 @@ + + -
+
+ + + + + + Metadata + Value + + + + {meta_data.map((row) => ( + + + {row[0]} + + {row[1]} + + ))} + +
+
+
+ + + + + + Value + Metric + + + + {all_metrics.map((row) => ( + + + + + {row.metrics} + + + + + + {Math.round(Number(row["0"]) * 100) / 100} + + + + ))} + +
+
+
); diff --git a/src/perf_helpers.py b/src/perf_helpers.py index 326912d..87eecd3 100644 --- a/src/perf_helpers.py +++ b/src/perf_helpers.py @@ -10,7 +10,6 @@ import logging import os import re -import struct import subprocess # nosec import time from ctypes import cdll, CDLL @@ -95,31 +94,22 @@ def get_sys_devices(): # get imc and uncore counts # TODO:fix for memory config with some channels populated -def get_imc_upi_count(): +def get_imc_cha_upi_count(): sys_devs = get_sys_devices() + cha_count = 0 imc_count = 0 upi_count = 0 + if "uncore_cha" in sys_devs: + cha_count = int(sys_devs["uncore_cha"]) + if "uncore_cbox" in sys_devs: + cha_count = int(sys_devs["uncore_cbox"]) if "uncore_upi" in sys_devs: upi_count = int(sys_devs["uncore_upi"]) if "uncore_qpi" in sys_devs: upi_count = int(sys_devs["uncore_qpi"]) if "uncore_imc" in sys_devs: imc_count = int(sys_devs["uncore_imc"]) - return imc_count, upi_count - - -# get CHA count -def get_cha_count(): - cha_msrs = { - "0x396": "uncore client cha count", - "0x702": "uncore cha count", - "0x2FFE": "uncore cha count spr", - } - for msr in cha_msrs.keys(): - result = read_msr(int(msr, 16)) - if result is not None and result != 0: - return result - return 0 + return imc_count, cha_count, upi_count # get imc channel ids, channel ids are not consecutive in some cases (observed on bdw) @@ -208,54 +198,6 @@ def set_perf_event_mux_interval(reset, interval_ms, mux_interval): f_mux.write(str(val)) -# read the MSR register and return the value in dec format -def read_msr(msr, cpu=0): - fName = f"/dev/cpu/{cpu}/msr" - try: - with open(fName, "rb") as f: - f.seek(msr) - result = struct.unpack("Q", f.read(8))[0] - except OSError: - result = None - return result - - -# detect if PMU counters are in use -def pmu_contention_detect( - msrs={ - "0x309": {"name": "instructions", "value": None}, - "0x30a": {"name": "cpu cycles", "value": None}, - "0x30b": {"name": "ref cycles", "value": None}, - "0x30c": {"name": "topdown slots", "value": None}, - "0xc1": {"name": "general purpose PMU 1", "value": None}, - "0xc2": {"name": "general purpose PMU 2", "value": None}, - "0xc3": {"name": "general purpose PMU 3", "value": None}, - "0xc4": {"name": "general purpose PMU 4", "value": None}, - "0xc5": {"name": "general purpose PMU 5", "value": None}, - "0xc6": {"name": "general purpose PMU 6", "value": None}, - "0xc7": {"name": "general purpose PMU 7", "value": None}, - "0xc8": {"name": "general purpose PMU 8", "value": None}, - }, - detect=False, -): - warn = False - for r in msrs: - try: - value = read_msr(int(r, 16)) - if msrs[r]["value"] is not None and value != msrs[r]["value"]: - logging.warning("PMU in use: " + msrs[r]["name"]) - warn = True - msrs[r]["value"] = value - except IOError: - pass - if detect: - if warn: - logging.warning("output could be inaccurate") - else: - logging.info("PMUs not in use") - return msrs - - # get linux kernel version def get_version(): version = "" diff --git a/src/prepare_perf_events.py b/src/prepare_perf_events.py index d4e7c3c..d51e028 100644 --- a/src/prepare_perf_events.py +++ b/src/prepare_perf_events.py @@ -143,8 +143,6 @@ def filter_events(event_file, cpu_only, PID_CID_mode, TMA_supported): collection_events[-1] = end_event[:-1] + ";" else: collection_events.append(line) - if any("cpu-cycles" in event for event in unsupported_events): - crash("PMU's not available. Run in a full socket VM or baremetal") if len(unsupported_events) > 0: logging.warning( f"Perf unsupported events not counted: {unsupported_events}"