Skip to content

Commit

Permalink
updated TMA flags and postprocess refactor (#41)
Browse files Browse the repository at this point in the history
  • Loading branch information
hilldani authored Jun 13, 2023
1 parent 4e43047 commit f9fece6
Show file tree
Hide file tree
Showing 13 changed files with 416 additions and 244 deletions.
16 changes: 4 additions & 12 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,15 +1,15 @@
# PerfSpect · [![Build](https://github.com/intel/PerfSpect/actions/workflows/build.yml/badge.svg)](https://github.com/intel/PerfSpect/actions/workflows/build.yml)[![License](https://img.shields.io/badge/License-BSD--3-blue)](https://github.com/intel/PerfSpect/blob/master/LICENSE)

[Quick Start](#quick-start-requires-perf-installed) | [Output](#output) | [Requirements](#requirements) | [Build from source](#build-from-source) | [Caveats](#caveats) | [How to contribute](#how-to-contribute)
[Quick Start](#quick-start-requires-perf-installed) | [Output](#output) | [Requirements](#requirements) | [Build from source](#build-from-source)

PerfSpect is a system performance characterization tool built on top of Linux perf. Most metrics and events come from [perfmon](https://github.com/intel/perfmon) and [TMA v4.5](https://www.intel.com/content/www/us/en/docs/vtune-profiler/cookbook/2023-1/top-down-microarchitecture-analysis-method.html). It contains two parts:

perf-collect: Collects harware events at a 5 second output interval with practically zero overhead since PMU's run in counting mode.
perf-collect: Collects hardware events at a 5-second output interval with practically zero overhead since PMUs run in counting mode.

- Collection mode:
- `sudo ./perf-collect` _default system wide_
- `sudo ./perf-collect --socket`
- `sudo ./perf-collect --thread`
- `sudo ./perf-collect --cpu`
- `sudo ./perf-collect --pid <process-id>`
- `sudo ./perf-collect --cid` _by default, selects the 5 containers using the most CPU at the start of perf-collect. To monitor specific containers, provide up to 5 comma-separated cids, e.g. <cid_1>,<cid_2>_
- Duration:
Expand Down Expand Up @@ -79,12 +79,4 @@ Requires recent python. On successful build, binaries will be created in `dist`
```
pip3 install -r requirements.txt
make
```

## Caveats

1. The tool can collect only the counters supported by the underlying Linux perf version.

## How to contribute

Create a pull request on github.com/intel/PerfSpect with your patch. Please make sure your patch is building without errors. A maintainer will contact you if there are questions or concerns.
```
2 changes: 1 addition & 1 deletion _version.txt
Original file line number Diff line number Diff line change
@@ -1 +1 @@
1.2.14
1.3.0
10 changes: 9 additions & 1 deletion events/bdx.txt
Original file line number Diff line number Diff line change
Expand Up @@ -132,6 +132,14 @@ cpu-cycles:k,
ref-cycles:k,
instructions:k;

cpu/event=0xd0,umask=0x21,cmask=0x00,period=100007,name='MEM_INST_RETIRED.LOCK_LOADS'/,
cpu/event=0x79,umask=0x08,cmask=0x00,period=2000003,name='IDQ.DSB_UOPS'/,
cpu/event=0x79,umask=0x04,cmask=0x00,period=2000003,name='IDQ.MITE_UOPS'/,
cpu/event=0xa8,umask=0x01,cmask=0x00,period=2000003,name='LSD.UOPS'/,
cpu-cycles,
ref-cycles,
instructions;

#C6
cstate_core/c6-residency/;
cstate_pkg/c6-residency/;
Expand All @@ -145,7 +153,7 @@ cbox/event=0x35,umask=0x3,filter_opc=0x180,name='UNC_C_TOR_INSERTS.MISS_OPCODE.0
cbox/event=0x35,umask=0x3,filter_opc=0x181,name='UNC_C_TOR_INSERTS.MISS_OPCODE.0x181'/;
cbox/event=0x35,umask=0x3,filter_opc=0x182,name='UNC_C_TOR_INSERTS.MISS_OPCODE.0x182'/;
cbox/event=0x35,umask=0x3,filter_opc=0x190,name='UNC_C_TOR_INSERTS.MISS_OPCODE.0x190'/;
cbox/event=0x35,umask=0x3,filter_opc=0x191,name='UNC_C_TOR_INSERTS.MISS_OPCODE.0x191'/,
cbox/event=0x35,umask=0x3,filter_opc=0x191,name='UNC_C_TOR_INSERTS.MISS_OPCODE.0x191'/;
cbox/event=0x35,umask=0x3,filter_opc=0x192,name='UNC_C_TOR_INSERTS.MISS_OPCODE.0x192'/;
cbox/event=0x35,umask=0x3,filter_opc=0x180,tid_en=1,filter_tid=0x3e,name='UNC_C_TOR_INSERTS.MISS_OPCODE.tid.0x180'/;
cbox/event=0x36,umask=0x3,filter_opc=0x182,name='UNC_C_TOR_OCCUPANCY.MISS_OPCODE.0x182'/;
Expand Down
13 changes: 10 additions & 3 deletions events/clx_skx.txt
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ instructions;
cpu/event=0x0e,umask=0x01,period=2000003,name='UOPS_ISSUED.ANY'/,
cpu/event=0xc2,umask=0x02,period=2000003,name='UOPS_RETIRED.RETIRE_SLOTS'/,
cpu/event=0x0d,umask=0x01,period=2000003,name='INT_MISC.RECOVERY_CYCLES_ANY'/,
cpu/event=0x0d,umask=0x01,period=2000003,name='INT_MISC.RECOVERY_CYCLES'/;
cpu/event=0x0d,umask=0x01,period=2000003,name='INT_MISC.RECOVERY_CYCLES'/,
cpu-cycles,
ref-cycles,
instructions;
Expand Down Expand Up @@ -156,6 +156,13 @@ cpu-cycles:k,
ref-cycles:k,
instructions:k;

cpu/event=0x79,umask=0x08,cmask=0x00,period=2000003,name='IDQ.DSB_UOPS'/,
cpu/event=0x79,umask=0x04,cmask=0x00,period=2000003,name='IDQ.MITE_UOPS'/,
cpu/event=0xa8,umask=0x01,cmask=0x00,period=2000003,name='LSD.UOPS'/,
cpu-cycles,
ref-cycles,
instructions;

cpu/event=0x79,umask=0x24,cmask=0x01,period=2000003,name='IDQ.ALL_MITE_CYCLES_ANY_UOPS'/,
cpu/event=0x79,umask=0x24,cmask=0x04,period=2000003,name='IDQ.ALL_MITE_CYCLES_4_UOPS'/,
cpu/event=0xb7,umask=0x01,offcore_rsp=0x10003C0002,period=100003,name='OCR.DEMAND_RFO.L3_HIT.HITM_OTHER_CORE'/,
Expand All @@ -175,15 +182,15 @@ instructions;
cpu/event=0xd2,umask=0x02,cmask=0x00,period=20011,name='MEM_LOAD_L3_HIT_RETIRED.XSNP_HIT'/,
cpu/event=0xd2,umask=0x04,cmask=0x00,period=20011,name='MEM_LOAD_L3_HIT_RETIRED.XSNP_HITM'/,
cpu/event=0xb7,umask=0x01,offcore_rsp=0x3F840007F7,name='OCR.ALL_READS.L3_MISS_LOCAL_DRAM.ANY_SNOOP'/,
cpu/event=0xb7,umask=0x01,offcore_rsp=0x3FB80007F7,name='OCR.ALL_READS.L3_MISS_LOCAL_DRAM.ANY_SNOOP_ocr_msr_3fB80007f7'/;
cpu/event=0xb7,umask=0x01,offcore_rsp=0x3FB80007F7,name='OCR.ALL_READS.L3_MISS_LOCAL_DRAM.ANY_SNOOP_ocr_msr_3fB80007f7'/,
cpu-cycles,
ref-cycles,
instructions;

cpu/event=0xb1,umask=0x10,cmask=0x00,period=2000003,name='UOPS_EXECUTED.X87'/,
cpu/event=0xb1,umask=0x01,cmask=0x00,period=2000003,name='UOPS_EXECUTED.THREAD'/,
cpu/event=0xb7,umask=0x01,offcore_rsp=0x103FC007F7,name='OCR.ALL_READS.L3_MISS.REMOTE_HITM'/,
cpu/event=0xb7,umask=0x01,offcore_rsp=0x083FC007F7,name='OCR.ALL_READS.L3_MISS.REMOTE_HIT_FORWARD'/;
cpu/event=0xb7,umask=0x01,offcore_rsp=0x083FC007F7,name='OCR.ALL_READS.L3_MISS.REMOTE_HIT_FORWARD'/,
cpu-cycles,
ref-cycles,
instructions;
Expand Down
4 changes: 3 additions & 1 deletion events/icx.txt
Original file line number Diff line number Diff line change
Expand Up @@ -142,6 +142,7 @@ instructions;
cpu/event=0xd1,umask=0x80,cmask=0x00,period=100003,name='MEM_LOAD_RETIRED.LOCAL_PMM'/,
cpu/event=0xd3,umask=0x10,cmask=0x00,period=100007,name='MEM_LOAD_L3_MISS_RETIRED.REMOTE_PMM'/,
cpu/event=0xa3,umask=0x06,cmask=0x06,period=1000003,name='CYCLE_ACTIVITY.STALLS_L3_MISS'/,
cpu/event=0xa3,umask=0x0c,cmask=0x0c,period=1000003,name='CYCLE_ACTIVITY.STALLS_L1D_MISS'/,
cpu-cycles,
ref-cycles,
instructions;
Expand All @@ -160,7 +161,8 @@ instructions:k;

cpu/event=0x60,umask=0x04,cmask=0x01,period=1000003,name='OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_RFO'/,
cpu/event=0xb7,umask=0x01,cmask=0x00,offcore_rsp=0x10003C0002,name='OCR.DEMAND_RFO.L3_HIT.SNOOP_HITM'/,
cpu/event=0xa3,umask=0x0c,cmask=0x0c,period=1000003,name='CYCLE_ACTIVITY.STALLS_L1D_MISS'/,
cpu/event=0x79,umask=0x08,cmask=0x00,period=2000003,name='IDQ.DSB_UOPS'/,
cpu/event=0xa8,umask=0x01,cmask=0x00,period=2000003,name='LSD.UOPS'/,
cpu-cycles,
ref-cycles,
instructions;
Expand Down
13 changes: 13 additions & 0 deletions events/metric_bdx.json
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,11 @@
"expression": "[cpu-cycles:k] / [instructions:k]",
"origin": "perfspect"
},
{
"name": "metric_locks retired per instr",
"expression": "[MEM_INST_RETIRED.LOCK_LOADS] / [instructions]",
"origin": "perfmon website"
},
{
"name": "metric_L1D MPI (includes data+rfo w/ prefetches)",
"expression": "[L1D.REPLACEMENT] / [instructions]"
Expand Down Expand Up @@ -155,6 +160,14 @@
"expression": "100 * [cstate_pkg/c6-residency/] * [CORES_PER_SOCKET] / [TSC]",
"origin": "perfspect"
},
{
"name": "metric_% Uops delivered from decoded Icache (DSB)",
"expression": "100 * ([IDQ.DSB_UOPS] / [UOPS_ISSUED.ANY])"
},
{
"name": "metric_% Uops delivered from legacy decode pipeline (MITE)",
"expression": "100 * ([IDQ.MITE_UOPS] / [UOPS_ISSUED.ANY])"
},
{
"name": "metric_memory bandwidth read (MB/sec)",
"expression": "([UNC_M_CAS_COUNT.RD] * 64 / 1000000) / 1"
Expand Down
13 changes: 13 additions & 0 deletions events/metric_icx.json
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,11 @@
"expression": "[instructions] / 1000000000",
"origin": "perfspect"
},
{
"name": "metric_locks retired per instr",
"expression": "[MEM_INST_RETIRED.LOCK_LOADS] / [instructions]",
"origin": "perfmon website"
},
{
"name": "metric_L1D MPI (includes data+rfo w/ prefetches)",
"expression": "[L1D.REPLACEMENT] / [instructions]"
Expand Down Expand Up @@ -88,6 +93,14 @@
"expression": "100 * [cstate_pkg/c6-residency/] * [CORES_PER_SOCKET] / [TSC]",
"origin": "perfspect"
},
{
"name": "metric_% Uops delivered from decoded Icache (DSB)",
"expression": "100 * ([IDQ.DSB_UOPS] / ([IDQ.DSB_UOPS] + [IDQ.MITE_UOPS] + [IDQ.MS_UOPS] + [LSD.UOPS]) )"
},
{
"name": "metric_% Uops delivered from legacy decode pipeline (MITE)",
"expression": "100 * ([IDQ.MITE_UOPS] / ([IDQ.DSB_UOPS] + [IDQ.MITE_UOPS] + [IDQ.MS_UOPS] + [LSD.UOPS]) )"
},
{
"name": "metric_core % cycles in non AVX license",
"expression": "(100 * [CORE_POWER.LVL0_TURBO_LICENSE]) / ([CORE_POWER.LVL0_TURBO_LICENSE] + [CORE_POWER.LVL1_TURBO_LICENSE] + [CORE_POWER.LVL2_TURBO_LICENSE])",
Expand Down
15 changes: 15 additions & 0 deletions events/metric_skx_clx.json
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,11 @@
"expression": "[cpu-cycles:k] / [instructions:k]",
"origin": "perfspect"
},
{
"name": "metric_locks retired per instr",
"expression": "[MEM_INST_RETIRED.LOCK_LOADS] / [instructions]",
"origin": "perfmon website"
},
{
"name": "metric_L1D MPI (includes data+rfo w/ prefetches)",
"expression": "[L1D.REPLACEMENT] / [instructions]"
Expand Down Expand Up @@ -178,6 +183,16 @@
"expression": "100 * [cstate_pkg/c6-residency/] * [CORES_PER_SOCKET] / [TSC]",
"origin": "perfspect"
},
{
"name": "metric_% Uops delivered from decoded Icache (DSB)",
"expression1": "1",
"expression": "100 * ([IDQ.DSB_UOPS] / ([IDQ.DSB_UOPS] + [IDQ.MITE_UOPS] + [IDQ.MS_UOPS] + [LSD.UOPS]) )"
},
{
"name": "metric_% Uops delivered from legacy decode pipeline (MITE)",
"expression1": "1",
"expression": "100 * ([IDQ.MITE_UOPS] / ([IDQ.DSB_UOPS] + [IDQ.MITE_UOPS] + [IDQ.MS_UOPS] + [LSD.UOPS]) )"
},
{
"name": "metric_core % cycles in non AVX license",
"expression": "(100 * [CORE_POWER.LVL0_TURBO_LICENSE]) / ([CORE_POWER.LVL0_TURBO_LICENSE] + [CORE_POWER.LVL1_TURBO_LICENSE] + [CORE_POWER.LVL2_TURBO_LICENSE])",
Expand Down
15 changes: 15 additions & 0 deletions events/metric_spr.json
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,11 @@
"expression": "[instructions] / 1000000000",
"origin": "perfspect"
},
{
"name": "metric_locks retired per instr",
"expression": "[MEM_INST_RETIRED.LOCK_LOADS] / [instructions]",
"origin": "perfmon website"
},
{
"name": "metric_L1D MPI (includes data+rfo w/ prefetches)",
"expression": "[L1D.REPLACEMENT] / [instructions]"
Expand Down Expand Up @@ -82,6 +87,16 @@
"name": "metric_package c6 residency %",
"expression": "100 * [cstate_pkg/c6-residency/] * [CORES_PER_SOCKET] / [TSC]",
"origin": "perfspect"
},
{
"name": "metric_% Uops delivered from decoded Icache (DSB)",
"expression1": "1",
"expression": "100 * ([IDQ.DSB_UOPS] / ([IDQ.DSB_UOPS] + [IDQ.MITE_UOPS] + [IDQ.MS_UOPS] + [LSD.UOPS]) )"
},
{
"name": "metric_% Uops delivered from legacy decode pipeline (MITE)",
"expression1": "1",
"expression": "100 * ([IDQ.MITE_UOPS] / ([IDQ.DSB_UOPS] + [IDQ.MITE_UOPS] + [IDQ.MS_UOPS] + [LSD.UOPS]) )"
},
{
"name": "metric_core initiated local dram read bandwidth (MB/sec)",
Expand Down
8 changes: 8 additions & 0 deletions events/spr.txt
Original file line number Diff line number Diff line change
Expand Up @@ -133,6 +133,14 @@ cpu-cycles,
ref-cycles,
instructions;

cpu/event=0x79,umask=0x08,cmask=0x00,period=2000003,name='IDQ.DSB_UOPS'/,
cpu/event=0x79,umask=0x04,period=100003,name='IDQ.MITE_UOPS'/,
cpu/event=0x79,umask=0x20,period=100003,name='IDQ.MS_UOPS'/,
cpu/event=0xa8,umask=0x01,cmask=0x00,period=2000003,name='LSD.UOPS'/,
cpu-cycles,
ref-cycles,
instructions;

cpu/event=0xd3,umask=0x08,cmask=0x00,period=100007,name='MEM_LOAD_L3_MISS_RETIRED.REMOTE_FWD'/,
cpu/event=0xd3,umask=0x04,cmask=0x00,period=100007,name='MEM_LOAD_L3_MISS_RETIRED.REMOTE_HITM'/,
cpu/event=0x2a,umask=0x01,offcore_rsp=0x1030004477,name='OCR.READS_TO_CORE.REMOTE_CACHE.SNOOP_HITM'/,
Expand Down
31 changes: 17 additions & 14 deletions perf-collect.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ def write_metadata(
cpuname,
cpuid_info,
muxinterval,
thread,
cpu,
socket,
psi,
):
Expand Down Expand Up @@ -69,7 +69,7 @@ def write_metadata(
modified.write(str(c) + ";")
modified.write("\n")
modified.write("Perf event mux Interval ms," + str(muxinterval) + ",\n")
threadmode = "enabled" if thread else "disabled"
cpumode = "enabled" if cpu else "disabled"
socketmode = "enabled" if socket else "disabled"
if args.cid is not None:
cgname = "enabled,"
Expand Down Expand Up @@ -114,7 +114,7 @@ def write_metadata(
cpusets = ",disabled"

modified.write("cpusets" + cpusets + ",\n")
modified.write("Percore mode," + threadmode + ",\n")
modified.write("Percpu mode," + cpumode + ",\n")
modified.write("Persocket mode," + socketmode + ",\n")
modified.write("PSI," + json.dumps(psi) + "\n")
modified.write("PerfSpect version," + perf_helpers.get_tool_version() + ",\n")
Expand Down Expand Up @@ -204,18 +204,17 @@ def validate_file(fname):
"-p", "--pid", type=str, default=None, help="perf-collect on selected PID(s)"
)
runmode.add_argument(
"-c",
"--cid",
help="perf-collect on up to 5 cgroups. Provide comma separated cids like e19f4fb59,6edca29db (by default, selects the 5 containers using the most CPU)",
type=str,
nargs="?",
const="",
)
runmode.add_argument(
"--thread", help="Collect for thread metrics", action="store_true"
"-c", "--cpu", help="Collect for cpu metrics", action="store_true"
)
runmode.add_argument(
"--socket", help="Collect for socket metrics", action="store_true"
"-s", "--socket", help="Collect for socket metrics", action="store_true"
)
parser.add_argument(
"-m",
Expand Down Expand Up @@ -252,8 +251,8 @@ def validate_file(fname):
interval = 5000
collect_psi = False

if args.thread:
logging.info("Run mode: thread")
if args.cpu:
logging.info("Run mode: cpu")
collect_psi = supports_psi()
elif args.socket:
logging.info("Run mode: socket")
Expand Down Expand Up @@ -315,11 +314,13 @@ def validate_file(fname):
):
logging.info("disabling uncore (possibly in a vm?)")
have_uncore = False
if arch == "icelake":
if arch == "icelake" and initial_pmus["0x30c"]["value"] is None:
logging.warning(
"Due to lack of vPMU support, TMA L1 events will not be collected"
)
if arch == "sapphirerapids" or arch == "emeraldrapids":
if (arch == "sapphirerapids" or arch == "emeraldrapids") and initial_pmus[
"0x30c"
]["value"] is None:
logging.warning(
"Due to lack of vPMU support, TMA L1 & L2 events will not be collected"
)
Expand All @@ -328,11 +329,12 @@ def validate_file(fname):
(
args.pid is not None
or args.cid is not None
or args.thread
or args.cpu
or args.socket
or not have_uncore
),
args.pid is not None or args.cid is not None,
initial_pmus["0x30c"]["value"] is not None,
)

if not perf_helpers.validate_outfile(args.outcsv):
Expand All @@ -349,7 +351,7 @@ def validate_file(fname):
if args.cid is not None:
cgroups = perf_helpers.get_cgroups(args.cid)

if args.thread or args.socket or args.pid is not None or args.cid is not None:
if args.cpu or args.socket or args.pid is not None or args.cid is not None:
logging.info("Not collecting uncore events in this run mode")

# log some metadata
Expand All @@ -368,7 +370,7 @@ def validate_file(fname):
logging.info("/sys/devices/: " + str(sys_devs))

# build perf stat command
collection_type = "-a" if not args.thread and not args.socket else "-a -A"
collection_type = "-a" if not args.cpu and not args.socket else "-a -A"
cmd = f"perf stat -I {interval} -x , {collection_type} -o {args.outcsv}"
if args.pid:
cmd += f" --pid {args.pid}"
Expand All @@ -392,6 +394,7 @@ def validate_file(fname):
if args.verbose:
logging.info(cmd)
psi = []
logging.info("Collection started!")
start = time.time()
try:
perf = subprocess.Popen(perfargs) # nosec
Expand Down Expand Up @@ -423,7 +426,7 @@ def validate_file(fname):
cpuname,
cpuid_info,
args.muxinterval,
args.thread,
args.cpu,
args.socket,
list(map(list, zip(*psi))),
)
Expand Down
Loading

0 comments on commit f9fece6

Please sign in to comment.