Skip to content

Commit

Permalink
adding metadata to logs and bug fixes (#39)
Browse files Browse the repository at this point in the history
* adding metadata to logs and bug fixes

* update version
  • Loading branch information
hilldani authored Jun 6, 2023
1 parent f41e963 commit b0a5cf5
Show file tree
Hide file tree
Showing 9 changed files with 122 additions and 53 deletions.
30 changes: 30 additions & 0 deletions .github/ISSUE_TEMPLATE/1-support-bugs.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
# GitHub issue form: bug report / support request.
# NOTE: extraction flattened the YAML nesting; indentation below is restored
# to the canonical GitHub issue-forms structure implied by the keys
# (body is a list of typed elements, each with attributes/validations).
name: 🐛 Bug Report/Support
description: Ask a question or report an issue
labels: [bug]
body:
  - type: markdown
    attributes:
      value: |
        Thank you for submitting a bug report. It helps make PerfSpect better.
        Please try to include as much information as possible.
  # render: shell formats the pasted log as a code block automatically.
  - type: textarea
    attributes:
      label: Verbose output from perf-collect
      render: shell
      description: Copy the output of `./perf-collect` with `-v` flag (it will automatically format as a code block)
  - type: textarea
    attributes:
      label: Verbose output from perf-postprocess
      render: shell
      description: Copy the output of `./perf-postprocess` with `-v` flag (it will automatically format as a code block)
  # Reproduction steps are the only mandatory field.
  - type: textarea
    attributes:
      label: What steps can reproduce the bug?
      description: Explain the bug, system setup, and provide a code snippet that can reproduce it.
    validations:
      required: true
  - type: textarea
    attributes:
      label: Additional information
      description: Is there anything else you think we should know?
21 changes: 21 additions & 0 deletions .github/ISSUE_TEMPLATE/2-feature-request.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
# GitHub issue form: feature request / enhancement.
# NOTE: extraction flattened the YAML nesting; indentation below is restored
# to the canonical GitHub issue-forms structure implied by the keys.
name: 🚀 Feature Request
description: Suggest an idea, feature, or enhancement
labels: [enhancement]
body:
  - type: markdown
    attributes:
      value: |
        Thank you for submitting an idea. It helps make PerfSpect better.
  # Problem statement and proposed solution are both mandatory;
  # alternatives are optional.
  - type: textarea
    attributes:
      label: What is the problem this feature would solve?
    validations:
      required: true
  - type: textarea
    attributes:
      label: What is the feature you are proposing to solve the problem?
    validations:
      required: true
  - type: textarea
    attributes:
      label: What alternatives have you considered?
1 change: 1 addition & 0 deletions .github/ISSUE_TEMPLATE/config.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
blank_issues_enabled: true
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@

PerfSpect is a system performance characterization tool built on top of linux perf. Most metrics and events come from [perfmon](https://github.com/intel/perfmon) and [TMA v4.5](https://www.intel.com/content/www/us/en/docs/vtune-profiler/cookbook/2023-1/top-down-microarchitecture-analysis-method.html). It contains two parts:

perf-collect: Collects hardware events
perf-collect: Collects hardware events at a 5 second output interval with practically zero overhead since PMUs run in counting mode.

- Collection mode:
- `sudo ./perf-collect` _default system wide_
Expand Down
2 changes: 1 addition & 1 deletion _version.txt
Original file line number Diff line number Diff line change
@@ -1 +1 @@
1.2.12
1.2.13
60 changes: 34 additions & 26 deletions perf-collect.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,20 +93,15 @@ def write_metadata(
cg_path_found = False
for path in cgroup_paths:
try:
cpu_set_file = open(path, "r")
cg_path_found = True
# no need to check other paths
break
with open(path, "r") as cpu_set_file:
cg_path_found = True
cpu_set = cpu_set_file.read()
cpu_set = cpu_set.strip()
cpu_set = cpu_set.replace(",", "+")
break
except FileNotFoundError:
# check next path
continue

if cg_path_found:
cpu_set = cpu_set_file.read()
cpu_set_file.close()
cpu_set = cpu_set.strip()
cpu_set = cpu_set.replace(",", "+")

if not cg_path_found or cpu_set == "":
# A missing path or an empty cpu-set in v2 indicates that the container is running on all CPUs
cpu_set = "0-" + str(
Expand Down Expand Up @@ -284,6 +279,18 @@ def validate_file(fname):
else:
crash("Unknown application type")

events, collection_events = prep_events.prepare_perf_events(
eventfile,
(
args.pid is not None
or args.cid is not None
or args.thread
or args.socket
or not have_uncore
),
args.pid is not None or args.cid is not None,
)

if not perf_helpers.validate_outfile(args.outcsv):
crash(
"Output filename not accepted. Filename should be a .csv without special characters"
Expand All @@ -299,7 +306,6 @@ def validate_file(fname):
cgroups = perf_helpers.get_cgroups(args.cid)

# get perf events to collect
collection_events = []
sys_devs = perf_helpers.get_sys_devices()
if (
"uncore_cha" not in sys_devs
Expand All @@ -318,21 +324,24 @@ def validate_file(fname):
logging.warning(
"Due to lack of vPMU support, TMA L1 & L2 events will not be collected"
)
events, collection_events = prep_events.prepare_perf_events(
eventfile,
(
args.pid is not None
or args.cid is not None
or args.thread
or args.socket
or not have_uncore
),
args.pid is not None or args.cid is not None,
)

if args.thread or args.socket or args.pid is not None or args.cid is not None:
logging.info("Not collecting uncore events in this run mode")

# log some metadata
logging.info("Architecture: " + arch)
logging.info("Model: " + cpuname)
logging.info("Kernel version: " + perf_helpers.get_version())
logging.info("Cores per socket: " + str(perf_helpers.get_cpu_count()))
logging.info("Socket: " + str(perf_helpers.get_socket_count()))
logging.info("Hyperthreading on: " + str(perf_helpers.get_ht_status()))
imc, upi = perf_helpers.get_imc_upi_count()
logging.info("IMC count: " + str(imc))
logging.info("CHA per socket: " + str(perf_helpers.get_cha_count()))
logging.info("UPI count: " + str(upi))
logging.info("PerfSpect version: " + perf_helpers.get_tool_version())
logging.info("/sys/devices/: " + str(sys_devs))

# build perf stat command
collection_type = "-a" if not args.thread and not args.socket else "-a -A"
cmd = f"perf stat -I {interval} -x , {collection_type} -o {args.outcsv}"
Expand All @@ -358,13 +367,12 @@ def validate_file(fname):
if args.verbose:
logging.info(cmd)
try:
logging.info("Collecting perf stat for events in : %s" % eventfilename)
start = time.time()
subprocess.call(perfargs) # nosec
end = time.time()
if end - start < 5.2:
if end - start < 7:
logging.warning(
"PerfSpect was run for less than 5 seconds, some events make be zero because they didn't get scheduled"
"PerfSpect was run for a short duration, some events might be zero or blank because they never got scheduled"
)
logging.info("Collection complete! Calculating TSC frequency now")
except KeyboardInterrupt:
Expand Down
15 changes: 13 additions & 2 deletions perf-postprocess.py
Original file line number Diff line number Diff line change
Expand Up @@ -129,6 +129,10 @@ def get_args(script_path):
# for socket or thread: add rows for each 2nd hyper thread with same values as 1st thread
def get_fixed_c6_residency_fields(perf_data_lines, perf_mode):
# handle special case events: c6-residency
# if hyperthreading is disabled, no fixing is required
if meta_data["constants"]["HYPERTHREADING_ON"] == 0:
return perf_data_lines

new_perf_data_lines = []
if meta_data["constants"]["CONST_THREAD_COUNT"] == 2:
for fields in perf_data_lines:
Expand Down Expand Up @@ -216,8 +220,11 @@ def get_all_data_lines(input_file_path):
fields = line.split(",")
perf_data_lines.append(fields)

infile.close()
return meta_data_lines, perf_events_lines, perf_data_lines
if len(perf_data_lines) == 0:
crash(
"perfstat.csv contains no perf event data, try collecting for a longer time"
)
return meta_data_lines, perf_events_lines, perf_data_lines


# get_metadata
Expand Down Expand Up @@ -416,6 +423,7 @@ def get_socket_number(sockets_dict, core):


def extract_dataframe(perf_data_lines, meta_data, perf_mode):
logging.info("Formatting event data")
# parse event data into dataframe and set header names
perf_data_df = pd.DataFrame(perf_data_lines)
if "CGROUPS" in meta_data and meta_data["CGROUPS"] == "enabled":
Expand Down Expand Up @@ -657,10 +665,13 @@ def generate_metrics(
}
prev_time_slice = 0
group_to_start_end_indexes = {}
logging.info("processing " + str(time_slice_groups.ngroups) + " samples")
for time_slice, item in time_slice_groups:
time_slice_float = float(time_slice)
if time_slice_float - prev_time_slice < 4.5:
logging.warning("throwing out last sample because it was too short")
if time_slice_groups.ngroups == 1:
crash("no remaining samples")
continue
time_slice_df = time_slice_groups.get_group(time_slice).copy()
# normalize by difference between current time slice and previous time slice
Expand Down
24 changes: 12 additions & 12 deletions src/base.html
Original file line number Diff line number Diff line change
Expand Up @@ -167,7 +167,7 @@
series: [
{
type: 'line',
data: CPUUTIL.map((e, i) => [i, e]),
data: CPUUTIL.map((e, i) => [i * 5, e]),
}
]
}
Expand All @@ -177,7 +177,7 @@
series: [
{
type: 'line',
data: CPIDATA.map((e, i) => [i, e]),
data: CPIDATA.map((e, i) => [i * 5, e]),
}
]
}
Expand All @@ -187,7 +187,7 @@
series: [
{
type: 'line',
data: CPUFREQ.map((e, i) => [i, e]),
data: CPUFREQ.map((e, i) => [i * 5, e]),
}
]
}
Expand All @@ -197,7 +197,7 @@
series: [
{
type: 'line',
data: REMOTENUMA.map((e, i) => [i, e]),
data: REMOTENUMA.map((e, i) => [i * 5, e]),
}
]
}
Expand All @@ -208,17 +208,17 @@
{
name: "L1D",
type: 'line',
data: L1DATA.map((e, i) => [i, e]),
data: L1DATA.map((e, i) => [i * 5, e]),
},
{
name: "L2",
type: 'line',
data: L2DATA.map((e, i) => [i, e]),
data: L2DATA.map((e, i) => [i * 5, e]),
},
{
name: "LLC Data",
type: 'line',
data: LLCDATA.map((e, i) => [i, e]),
data: LLCDATA.map((e, i) => [i * 5, e]),
},
]
}
Expand All @@ -229,17 +229,17 @@
{
name: "Read",
type: 'line',
data: READDATA.map((e, i) => [i, e]),
data: READDATA.map((e, i) => [i * 5, e]),
},
{
name: "Write",
type: 'line',
data: WRITEDATA.map((e, i) => [i, e]),
data: WRITEDATA.map((e, i) => [i * 5, e]),
},
{
name: "Total",
type: 'line',
data: TOTALDATA.map((e, i) => [i, e]),
data: TOTALDATA.map((e, i) => [i * 5, e]),
},
]
}
Expand All @@ -249,7 +249,7 @@
series: [
{
type: 'line',
data: PKGPOWER.map((e, i) => [i, e]),
data: PKGPOWER.map((e, i) => [i * 5, e]),
}
]
}
Expand All @@ -259,7 +259,7 @@
series: [
{
type: 'line',
data: DRAMPOWER.map((e, i) => [i, e]),
data: DRAMPOWER.map((e, i) => [i * 5, e]),
}
]
}
Expand Down
20 changes: 9 additions & 11 deletions src/perf_helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -269,19 +269,17 @@ def get_cpuinfo():
cpuinfo = []
temp_dict = {}
try:
fo = open("/proc/cpuinfo", "r")
with open("/proc/cpuinfo", "r") as fo:
for line in fo:
try:
key, value = list(map(str.strip, line.split(":", 1)))
except ValueError:
cpuinfo.append(temp_dict)
temp_dict = {}
else:
temp_dict[key] = value
except EnvironmentError as e:
logging.warning(str(e), UserWarning)
else:
for line in fo:
try:
key, value = list(map(str.strip, line.split(":", 1)))
except ValueError:
cpuinfo.append(temp_dict)
temp_dict = {}
else:
temp_dict[key] = value
fo.close()
return cpuinfo


Expand Down

0 comments on commit b0a5cf5

Please sign in to comment.