Skip to content

Commit

Permalink
Merge pull request #114 from olcf/nick-fix-log-level
Browse files Browse the repository at this point in the history
Fix the --loglevel flag
  • Loading branch information
ddietz89 authored Dec 21, 2023
2 parents 01bd1b6 + f6462b7 commit 9acbe49
Show file tree
Hide file tree
Showing 9 changed files with 87 additions and 50 deletions.
46 changes: 29 additions & 17 deletions harness/bin/test_harness_driver.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,10 @@ def create_parser():
required=False,
type=str,
help='Annotate test status with given harness launch id')
my_parser.add_argument('--loglevel',
default='CRITICAL',
choices=['DEBUG', 'debug', 'INFO', 'info', 'WARNING', 'warning', 'ERROR', 'error', 'CRITICAL', 'critical'],
help='Control the level of information printed to stdout.')
my_parser.add_argument('-r', '--resubmit',
help='Have the application test batch script resubmit itself, optionally for a total submission count of N. Leave off N for infinite submissions.',
action='store', nargs='?', type=int, const=-1, default=False)
Expand Down Expand Up @@ -243,8 +247,7 @@ def auto_generated_scripts(harness_config,
if job_id != "0":
jstatus.log_event(status_file.StatusFile.EVENT_JOB_QUEUED, job_id)
else:
print("SUBMIT ERROR: failed to retrieve job id!")
message = f"{messloc} Failed to retrieve the job id."
message = f"{messloc} Submit error, failed to retrieve the job id."
a_logger.doCriticalLogging(message)
submit_exit_value = 1

Expand Down Expand Up @@ -274,8 +277,7 @@ def auto_generated_scripts(harness_config,
run_exit_value = p.returncode
run_stdout.close()
else:
print("RUN ERROR: failed to retrieve job id!")
message = f"{messloc} Failed to retrieve the job id."
message = f"{messloc} Run error, failed to retrieve the job id."
a_logger.doCriticalLogging(message)
run_exit_value = 1

Expand All @@ -293,9 +295,11 @@ def auto_generated_scripts(harness_config,
mymachine.start_report_executable()
influx_reported = mymachine.log_to_influx()
if not influx_reported:
print("Results not logged to influxDB")
message = f"{messloc} results not logged to InfluxDB."
a_logger.doCriticalLogging(message)
else:
print("CHECK ERROR: failed to retrieve job id!")
message = f"{messloc} check error, failed to retrieve the job id."
a_logger.doCriticalLogging(message)
check_exit_value = 1

exit_values = {
Expand Down Expand Up @@ -394,9 +398,13 @@ def test_harness_driver(argv=None):
# Instantiate application subtest
logger_name = get_logger_name()
fh_filepath = get_path_to_logfile_from_scriptdir(testscripts,unique_id)
logger_threshold = "INFO"
fh_threshold_log_level = "INFO"
ch_threshold_log_level = "CRITICAL"
# Logger threshold screens ALL traffic before it gets to file/console handlers. Set to lowest level
logger_threshold = MODULE_THRESHOLD_LOG_LEVEL
# Always set file handler level to MODULE_THRESHOLD_LOG_LEVEL
fh_threshold_log_level = MODULE_THRESHOLD_LOG_LEVEL
# loglevel arg controls the console level
ch_threshold_log_level = Vargs.loglevel
#print(f"In test_harness_driver, creating logger with name={logger_name}, filepath={fh_filepath}")
a_logger = rgt_logger_factory.create_rgt_logger(
logger_name=logger_name,
fh_filepath=fh_filepath,
Expand All @@ -423,7 +431,7 @@ def test_harness_driver(argv=None):
import shutil
message = f'The kill file {kill_file} exists. It must be removed to run this test.\n'
message += "Stopping test cycle."
print(message)
apptest.doCriticalLogging(message)
runarchive_dir = apptest.get_path_to_runarchive()
logging.shutdown()
shutil.rmtree(runarchive_dir,ignore_errors=True)
Expand Down Expand Up @@ -455,9 +463,13 @@ def test_harness_driver(argv=None):
# We now create the status file if it doesn't already exist.
logger_name = "status_file."+ app + "__" + test
fh_filepath = apptest.path_to_status_logfile
logger_threshold = "INFO"
fh_threshold_log_level = "INFO"
ch_threshold_log_level = "CRITICAL"
# Logger threshold screens ALL traffic before it gets to file/console handlers. Set to lowest level
logger_threshold = MODULE_THRESHOLD_LOG_LEVEL
# Always set file handler level to MODULE_THRESHOLD_LOG_LEVEL
fh_threshold_log_level = MODULE_THRESHOLD_LOG_LEVEL
# loglevel arg controls the console level
ch_threshold_log_level = Vargs.loglevel
#print(f"In test_harness_driver, creating logger with name={logger_name}, filepath={fh_filepath}")
sfile_logger = rgt_logger_factory.create_rgt_logger(
logger_name=logger_name,
fh_filepath=fh_filepath,
Expand Down Expand Up @@ -493,22 +505,22 @@ def test_harness_driver(argv=None):
build_exit_value = 0
if actions['build']:
build_exit_value = exit_values['build']
print(f'build exit value = {build_exit_value}')
apptest.doInfoLogging(f'build exit value = {build_exit_value}')

submit_exit_value = 0
if actions['submit']:
submit_exit_value = exit_values['submit']
print(f'submit exit value = {submit_exit_value}')
apptest.doInfoLogging(f'submit exit value = {submit_exit_value}')

run_exit_value = 0
if actions['run']:
run_exit_value = exit_values['run']
print(f'run exit value = {run_exit_value}')
apptest.doInfoLogging(f'run exit value = {run_exit_value}')

check_exit_value = 0
if actions['check']:
check_exit_value = exit_values['check']
print(f'check exit value = {check_exit_value}')
apptest.doInfoLogging(f'check exit value = {check_exit_value}')

# Now read the result from the job_status.txt file.
jspath = os.path.join(status_dir, layout.job_status_filename)
Expand Down
10 changes: 10 additions & 0 deletions harness/libraries/a
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
A quick note on the logging module and the log levels:
The logger_threshold_level filters *all* log messages before they ever reach the file or console handlers.
So if you set logger_threshold_level to WARNING, the file and console handlers will only ever receive messages at WARNING level or above.
The fh_threshold_level and ch_threshold_level then decide what level of messages to emit to the file and console handlers, respectively.

Major changes:
- Added a `--loglevel` flag to `test_harness_driver.py`.
- Changed the `--loglevel` functionality: instead of changing the `logger_threshold_level`, the flag now only changes the console (stream handler) log level.
- The file handler (log file) is hard-coded to INFO, and the `logger_threshold_level` is set to either `NOTSET` or `DEBUG`.
33 changes: 18 additions & 15 deletions harness/libraries/apptest.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,7 @@ def __init__(self,
name_of_subtest])
self.__number_of_iterations = -1
self.__myLogger = logger
self.__loglevel = self.__myLogger.get_ch_threshold_level()

#@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
# @
Expand Down Expand Up @@ -524,7 +525,7 @@ def _start_test(self,
os.remove(pathtokillfile)

# This will automatically build & submit
starttestcomand = f"test_harness_driver.py -r -l {launchid}"
starttestcomand = f"test_harness_driver.py -r -l {launchid} --loglevel {self.__loglevel}"
if separate_build_stdio:
starttestcomand += "--separate-build-stdio"

Expand Down Expand Up @@ -590,7 +591,7 @@ def _influx_log_mode(self):
else:
self.logger.doWarningLogging(f"Unable to log all events for {test_id}")
elif os.path.islink(f"./{test_id}"):
self.logger.doInfoLogging(f"Ignoring link in influx_log_mode: {test_id}")
self.logger.doDebugLogging(f"Ignoring link in influx_log_mode: {test_id}")
elif not self._machine_matches(test_id):
self.logger.doInfoLogging(f"Skipping test from another machine: {test_id}")

Expand All @@ -599,7 +600,7 @@ def _influx_log_mode(self):
def _machine_matches(self, test_id):
""" Checks if RGT_MACHINE_NAME is the same as the test machine name """
if not 'RGT_MACHINE_NAME' in os.environ:
self.logger.doWarningLogging("RGT_MACHINE_NAME not found in environment. Skipping machine name check.")
self.logger.doErrorLogging("RGT_MACHINE_NAME not found in environment. Skipping machine name check.")
return False
log_start_status_file = f"{self.get_path_to_test()}/{self.test_status_dirname}/{test_id}/"
log_start_status_file += f"{StatusFile.EVENT_DICT[StatusFile.EVENT_LOGGING_START][0]}"
Expand Down Expand Up @@ -677,6 +678,7 @@ def _log_to_influx(self, influx_test_id, post_run=False):
def local_send_to_influx(influx_url, influx_event_record_string, headers):
try:
if 'RGT_INFLUX_NO_SEND' in os.environ and os.environ['RGT_INFLUX_NO_SEND'] == '1':
# RGT_INFLUX_NO_SEND explicitly tells the harness to print the Influx Event string, so use print()
print(f"RGT_INFLUX_NO_SEND is set, echoing: {influx_event_record_string}")
elif not 'requests' in sys.modules:
self.logger.doWarningLogging(f"InfluxDB is currently disabled. Reason: 'requests' module was unable to load. Skipping InfluxDB message: {influx_event_record_string}. This can be logged after the run using the harness --mode influx_log or by POSTing this message to the InfluxDB server.")
Expand All @@ -688,7 +690,7 @@ def local_send_to_influx(influx_url, influx_event_record_string, headers):
return False
self.logger.doInfoLogging(f"Successfully sent {influx_event_record_string} to {influx_url}")
except requests.exceptions.ConnectionError as e:
self.logger.doWarningLogging(f"InfluxDB is not reachable. Request not sent: {influx_event_record_string}")
self.logger.doErrorLogging(f"InfluxDB is not reachable. Request not sent: {influx_event_record_string}")
return False
except Exception as e:
# TODO: add more graceful handling of unreachable influx servers
Expand Down Expand Up @@ -742,7 +744,7 @@ def local_send_to_influx(influx_url, influx_event_record_string, headers):
if post_run:
run_timestamp = self._get_run_timestamp(influx_test_id)
if run_timestamp < 0:
self.logger.doWarningLogging(f"Run Timestamp invalid for jobID {influx_test_id}: {run_timestamp}")
self.logger.doErrorLogging(f"Run Timestamp invalid for jobID {influx_test_id}: {run_timestamp}")
os.chdir(currentdir)
return False

Expand Down Expand Up @@ -773,7 +775,7 @@ def local_send_to_influx(influx_url, influx_event_record_string, headers):
influx_event_record_string += f" {run_timestamp}"
# If we've made it this far without do_log_metric set to False, then all our checking has completed
if do_log_metric and local_send_to_influx(influx_url, influx_event_record_string, headers):
self.logger.doWarningLogging(f"Successfully logged metrics to Influx.")
self.logger.doInfoLogging(f"Successfully logged metrics to Influx.")
success_log_attempts += 1
elif do_log_metric:
# Then logging failed
Expand Down Expand Up @@ -818,15 +820,16 @@ def local_send_to_influx(influx_url, influx_event_record_string, headers):
influx_event_record_string += f' {run_timestamp}'
if local_send_to_influx(influx_url, influx_event_record_string, headers):
success_log_attempts += 1
self.logger.doWarningLogging(f"Successfully logged node health for {node_name} to Influx.")
self.logger.doInfoLogging(f"Successfully logged node health for {node_name} to Influx.")
else:
failed_log_attempts += 1
self.logger.doWarningLogging(f"Logging node health to Influx failed.")
self.logger.doErrorLogging(f"Logging node health to Influx failed.")
elif 'RGT_NODE_LOCATION_FILE' in os.environ:
self.logger.doWarningLogging(f"Node location file path does not exist: {os.environ['RGT_NODE_LOCATION_FILE']}.")
self.logger.doWarningLogging(f"Skipping node health logging. To re-log, remove the .influx_logged file in Run_Archive and run in mode influx_log.")
message = f"Node location file path does not exist: {os.environ['RGT_NODE_LOCATION_FILE']}."
message += f"\nSkipping node health logging. To re-log, remove the .influx_logged file in Run_Archive and run in mode influx_log."
self.logger.doErrorLogging(message)
else:
self.logger.doWarningLogging(f"RGT_NODE_LOCATION_FILE not in os.environ, skipping node health logging.")
self.logger.doErrorLogging(f"RGT_NODE_LOCATION_FILE not in os.environ, skipping node health logging.")


# We're in Run_Archive. The Influx POST request has succeeded, as far as we know,
Expand Down Expand Up @@ -884,7 +887,7 @@ def _get_time_diff_of_status_files(self, start_event_file, end_event_file, test_
line = next(end_fstr)
end_timestamp = line.split()[0]
if len(start_timestamp) <= 1 or len(end_timestamp) <= 1:
print(f"Invalid start or end timestamp: {start_timestamp}, {end_timestamp}")
self.logger.doErrorLogging(f"Invalid start or end timestamp: {start_timestamp}, {end_timestamp}")
return -1
#start_ts_dt = datetime.fromisoformat(start_timestamp)
#end_ts_dt = datetime.fromisoformat(end_timestamp)
Expand Down Expand Up @@ -938,7 +941,7 @@ def is_numeric(s):
# Wrap strings in double quotes to send to Influx
metrics[metric_name] = f'"{line_splt[1]}"'
else:
self.logger.doWarningLogging(f"Found a line in metrics.txt with 0 or >1 equals signs:\n{line.strip()}")
self.logger.doErrorLogging(f"Found a line in metrics.txt with 0 or >1 equals signs:\n{line.strip()}")
return metrics

def _get_node_health(self, machine_name, app_name, test_name):
Expand Down Expand Up @@ -974,14 +977,14 @@ def _get_node_health(self, machine_name, app_name, test_name):
node_healths[node_name]['status'] = status_string
break
if not 'status' in node_healths[node_name]:
self.logger.doWarningLogging(f"Could not find status for the string {line_splt[1]}. Skipping node health logging.")
self.logger.doErrorLogging(f"Could not find the status group for the status string {line_splt[1]}. Skipping node health logging.")
return_empty = True
node_healths[node_name]['message'] = ''
if len(line_splt) >= 3:
node_healths[node_name]['message'] = ' '.join(line_splt[2:])
# If there were errors parsing the node health file, we don't want to try to log anything
if return_empty:
self.logger.doWarningLogging(f"There were errors parsing nodecheck.txt. Not logging node healths.")
self.logger.doErrorLogging(f"There were errors parsing nodecheck.txt. Not logging node healths.")
return {}
return node_healths

Expand Down
4 changes: 2 additions & 2 deletions harness/libraries/layout_of_apps_directory.py
Original file line number Diff line number Diff line change
Expand Up @@ -242,7 +242,7 @@ def create_test_status(self):
# Not guarded by a conditional, so that it removes broken links
os.unlink(latest_lnk)
except FileNotFoundError as e:
self.__logger.doInfoLogging("Could not remove 'latest' link in Status.")
self.__logger.doWarningLogging("Could not remove 'latest' link in Status.")
try_symlink(spath, latest_lnk)

return spath
Expand All @@ -269,7 +269,7 @@ def create_test_runarchive(self):
# Not guarded by a conditional, so that it removes broken links
os.unlink(latest_lnk)
except FileNotFoundError as e:
self.__logger.doInfoLogging("Could not remove 'latest' link in Run_Archive.")
self.__logger.doWarningLogging("Could not remove 'latest' link in Run_Archive.")
try_symlink(rpath, latest_lnk)

return rpath
Expand Down
Loading

0 comments on commit 9acbe49

Please sign in to comment.