Skip to content

Commit

Permalink
Merge pull request #114 from olcf/nick-fix-log-level
Browse files Browse the repository at this point in the history
Fix the --loglevel flag
  • Loading branch information
ddietz89 authored Dec 21, 2023
2 parents 01bd1b6 + f6462b7 commit 9acbe49
Show file tree
Hide file tree
Showing 9 changed files with 87 additions and 50 deletions.
46 changes: 29 additions & 17 deletions harness/bin/test_harness_driver.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,10 @@ def create_parser():
required=False,
type=str,
help='Annotate test status with given harness launch id')
my_parser.add_argument('--loglevel',
default='CRITICAL',
choices=['DEBUG', 'debug', 'INFO', 'info', 'WARNING', 'warning', 'ERROR', 'error', 'CRITICAL', 'critical'],
help='Control the level of information printed to stdout.')
my_parser.add_argument('-r', '--resubmit',
help='Have the application test batch script resubmit itself, optionally for a total submission count of N. Leave off N for infinite submissions.',
action='store', nargs='?', type=int, const=-1, default=False)
Expand Down Expand Up @@ -243,8 +247,7 @@ def auto_generated_scripts(harness_config,
if job_id != "0":
jstatus.log_event(status_file.StatusFile.EVENT_JOB_QUEUED, job_id)
else:
print("SUBMIT ERROR: failed to retrieve job id!")
message = f"{messloc} Failed to retrieve the job id."
message = f"{messloc} Submit error, failed to retrieve the job id."
a_logger.doCriticalLogging(message)
submit_exit_value = 1

Expand Down Expand Up @@ -274,8 +277,7 @@ def auto_generated_scripts(harness_config,
run_exit_value = p.returncode
run_stdout.close()
else:
print("RUN ERROR: failed to retrieve job id!")
message = f"{messloc} Failed to retrieve the job id."
message = f"{messloc} Run error, failed to retrieve the job id."
a_logger.doCriticalLogging(message)
run_exit_value = 1

Expand All @@ -293,9 +295,11 @@ def auto_generated_scripts(harness_config,
mymachine.start_report_executable()
influx_reported = mymachine.log_to_influx()
if not influx_reported:
print("Results not logged to influxDB")
message = f"{messloc} results not logged to InfluxDB."
a_logger.doCriticalLogging(message)
else:
print("CHECK ERROR: failed to retrieve job id!")
message = f"{messloc} check error, failed to retrieve the job id."
a_logger.doCriticalLogging(message)
check_exit_value = 1

exit_values = {
Expand Down Expand Up @@ -394,9 +398,13 @@ def test_harness_driver(argv=None):
# Instantiate application subtest
logger_name = get_logger_name()
fh_filepath = get_path_to_logfile_from_scriptdir(testscripts,unique_id)
logger_threshold = "INFO"
fh_threshold_log_level = "INFO"
ch_threshold_log_level = "CRITICAL"
# Logger threshold screens ALL traffic before it gets to file/console handlers. Set to lowest level
logger_threshold = MODULE_THRESHOLD_LOG_LEVEL
# Always set file handler level to MODULE_THRESHOLD_LOG_LEVEL
fh_threshold_log_level = MODULE_THRESHOLD_LOG_LEVEL
# loglevel arg controls the console level
ch_threshold_log_level = Vargs.loglevel
#print(f"In test_harness_driver, creating logger with name={logger_name}, filepath={fh_filepath}")
a_logger = rgt_logger_factory.create_rgt_logger(
logger_name=logger_name,
fh_filepath=fh_filepath,
Expand All @@ -423,7 +431,7 @@ def test_harness_driver(argv=None):
import shutil
message = f'The kill file {kill_file} exists. It must be removed to run this test.\n'
message += "Stopping test cycle."
print(message)
apptest.doCriticalLogging(message)
runarchive_dir = apptest.get_path_to_runarchive()
logging.shutdown()
shutil.rmtree(runarchive_dir,ignore_errors=True)
Expand Down Expand Up @@ -455,9 +463,13 @@ def test_harness_driver(argv=None):
# We now create the status file if it doesn't already exist.
logger_name = "status_file."+ app + "__" + test
fh_filepath = apptest.path_to_status_logfile
logger_threshold = "INFO"
fh_threshold_log_level = "INFO"
ch_threshold_log_level = "CRITICAL"
# Logger threshold screens ALL traffic before it gets to file/console handlers. Set to lowest level
logger_threshold = MODULE_THRESHOLD_LOG_LEVEL
# Always set file handler level to MODULE_THRESHOLD_LOG_LEVEL
fh_threshold_log_level = MODULE_THRESHOLD_LOG_LEVEL
# loglevel arg controls the console level
ch_threshold_log_level = Vargs.loglevel
#print(f"In test_harness_driver, creating logger with name={logger_name}, filepath={fh_filepath}")
sfile_logger = rgt_logger_factory.create_rgt_logger(
logger_name=logger_name,
fh_filepath=fh_filepath,
Expand Down Expand Up @@ -493,22 +505,22 @@ def test_harness_driver(argv=None):
build_exit_value = 0
if actions['build']:
build_exit_value = exit_values['build']
print(f'build exit value = {build_exit_value}')
apptest.doInfoLogging(f'build exit value = {build_exit_value}')

submit_exit_value = 0
if actions['submit']:
submit_exit_value = exit_values['submit']
print(f'submit exit value = {submit_exit_value}')
apptest.doInfoLogging(f'submit exit value = {submit_exit_value}')

run_exit_value = 0
if actions['run']:
run_exit_value = exit_values['run']
print(f'run exit value = {run_exit_value}')
apptest.doInfoLogging(f'run exit value = {run_exit_value}')

check_exit_value = 0
if actions['check']:
check_exit_value = exit_values['check']
print(f'check exit value = {check_exit_value}')
apptest.doInfoLogging(f'check exit value = {check_exit_value}')

# Now read the result from the job_status.txt file.
jspath = os.path.join(status_dir, layout.job_status_filename)
Expand Down
10 changes: 10 additions & 0 deletions harness/libraries/a
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
A quick note on the logging module and the log levels:
The logger_threshold_level filters *all* log messages before they ever reach the file or console handlers.
So if you set logger_threshold_level to WARNING, the file and console handlers will only ever receive messages at WARNING level or above.
The fh_threshold_level and ch_threshold_level then decide what level of messages to emit to the file and console handlers, respectively.

Major changes:
- Added a `--loglevel` flag to `test_harness_driver.py`.
- Changed the `--loglevel` functionality: instead of changing the `logger_threshold_level`, the flag now only changes the console (stream handler) log level.
- The file handler (log file) is hard-coded to INFO, and the `logger_threshold_level` is set to either `NOTSET` or `DEBUG`.
33 changes: 18 additions & 15 deletions harness/libraries/apptest.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,7 @@ def __init__(self,
name_of_subtest])
self.__number_of_iterations = -1
self.__myLogger = logger
self.__loglevel = self.__myLogger.get_ch_threshold_level()

#@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
# @
Expand Down Expand Up @@ -524,7 +525,7 @@ def _start_test(self,
os.remove(pathtokillfile)

# This will automatically build & submit
starttestcomand = f"test_harness_driver.py -r -l {launchid}"
starttestcomand = f"test_harness_driver.py -r -l {launchid} --loglevel {self.__loglevel}"
if separate_build_stdio:
starttestcomand += "--separate-build-stdio"

Expand Down Expand Up @@ -590,7 +591,7 @@ def _influx_log_mode(self):
else:
self.logger.doWarningLogging(f"Unable to log all events for {test_id}")
elif os.path.islink(f"./{test_id}"):
self.logger.doInfoLogging(f"Ignoring link in influx_log_mode: {test_id}")
self.logger.doDebugLogging(f"Ignoring link in influx_log_mode: {test_id}")
elif not self._machine_matches(test_id):
self.logger.doInfoLogging(f"Skipping test from another machine: {test_id}")

Expand All @@ -599,7 +600,7 @@ def _influx_log_mode(self):
def _machine_matches(self, test_id):
""" Checks if RGT_MACHINE_NAME is the same as the test machine name """
if not 'RGT_MACHINE_NAME' in os.environ:
self.logger.doWarningLogging("RGT_MACHINE_NAME not found in environment. Skipping machine name check.")
self.logger.doErrorLogging("RGT_MACHINE_NAME not found in environment. Skipping machine name check.")
return False
log_start_status_file = f"{self.get_path_to_test()}/{self.test_status_dirname}/{test_id}/"
log_start_status_file += f"{StatusFile.EVENT_DICT[StatusFile.EVENT_LOGGING_START][0]}"
Expand Down Expand Up @@ -677,6 +678,7 @@ def _log_to_influx(self, influx_test_id, post_run=False):
def local_send_to_influx(influx_url, influx_event_record_string, headers):
try:
if 'RGT_INFLUX_NO_SEND' in os.environ and os.environ['RGT_INFLUX_NO_SEND'] == '1':
# RGT_INFLUX_NO_SEND explicitly tells the harness to print the Influx Event string, so use print()
print(f"RGT_INFLUX_NO_SEND is set, echoing: {influx_event_record_string}")
elif not 'requests' in sys.modules:
self.logger.doWarningLogging(f"InfluxDB is currently disabled. Reason: 'requests' module was unable to load. Skipping InfluxDB message: {influx_event_record_string}. This can be logged after the run using the harness --mode influx_log or by POSTing this message to the InfluxDB server.")
Expand All @@ -688,7 +690,7 @@ def local_send_to_influx(influx_url, influx_event_record_string, headers):
return False
self.logger.doInfoLogging(f"Successfully sent {influx_event_record_string} to {influx_url}")
except requests.exceptions.ConnectionError as e:
self.logger.doWarningLogging(f"InfluxDB is not reachable. Request not sent: {influx_event_record_string}")
self.logger.doErrorLogging(f"InfluxDB is not reachable. Request not sent: {influx_event_record_string}")
return False
except Exception as e:
# TODO: add more graceful handling of unreachable influx servers
Expand Down Expand Up @@ -742,7 +744,7 @@ def local_send_to_influx(influx_url, influx_event_record_string, headers):
if post_run:
run_timestamp = self._get_run_timestamp(influx_test_id)
if run_timestamp < 0:
self.logger.doWarningLogging(f"Run Timestamp invalid for jobID {influx_test_id}: {run_timestamp}")
self.logger.doErrorLogging(f"Run Timestamp invalid for jobID {influx_test_id}: {run_timestamp}")
os.chdir(currentdir)
return False

Expand Down Expand Up @@ -773,7 +775,7 @@ def local_send_to_influx(influx_url, influx_event_record_string, headers):
influx_event_record_string += f" {run_timestamp}"
# If we've made it this far without do_log_metric set to False, then all our checking has completed
if do_log_metric and local_send_to_influx(influx_url, influx_event_record_string, headers):
self.logger.doWarningLogging(f"Successfully logged metrics to Influx.")
self.logger.doInfoLogging(f"Successfully logged metrics to Influx.")
success_log_attempts += 1
elif do_log_metric:
# Then logging failed
Expand Down Expand Up @@ -818,15 +820,16 @@ def local_send_to_influx(influx_url, influx_event_record_string, headers):
influx_event_record_string += f' {run_timestamp}'
if local_send_to_influx(influx_url, influx_event_record_string, headers):
success_log_attempts += 1
self.logger.doWarningLogging(f"Successfully logged node health for {node_name} to Influx.")
self.logger.doInfoLogging(f"Successfully logged node health for {node_name} to Influx.")
else:
failed_log_attempts += 1
self.logger.doWarningLogging(f"Logging node health to Influx failed.")
self.logger.doErrorLogging(f"Logging node health to Influx failed.")
elif 'RGT_NODE_LOCATION_FILE' in os.environ:
self.logger.doWarningLogging(f"Node location file path does not exist: {os.environ['RGT_NODE_LOCATION_FILE']}.")
self.logger.doWarningLogging(f"Skipping node health logging. To re-log, remove the .influx_logged file in Run_Archive and run in mode influx_log.")
message = f"Node location file path does not exist: {os.environ['RGT_NODE_LOCATION_FILE']}."
message += f"\nSkipping node health logging. To re-log, remove the .influx_logged file in Run_Archive and run in mode influx_log."
self.logger.doErrorLogging(message)
else:
self.logger.doWarningLogging(f"RGT_NODE_LOCATION_FILE not in os.environ, skipping node health logging.")
self.logger.doErrorLogging(f"RGT_NODE_LOCATION_FILE not in os.environ, skipping node health logging.")


# We're in Run_Archive. The Influx POST request has succeeded, as far as we know,
Expand Down Expand Up @@ -884,7 +887,7 @@ def _get_time_diff_of_status_files(self, start_event_file, end_event_file, test_
line = next(end_fstr)
end_timestamp = line.split()[0]
if len(start_timestamp) <= 1 or len(end_timestamp) <= 1:
print(f"Invalid start or end timestamp: {start_timestamp}, {end_timestamp}")
self.logger.doErrorLogging(f"Invalid start or end timestamp: {start_timestamp}, {end_timestamp}")
return -1
#start_ts_dt = datetime.fromisoformat(start_timestamp)
#end_ts_dt = datetime.fromisoformat(end_timestamp)
Expand Down Expand Up @@ -938,7 +941,7 @@ def is_numeric(s):
# Wrap strings in double quotes to send to Influx
metrics[metric_name] = f'"{line_splt[1]}"'
else:
self.logger.doWarningLogging(f"Found a line in metrics.txt with 0 or >1 equals signs:\n{line.strip()}")
self.logger.doErrorLogging(f"Found a line in metrics.txt with 0 or >1 equals signs:\n{line.strip()}")
return metrics

def _get_node_health(self, machine_name, app_name, test_name):
Expand Down Expand Up @@ -974,14 +977,14 @@ def _get_node_health(self, machine_name, app_name, test_name):
node_healths[node_name]['status'] = status_string
break
if not 'status' in node_healths[node_name]:
self.logger.doWarningLogging(f"Could not find status for the string {line_splt[1]}. Skipping node health logging.")
self.logger.doErrorLogging(f"Could not find the status group for the status string {line_splt[1]}. Skipping node health logging.")
return_empty = True
node_healths[node_name]['message'] = ''
if len(line_splt) >= 3:
node_healths[node_name]['message'] = ' '.join(line_splt[2:])
# If there were errors parsing the node health file, we don't want to try to log anything
if return_empty:
self.logger.doWarningLogging(f"There were errors parsing nodecheck.txt. Not logging node healths.")
self.logger.doErrorLogging(f"There were errors parsing nodecheck.txt. Not logging node healths.")
return {}
return node_healths

Expand Down
4 changes: 2 additions & 2 deletions harness/libraries/layout_of_apps_directory.py
Original file line number Diff line number Diff line change
Expand Up @@ -242,7 +242,7 @@ def create_test_status(self):
# Not guarded by a conditional, so that it removes broken links
os.unlink(latest_lnk)
except FileNotFoundError as e:
self.__logger.doInfoLogging("Could not remove 'latest' link in Status.")
self.__logger.doWarningLogging("Could not remove 'latest' link in Status.")
try_symlink(spath, latest_lnk)

return spath
Expand All @@ -269,7 +269,7 @@ def create_test_runarchive(self):
# Not guarded by a conditional, so that it removes broken links
os.unlink(latest_lnk)
except FileNotFoundError as e:
self.__logger.doInfoLogging("Could not remove 'latest' link in Run_Archive.")
self.__logger.doWarningLogging("Could not remove 'latest' link in Run_Archive.")
try_symlink(rpath, latest_lnk)

return rpath
Expand Down
Loading

0 comments on commit 9acbe49

Please sign in to comment.