refactor: lazy content loading for LogFileOutput
- RHINENG-13810
  By re-writing the _handle_content and parse_content methods of
  LogFileOutput, implement lazy content loading (i.e. postpone loading
  the content until `lines` is accessed for the first time).
  This change won't affect any existing usage of LogFileOutput and its
  child Parsers, but it is useful for CCX rules, which load thousands
  of log files for the same Spec in one must-gather or
  insights-operator archive.

Signed-off-by: Xiangce Liu <[email protected]>
xiangce committed Dec 17, 2024
1 parent d7cff2e commit 2a380e0
Showing 1 changed file with 83 additions and 32 deletions.
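
The heart of the change is a standard deferred-initialization pattern: instead of materializing the file content at construction time, the parser keeps a reference to its context, and a `lines` property loads (and caches) the content on first access. Below is a minimal standalone sketch of that pattern, using hypothetical `FakeContext` and `LazyLines` classes that are not part of the diff that follows:

```python
class FakeContext(object):
    """Stand-in for an insights context object that can hand out file content."""

    def __init__(self, content):
        self._content = content

    @property
    def content(self):
        print("content is being loaded now")
        return self._content


class LazyLines(object):
    """Minimal sketch of the lazy-loading pattern adopted by LogFileOutput."""

    def __init__(self, context):
        self._context = context
        self._lines = None  # nothing has been read yet

    @property
    def lines(self):
        if self._lines is None:
            # one-shot load and cache on first access
            self._lines = self._context.content
        return self._lines


obj = LazyLines(FakeContext(["line 1", "line 2"]))
# Nothing has been printed so far; the load message only appears here:
print(obj.lines)
```

The design choice matters when an archive contains thousands of files for one Spec: parsers are constructed for all of them, but only those whose `lines` a rule actually touches ever pay the loading cost.
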
115 changes: 83 additions & 32 deletions insights/core/__init__.py
@@ -77,7 +77,9 @@ class Parser(object):
"""

def __init__(self, context):
self.file_path = os.path.join("/", context.relative_path) if context.relative_path is not None else None
self.file_path = (
os.path.join("/", context.relative_path) if context.relative_path is not None else None
)
"""str: Full context path of the input file."""
self.file_name = os.path.basename(context.path) if context.path is not None else None
"""str: Filename portion of the input file."""
@@ -150,6 +152,7 @@ def inner(children, stack):
if c.children:
c.children = inner(c.children, stack)
return results

return inner(docs, [])


@@ -340,6 +343,7 @@ class ConfigParser(Parser, ConfigComponent):
Raises:
SkipComponent: When input content is empty.
"""

def parse_content(self, content):
if not content:
raise SkipComponent('Empty content.')
@@ -359,6 +363,7 @@ class ConfigCombiner(ConfigComponent):
include directives for supplementary configuration files. httpd and nginx
are examples.
"""

def __init__(self, confs, main_file, include_finder):
self.confs = confs
self.main = self.find_main(main_file)
@@ -396,6 +401,7 @@ class ContainerConfigCombiner(ConfigCombiner):
files with include directives for supplementary configuration files.
httpd and nginx are examples.
"""

def __init__(self, confs, main_file, include_finder, engine, image, container_id):
self.image = image
"""str: The image of the container."""
@@ -516,8 +522,7 @@ def parse_content(self, content):

# Either only one thing or line or rest starts with comment
# but either way we need to have an equals in the first word.
if (len(words) == 1 or (len(words) > 1 and words[1][0] == '#')) \
and '=' in words[0]:
if (len(words) == 1 or (len(words) > 1 and words[1][0] == '#')) and '=' in words[0]:
key, value = words[0].split('=', 1)
result[key] = value
# Only store lines if they aren't comments or blank
@@ -527,7 +532,7 @@ def parse_content(self, content):
self.unparsed_lines = unparsed_lines

def keys(self):
""" Return the list of keys (in no order) in the underlying dictionary."""
"""Return the list of keys (in no order) in the underlying dictionary."""
return self.data.keys()


@@ -544,19 +549,19 @@ class CommandParser(Parser):
"""

__bad_single_lines = [
"no such file or directory",
"not a directory",
"command not found",
"no module named",
"no files found for",
"no such file or directory",
"not a directory",
"command not found",
"no module named",
"no files found for",
]
"""
This variable contains filters for bad responses of the single line
returned from commands defined with command specs.
When adding a new line to the list make sure text is all lower case.
"""
__bad_lines = [
"missing dependencies:",
"missing dependencies:",
]
"""
This variable contains filters for bad responses of the lines
@@ -606,7 +611,9 @@ def __init__(self, context, extra_bad_lines=None):
`self.__bad_single_lines` and `self.__bad_lines`.
"""
extra_bad_lines = [] if extra_bad_lines is None else extra_bad_lines
valid_lines = self.validate_lines(context.content, self.__bad_single_lines, self.__bad_lines)
valid_lines = self.validate_lines(
context.content, self.__bad_single_lines, self.__bad_lines
)
if valid_lines and extra_bad_lines:
valid_lines = self.validate_lines(context.content, extra_bad_lines, extra_bad_lines)
if not valid_lines:
@@ -621,6 +628,7 @@ class ContainerParser(CommandParser):
A class specifically for container parser, with the "image" name, the
engine provider and the container ID on the basis of ``Parser``.
"""

def __init__(self, context):
self.image = context.image
"""str: The image of the container."""
@@ -702,7 +710,11 @@ def parse_content(self, content):
if len(content) > 3:
self.raw = '\n'.join(content)
self.dom = ET.fromstring(self.raw)
self.xmlns = self.dom.tag.strip("{").split("}")[0] if all(c in self.dom.tag for c in ["{", "}"]) else ""
self.xmlns = (
self.dom.tag.strip("{").split("}")[0]
if all(c in self.dom.tag for c in ["{", "}"])
else ""
)
self.data = self.parse_dom()

def get_elements(self, element, xmlns=None):
@@ -754,6 +766,7 @@ class YAMLParser(Parser, LegacyItemAccess):
"""
A parser class that reads YAML files. Base your own parser on this.
"""

def parse_content(self, content):
try:
if type(content) is list:
@@ -779,6 +792,7 @@ class JSONParser(Parser, LegacyItemAccess):
"""
A parser class that reads JSON files. Base your own parser on this.
"""

def parse_content(self, content):
try:
if isinstance(content, list):
@@ -866,6 +880,7 @@ def has_fcoe_edd(line):
strings or False).
"""

def __init__(self, *args, **kwargs):
deprecated(Scannable, "Please use the :class:`insights.core.Parser` instead.", "3.3.0")
super(Scannable, self).__init__(*args, **kwargs)
@@ -891,6 +906,7 @@ def any(cls, result_key, func):
Sets the `result_key` to the output of `func` if `func` ever returns
truthy
"""

def scanner(self, obj):
current_value = getattr(self, result_key, None)
setattr(self, result_key, current_value or func(obj))
@@ -903,6 +919,7 @@ def collect(cls, result_key, func):
Sets the `result_key` to an iterable of objects for which `func(obj)`
returns True
"""

def scanner(self, obj):
if not getattr(self, result_key, None):
setattr(self, result_key, [])
@@ -928,7 +945,7 @@ def parse_content(self, content):

class TextFileOutput(six.with_metaclass(ScanMeta, Parser)):
"""
Class for parsing gerenal text file content.
Class for parsing general text file content.
File content is stored in raw format in the ``lines`` attribute.
@@ -983,6 +1000,7 @@ class TextFileOutput(six.with_metaclass(ScanMeta, Parser)):
True
"""

def parse_content(self, content):
"""
Use all the defined scanners to search the log file, setting the
@@ -1018,8 +1036,7 @@ def _valid_search(self, s, check=all):
"""
if isinstance(s, six.string_types):
return lambda l: s in l
elif (isinstance(s, list) and len(s) > 0 and
all(isinstance(w, six.string_types) for w in s)):
elif isinstance(s, list) and len(s) > 0 and all(isinstance(w, six.string_types) for w in s):
return lambda l: check(w in l for w in s)
elif s is not None:
raise TypeError('Search items must be given as a string or a list of strings')
@@ -1090,6 +1107,7 @@ def token_scan(cls, result_key, token, check=all):
(bool): the property will contain True if a line contained (any
or all) of the tokens given.
"""

def _scan(self):
search_by_expression = self._valid_search(token, check)
return any(search_by_expression(l) for l in self.lines)
@@ -1112,6 +1130,7 @@ def keep_scan(cls, result_key, token, check=all, num=None, reverse=False):
Returns:
(list): list of dictionaries corresponding to the parsed lines contain the `token`.
"""

def _scan(self):
return self.get(token, check=check, num=num, reverse=reverse)

@@ -1131,6 +1150,7 @@ def last_scan(cls, result_key, token, check=all):
Returns:
(dict): dictionary corresponding to the last parsed line contains the `token`.
"""

def _scan(self):
ret = self.get(token, check=check, num=1, reverse=True)
return ret[0] if ret else dict()
@@ -1161,6 +1181,29 @@ class parser :py:class:`TextFileOutput`.
* A None value when there is no timestamp info in the log file
"""

def _handle_content(self, context):
self._lines = None
self._context = context
# pass None as a fake argument for compatibility
self.parse_content(None)

def parse_content(self, content=None):
"""
Use all the defined scanners to search the log file, setting the
properties defined in the scanner. However, do not pre-load the lines.
Keep None as a fake keyword argument for compatibility.
"""
for scanner in self.scanners:
scanner(self)

@property
def lines(self):
if self._lines is None:
# one-shot load all lines here
self._lines = self._context.content
return self._lines

def _parse_line(self, line):
"""
Parse the line into a dictionary and return it. Only wrap with
@@ -1247,12 +1290,15 @@ def get_after(self, timestamp, s=None):
# character sets. Note that we don't include time zone or other
# outputs (e.g. day-of-year) that don't usually occur in time stamps.
format_conversion_for = {
'a': r'\w{3}', 'A': r'\w+', # Week day name
'a': r'\w{3}',
'A': r'\w+', # Week day name
'w': r'[0123456]', # Week day number
'd': r'([0 ][123456789]|[12]\d|3[01])', # Day of month
'b': r'\w{3}', 'B': r'\w+', # Month name
'b': r'\w{3}',
'B': r'\w+', # Month name
'm': r'([0 ]\d|1[012])', # Month number
'y': r'\d{2}', 'Y': r'\d{4}', # Year
'y': r'\d{2}',
'Y': r'\d{4}', # Year
'H': r'([01 ]\d|2[0123])', # Hour - 24 hour format
'I': r'([0 ]?\d|1[012])', # Hour - 12 hour format
'p': r'\w{2}', # AM / PM
@@ -1269,9 +1315,7 @@ def replacer(match):
return format_conversion_for[match.group(1)]
else:
raise ParseException(
"get_after does not understand strptime format '{c}'".format(
c=match.group(0)
)
"get_after does not understand strptime format '{c}'".format(c=match.group(0))
)

# Please do not attempt to be tricky and put a regular expression
@@ -1284,18 +1328,19 @@ def replacer(match):
if isinstance(time_format, dict):
time_format = list(time_format.values())
if isinstance(time_format, six.string_types):
logs_have_year = ('%Y' in time_format or '%y' in time_format)
logs_have_year = '%Y' in time_format or '%y' in time_format
time_re = re.compile('(' + timefmt_re.sub(replacer, time_format) + ')')

# Curry strptime with time_format string.
def test_parser(logstamp):
return datetime.datetime.strptime(logstamp, time_format)

parse_fn = test_parser
elif isinstance(time_format, list):
logs_have_year = all('%Y' in tf or '%y' in tf for tf in time_format)
time_re = re.compile('(' + '|'.join(
timefmt_re.sub(replacer, tf) for tf in time_format
) + ')')
time_re = re.compile(
'(' + '|'.join(timefmt_re.sub(replacer, tf) for tf in time_format) + ')'
)

def test_all_parsers(logstamp):
# One of these must match, because the regex has selected only
@@ -1306,12 +1351,11 @@ def test_all_parsers(logstamp):
except ValueError:
pass
return ts

parse_fn = test_all_parsers
else:
raise ParseException(
"get_after does not recognise time formats of type {t}".format(
t=type(time_format)
)
"get_after does not recognise time formats of type {t}".format(t=type(time_format))
)

# Most logs will appear in string format, but some logs (e.g.
@@ -1405,6 +1449,7 @@ class Syslog(LogFileOutput):
the year of the logs will be inferred from the year in your timestamp.
This will also work around December/January crossovers.
"""

time_format = '%b %d %H:%M:%S'

def _parse_line(self, line):
@@ -1501,8 +1546,11 @@ class IniConfigFile(ConfigParser):
>>> my_config.has_option('logging', 'log')
True
"""

def parse_doc(self, content):
return iniparser.parse_doc("\n".join(content), self, return_defaults=True, return_booleans=False)
return iniparser.parse_doc(
"\n".join(content), self, return_defaults=True, return_booleans=False
)

def parse_content(self, content, allow_no_value=False):
super(IniConfigFile, self).parse_content(content)
@@ -1584,7 +1632,7 @@ def getboolean(self, section, option):
'true': True,
'false': False,
'on': True,
'off': False
'off': False,
}

if val.lower() not in boolean_states:
@@ -1658,7 +1706,8 @@ def __contains__(self, section):

def __repr__(self):
return "INI file '{filename}' - sections:{sections}".format(
filename=self.file_name, sections=self.sections())
filename=self.file_name, sections=self.sections()
)


class FileListing(Parser):
Expand Down Expand Up @@ -1743,7 +1792,9 @@ def __init__(self, context):
# the directory name in the output). Obviously if we don't have the
# '-R' flag we should grab this but it's probably not worth parsing
# the flags to ls for this.
deprecated(FileListing, "Please use the :class:`insights.parsers.ls.FileListing instead.", "3.5.0")
deprecated(
FileListing, "Please use the :class:`insights.parsers.ls.FileListing instead.", "3.5.0"
)
self.first_path = None
path_re = re.compile(r'ls_-\w+(?P<path>.*)$')
match = path_re.search(context.path)
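As a quick sanity check of the new behavior, the sketch below builds a bare LogFileOutput subclass (no scanners registered) from a small stand-in context object so that the moment of loading is observable. `CountingContext` and `DemoLog` are hypothetical names used only for illustration, not part of insights-core, and the sketch assumes the base Parser only needs the attributes shown here; note that scanners which walk `lines` (e.g. those added via `keep_scan`) would still trigger the load at parse time.

```python
from insights.core import LogFileOutput


class CountingContext(object):
    """Hypothetical test double: counts how many times `content` is requested."""

    def __init__(self, lines, relative_path="var/log/demo.log"):
        self._lines = lines
        self.relative_path = relative_path
        self.path = "/" + relative_path
        self.reads = 0

    @property
    def content(self):
        self.reads += 1
        return self._lines


class DemoLog(LogFileOutput):
    """Bare subclass with no scanners, so nothing touches `lines` during parsing."""


ctx = CountingContext(["Jan  1 00:00:00 host demo[1]: started"])
log = DemoLog(ctx)
print(ctx.reads)       # expected 0 -- construction no longer reads the content
print(len(log.lines))  # expected 1 -- first access loads and caches it
print(ctx.reads)       # expected 1
```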
