refactor: lazy content loading for LogFileOutput
- RHINENG-13810
  By re-writing the _handle_content and parse_content methods of
  LogFileOutput, implement lazy content loading (i.e. postpone loading
  the content until `lines` is accessed for the first time).
  This change won't affect any existing usage of LogFileOutput and its
  child Parsers, but it is useful for CCX rules, which load thousands
  of log files for the same Spec in one must-gather or
  insights-operator archive.

Signed-off-by: Xiangce Liu <[email protected]>
xiangce committed Dec 17, 2024
1 parent d7cff2e commit 2a380e0
Showing 1 changed file with 83 additions and 32 deletions.
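
The heart of the change is a standard deferred-initialization pattern: instead of materializing the file content at construction time, the parser keeps a reference to its context, and a `lines` property loads (and caches) the content on first access. Below is a minimal standalone sketch of that pattern, using hypothetical `FakeContext` and `LazyLines` classes that are not part of the diff that follows:

```python
class FakeContext(object):
    """Stand-in for an insights context object that can hand out file content."""

    def __init__(self, content):
        self._content = content

    @property
    def content(self):
        print("content is being loaded now")
        return self._content


class LazyLines(object):
    """Minimal sketch of the lazy-loading pattern adopted by LogFileOutput."""

    def __init__(self, context):
        self._context = context
        self._lines = None  # nothing has been read yet

    @property
    def lines(self):
        if self._lines is None:
            # one-shot load and cache on first access
            self._lines = self._context.content
        return self._lines


obj = LazyLines(FakeContext(["line 1", "line 2"]))
# Nothing has been printed so far; the load message only appears here:
print(obj.lines)
```

The design choice matters when an archive contains thousands of files for one Spec: parsers are constructed for all of them, but only those whose `lines` a rule actually touches ever pay the loading cost.
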
115 changes: 83 additions & 32 deletions insights/core/__init__.py
@@ -77,7 +77,9 @@ class Parser(object):
"""

def __init__(self, context):
self.file_path = os.path.join("/", context.relative_path) if context.relative_path is not None else None
self.file_path = (
os.path.join("/", context.relative_path) if context.relative_path is not None else None
)
"""str: Full context path of the input file."""
self.file_name = os.path.basename(context.path) if context.path is not None else None
"""str: Filename portion of the input file."""
@@ -150,6 +152,7 @@ def inner(children, stack):
if c.children:
c.children = inner(c.children, stack)
return results

return inner(docs, [])


@@ -340,6 +343,7 @@ class ConfigParser(Parser, ConfigComponent):
Raises:
SkipComponent: When input content is empty.
"""

def parse_content(self, content):
if not content:
raise SkipComponent('Empty content.')
@@ -359,6 +363,7 @@ class ConfigCombiner(ConfigComponent):
include directives for supplementary configuration files. httpd and nginx
are examples.
"""

def __init__(self, confs, main_file, include_finder):
self.confs = confs
self.main = self.find_main(main_file)
@@ -396,6 +401,7 @@ class ContainerConfigCombiner(ConfigCombiner):
files with include directives for supplementary configuration files.
httpd and nginx are examples.
"""

def __init__(self, confs, main_file, include_finder, engine, image, container_id):
self.image = image
"""str: The image of the container."""
@@ -516,8 +522,7 @@ def parse_content(self, content):

# Either only one thing or line or rest starts with comment
# but either way we need to have an equals in the first word.
if (len(words) == 1 or (len(words) > 1 and words[1][0] == '#')) \
and '=' in words[0]:
if (len(words) == 1 or (len(words) > 1 and words[1][0] == '#')) and '=' in words[0]:
key, value = words[0].split('=', 1)
result[key] = value
# Only store lines if they aren't comments or blank
@@ -527,7 +532,7 @@ def parse_content(self, content):
self.unparsed_lines = unparsed_lines

def keys(self):
""" Return the list of keys (in no order) in the underlying dictionary."""
"""Return the list of keys (in no order) in the underlying dictionary."""
return self.data.keys()


@@ -544,19 +549,19 @@ class CommandParser(Parser):
"""

__bad_single_lines = [
"no such file or directory",
"not a directory",
"command not found",
"no module named",
"no files found for",
"no such file or directory",
"not a directory",
"command not found",
"no module named",
"no files found for",
]
"""
This variable contains filters for bad responses of the single line
returned from commands defined with command specs.
When adding a new line to the list make sure text is all lower case.
"""
__bad_lines = [
"missing dependencies:",
"missing dependencies:",
]
"""
This variable contains filters for bad responses of the lines
@@ -606,7 +611,9 @@ def __init__(self, context, extra_bad_lines=None):
`self.__bad_single_lines` and `self.__bad_lines`.
"""
extra_bad_lines = [] if extra_bad_lines is None else extra_bad_lines
valid_lines = self.validate_lines(context.content, self.__bad_single_lines, self.__bad_lines)
valid_lines = self.validate_lines(
context.content, self.__bad_single_lines, self.__bad_lines
)
if valid_lines and extra_bad_lines:
valid_lines = self.validate_lines(context.content, extra_bad_lines, extra_bad_lines)
if not valid_lines:
@@ -621,6 +628,7 @@ class ContainerParser(CommandParser):
A class specifically for container parser, with the "image" name, the
engine provider and the container ID on the basis of ``Parser``.
"""

def __init__(self, context):
self.image = context.image
"""str: The image of the container."""
@@ -702,7 +710,11 @@ def parse_content(self, content):
if len(content) > 3:
self.raw = '\n'.join(content)
self.dom = ET.fromstring(self.raw)
self.xmlns = self.dom.tag.strip("{").split("}")[0] if all(c in self.dom.tag for c in ["{", "}"]) else ""
self.xmlns = (
self.dom.tag.strip("{").split("}")[0]
if all(c in self.dom.tag for c in ["{", "}"])
else ""
)
self.data = self.parse_dom()

def get_elements(self, element, xmlns=None):
@@ -754,6 +766,7 @@ class YAMLParser(Parser, LegacyItemAccess):
"""
A parser class that reads YAML files. Base your own parser on this.
"""

def parse_content(self, content):
try:
if type(content) is list:
@@ -779,6 +792,7 @@ class JSONParser(Parser, LegacyItemAccess):
"""
A parser class that reads JSON files. Base your own parser on this.
"""

def parse_content(self, content):
try:
if isinstance(content, list):
@@ -866,6 +880,7 @@ def has_fcoe_edd(line):
strings or False).
"""

def __init__(self, *args, **kwargs):
deprecated(Scannable, "Please use the :class:`insights.core.Parser` instead.", "3.3.0")
super(Scannable, self).__init__(*args, **kwargs)
@@ -891,6 +906,7 @@ def any(cls, result_key, func):
Sets the `result_key` to the output of `func` if `func` ever returns
truthy
"""

def scanner(self, obj):
current_value = getattr(self, result_key, None)
setattr(self, result_key, current_value or func(obj))
@@ -903,6 +919,7 @@ def collect(cls, result_key, func):
Sets the `result_key` to an iterable of objects for which `func(obj)`
returns True
"""

def scanner(self, obj):
if not getattr(self, result_key, None):
setattr(self, result_key, [])
@@ -928,7 +945,7 @@ def parse_content(self, content):

class TextFileOutput(six.with_metaclass(ScanMeta, Parser)):
"""
Class for parsing gerenal text file content.
Class for parsing general text file content.
File content is stored in raw format in the ``lines`` attribute.
@@ -983,6 +1000,7 @@ class TextFileOutput(six.with_metaclass(ScanMeta, Parser)):
True
"""

def parse_content(self, content):
"""
Use all the defined scanners to search the log file, setting the
@@ -1018,8 +1036,7 @@ def _valid_search(self, s, check=all):
"""
if isinstance(s, six.string_types):
return lambda l: s in l
elif (isinstance(s, list) and len(s) > 0 and
all(isinstance(w, six.string_types) for w in s)):
elif isinstance(s, list) and len(s) > 0 and all(isinstance(w, six.string_types) for w in s):
return lambda l: check(w in l for w in s)
elif s is not None:
raise TypeError('Search items must be given as a string or a list of strings')
@@ -1090,6 +1107,7 @@ def token_scan(cls, result_key, token, check=all):
(bool): the property will contain True if a line contained (any
or all) of the tokens given.
"""

def _scan(self):
search_by_expression = self._valid_search(token, check)
return any(search_by_expression(l) for l in self.lines)
@@ -1112,6 +1130,7 @@ def keep_scan(cls, result_key, token, check=all, num=None, reverse=False):
Returns:
(list): list of dictionaries corresponding to the parsed lines contain the `token`.
"""

def _scan(self):
return self.get(token, check=check, num=num, reverse=reverse)

@@ -1131,6 +1150,7 @@ def last_scan(cls, result_key, token, check=all):
Returns:
(dict): dictionary corresponding to the last parsed line contains the `token`.
"""

def _scan(self):
ret = self.get(token, check=check, num=1, reverse=True)
return ret[0] if ret else dict()
@@ -1161,6 +1181,29 @@ class parser :py:class:`TextFileOutput`.
* A None value when there is no timestamp info in the log file
"""

def _handle_content(self, context):
self._lines = None
self._context = context
# pass None as a fake argument for compatibility
self.parse_content(None)

def parse_content(self, content=None):
"""
Use all the defined scanners to search the log file, setting the
properties defined in the scanner. However, do not pre-load the lines.
Keep None as a fake keyword argument for compatibility.
"""
for scanner in self.scanners:
scanner(self)

@property
def lines(self):
if self._lines is None:
# one-shot load all lines here
self._lines = self._context.content
return self._lines

def _parse_line(self, line):
"""
Parse the line into a dictionary and return it. Only wrap with
@@ -1247,12 +1290,15 @@ def get_after(self, timestamp, s=None):
# character sets. Note that we don't include time zone or other
# outputs (e.g. day-of-year) that don't usually occur in time stamps.
format_conversion_for = {
'a': r'\w{3}', 'A': r'\w+', # Week day name
'a': r'\w{3}',
'A': r'\w+', # Week day name
'w': r'[0123456]', # Week day number
'd': r'([0 ][123456789]|[12]\d|3[01])', # Day of month
'b': r'\w{3}', 'B': r'\w+', # Month name
'b': r'\w{3}',
'B': r'\w+', # Month name
'm': r'([0 ]\d|1[012])', # Month number
'y': r'\d{2}', 'Y': r'\d{4}', # Year
'y': r'\d{2}',
'Y': r'\d{4}', # Year
'H': r'([01 ]\d|2[0123])', # Hour - 24 hour format
'I': r'([0 ]?\d|1[012])', # Hour - 12 hour format
'p': r'\w{2}', # AM / PM
@@ -1269,9 +1315,7 @@ def replacer(match):
return format_conversion_for[match.group(1)]
else:
raise ParseException(
"get_after does not understand strptime format '{c}'".format(
c=match.group(0)
)
"get_after does not understand strptime format '{c}'".format(c=match.group(0))
)

# Please do not attempt to be tricky and put a regular expression
@@ -1284,18 +1328,19 @@ def replacer(match):
if isinstance(time_format, dict):
time_format = list(time_format.values())
if isinstance(time_format, six.string_types):
logs_have_year = ('%Y' in time_format or '%y' in time_format)
logs_have_year = '%Y' in time_format or '%y' in time_format
time_re = re.compile('(' + timefmt_re.sub(replacer, time_format) + ')')

# Curry strptime with time_format string.
def test_parser(logstamp):
return datetime.datetime.strptime(logstamp, time_format)

parse_fn = test_parser
elif isinstance(time_format, list):
logs_have_year = all('%Y' in tf or '%y' in tf for tf in time_format)
time_re = re.compile('(' + '|'.join(
timefmt_re.sub(replacer, tf) for tf in time_format
) + ')')
time_re = re.compile(
'(' + '|'.join(timefmt_re.sub(replacer, tf) for tf in time_format) + ')'
)

def test_all_parsers(logstamp):
# One of these must match, because the regex has selected only
@@ -1306,12 +1351,11 @@ def test_all_parsers(logstamp):
except ValueError:
pass
return ts

parse_fn = test_all_parsers
else:
raise ParseException(
"get_after does not recognise time formats of type {t}".format(
t=type(time_format)
)
"get_after does not recognise time formats of type {t}".format(t=type(time_format))
)

# Most logs will appear in string format, but some logs (e.g.
@@ -1405,6 +1449,7 @@ class Syslog(LogFileOutput):
the year of the logs will be inferred from the year in your timestamp.
This will also work around December/January crossovers.
"""

time_format = '%b %d %H:%M:%S'

def _parse_line(self, line):
@@ -1501,8 +1546,11 @@ class IniConfigFile(ConfigParser):
>>> my_config.has_option('logging', 'log')
True
"""

def parse_doc(self, content):
return iniparser.parse_doc("\n".join(content), self, return_defaults=True, return_booleans=False)
return iniparser.parse_doc(
"\n".join(content), self, return_defaults=True, return_booleans=False
)

def parse_content(self, content, allow_no_value=False):
super(IniConfigFile, self).parse_content(content)
@@ -1584,7 +1632,7 @@ def getboolean(self, section, option):
'true': True,
'false': False,
'on': True,
'off': False
'off': False,
}

if val.lower() not in boolean_states:
@@ -1658,7 +1706,8 @@ def __contains__(self, section):

def __repr__(self):
return "INI file '{filename}' - sections:{sections}".format(
filename=self.file_name, sections=self.sections())
filename=self.file_name, sections=self.sections()
)


class FileListing(Parser):
Expand Down Expand Up @@ -1743,7 +1792,9 @@ def __init__(self, context):
# the directory name in the output). Obviously if we don't have the
# '-R' flag we should grab this but it's probably not worth parsing
# the flags to ls for this.
deprecated(FileListing, "Please use the :class:`insights.parsers.ls.FileListing instead.", "3.5.0")
deprecated(
FileListing, "Please use the :class:`insights.parsers.ls.FileListing instead.", "3.5.0"
)
self.first_path = None
path_re = re.compile(r'ls_-\w+(?P<path>.*)$')
match = path_re.search(context.path)
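As a quick sanity check of the new behavior, the sketch below builds a bare LogFileOutput subclass (no scanners registered) from a small stand-in context object so that the moment of loading is observable. `CountingContext` and `DemoLog` are hypothetical names used only for illustration, not part of insights-core, and the sketch assumes the base Parser only needs the attributes shown here; note that scanners which walk `lines` (e.g. those added via `keep_scan`) would still trigger the load at parse time.

```python
from insights.core import LogFileOutput


class CountingContext(object):
    """Hypothetical test double: counts how many times `content` is requested."""

    def __init__(self, lines, relative_path="var/log/demo.log"):
        self._lines = lines
        self.relative_path = relative_path
        self.path = "/" + relative_path
        self.reads = 0

    @property
    def content(self):
        self.reads += 1
        return self._lines


class DemoLog(LogFileOutput):
    """Bare subclass with no scanners, so nothing touches `lines` during parsing."""


ctx = CountingContext(["Jan  1 00:00:00 host demo[1]: started"])
log = DemoLog(ctx)
print(ctx.reads)       # expected 0 -- construction no longer reads the content
print(len(log.lines))  # expected 1 -- first access loads and caches it
print(ctx.reads)       # expected 1
```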
