From ddfab60e1593eeb75546b53ac617d7c89fd10c15 Mon Sep 17 00:00:00 2001
From: Xiangce Liu <xiangceliu@redhat.com>
Date: Tue, 17 Dec 2024 20:40:35 +0800
Subject: [PATCH] feat: add a new base parser LazyLogFileOutput

- Doesn't like the LogFileOutput, the LazyLogFileOutput doesn't load the
  content during initialization, its content can only be loaded via calling
  the `do_scan` method explicitly.
- It's useful for the cases which need to load thousands of files belongs
  to one single Spec in one pass of running.
- For implementation's convenience, converted the "scanners" from `list`
  `dict` in which the `result_key` is key and `scanner` is value.  And
  removed the `scanner_keys`.
- Update the LsPci accordingly.
- see RHINENG-13810

Signed-off-by: Xiangce Liu <xiangceliu@redhat.com>
---
 insights/core/__init__.py                | 150 +++++++---
 insights/parsers/lspci.py                |  14 +-
 insights/tests/test_lazylogfileoutput.py | 362 +++++++++++++++++++++++
 3 files changed, 477 insertions(+), 49 deletions(-)
 create mode 100644 insights/tests/test_lazylogfileoutput.py

diff --git a/insights/core/__init__.py b/insights/core/__init__.py
index e87e330e08..959f75a202 100644
--- a/insights/core/__init__.py
+++ b/insights/core/__init__.py
@@ -77,7 +77,9 @@ class Parser(object):
     """
 
     def __init__(self, context):
-        self.file_path = os.path.join("/", context.relative_path) if context.relative_path is not None else None
+        self.file_path = (
+            os.path.join("/", context.relative_path) if context.relative_path is not None else None
+        )
         """str: Full context path of the input file."""
         self.file_name = os.path.basename(context.path) if context.path is not None else None
         """str: Filename portion of the input file."""
@@ -150,6 +152,7 @@ def inner(children, stack):
                 if c.children:
                     c.children = inner(c.children, stack)
         return results
+
     return inner(docs, [])
 
 
@@ -340,6 +343,7 @@ class ConfigParser(Parser, ConfigComponent):
     Raises:
         SkipComponent: When input content is empty.
     """
+
     def parse_content(self, content):
         if not content:
             raise SkipComponent('Empty content.')
@@ -359,6 +363,7 @@ class ConfigCombiner(ConfigComponent):
     include directives for supplementary configuration files. httpd and nginx
     are examples.
     """
+
     def __init__(self, confs, main_file, include_finder):
         self.confs = confs
         self.main = self.find_main(main_file)
@@ -396,6 +401,7 @@ class ContainerConfigCombiner(ConfigCombiner):
     files with include directives for supplementary configuration files.
     httpd and nginx are examples.
     """
+
     def __init__(self, confs, main_file, include_finder, engine, image, container_id):
         self.image = image
         """str: The image of the container."""
@@ -516,8 +522,7 @@ def parse_content(self, content):
 
             # Either only one thing or line or rest starts with comment
             # but either way we need to have an equals in the first word.
-            if (len(words) == 1 or (len(words) > 1 and words[1][0] == '#')) \
-                    and '=' in words[0]:
+            if (len(words) == 1 or (len(words) > 1 and words[1][0] == '#')) and '=' in words[0]:
                 key, value = words[0].split('=', 1)
                 result[key] = value
             # Only store lines if they aren't comments or blank
@@ -527,7 +532,7 @@ def parse_content(self, content):
         self.unparsed_lines = unparsed_lines
 
     def keys(self):
-        """ Return the list of keys (in no order) in the underlying dictionary."""
+        """Return the list of keys (in no order) in the underlying dictionary."""
         return self.data.keys()
 
 
@@ -544,11 +549,11 @@ class CommandParser(Parser):
     """
 
     __bad_single_lines = [
-            "no such file or directory",
-            "not a directory",
-            "command not found",
-            "no module named",
-            "no files found for",
+        "no such file or directory",
+        "not a directory",
+        "command not found",
+        "no module named",
+        "no files found for",
     ]
     """
     This variable contains filters for bad responses of the single line
@@ -556,7 +561,7 @@ class CommandParser(Parser):
     When adding a new line to the list make sure text is all lower case.
     """
     __bad_lines = [
-            "missing dependencies:",
+        "missing dependencies:",
     ]
     """
     This variable contains filters for bad responses of the lines
@@ -606,7 +611,9 @@ def __init__(self, context, extra_bad_lines=None):
                 `self.__bad_single_lines` and `self.__bad_lines`.
         """
         extra_bad_lines = [] if extra_bad_lines is None else extra_bad_lines
-        valid_lines = self.validate_lines(context.content, self.__bad_single_lines, self.__bad_lines)
+        valid_lines = self.validate_lines(
+            context.content, self.__bad_single_lines, self.__bad_lines
+        )
         if valid_lines and extra_bad_lines:
             valid_lines = self.validate_lines(context.content, extra_bad_lines, extra_bad_lines)
         if not valid_lines:
@@ -621,6 +628,7 @@ class ContainerParser(CommandParser):
     A class specifically for container parser, with the "image" name, the
     engine provider and the container ID on the basis of ``Parser``.
     """
+
     def __init__(self, context):
         self.image = context.image
         """str: The image of the container."""
@@ -702,7 +710,11 @@ def parse_content(self, content):
         if len(content) > 3:
             self.raw = '\n'.join(content)
             self.dom = ET.fromstring(self.raw)
-            self.xmlns = self.dom.tag.strip("{").split("}")[0] if all(c in self.dom.tag for c in ["{", "}"]) else ""
+            self.xmlns = (
+                self.dom.tag.strip("{").split("}")[0]
+                if all(c in self.dom.tag for c in ["{", "}"])
+                else ""
+            )
             self.data = self.parse_dom()
 
     def get_elements(self, element, xmlns=None):
@@ -754,6 +766,7 @@ class YAMLParser(Parser, LegacyItemAccess):
     """
     A parser class that reads YAML files.  Base your own parser on this.
     """
+
     def parse_content(self, content):
         try:
             if type(content) is list:
@@ -779,6 +792,7 @@ class JSONParser(Parser, LegacyItemAccess):
     """
     A parser class that reads JSON files.  Base your own parser on this.
     """
+
     def parse_content(self, content):
         try:
             if isinstance(content, list):
@@ -803,8 +817,7 @@ def parse_content(self, content):
 
 class ScanMeta(type):
     def __new__(cls, name, parents, dct):
-        dct["scanners"] = []
-        dct["scanner_keys"] = set()
+        dct["scanners"] = dict()
         return super(ScanMeta, cls).__new__(cls, name, parents, dct)
 
 
@@ -866,6 +879,7 @@ def has_fcoe_edd(line):
     strings or False).
 
     """
+
     def __init__(self, *args, **kwargs):
         deprecated(Scannable, "Please use the :class:`insights.core.Parser` instead.", "3.3.0")
         super(Scannable, self).__init__(*args, **kwargs)
@@ -879,11 +893,10 @@ def _scan(cls, result_key, scanner):
         retrieved later by a rule.
         """
 
-        if result_key in cls.scanner_keys:
+        if result_key in cls.scanners:
             raise ValueError("'%s' is already a registered scanner key" % result_key)
 
-        cls.scanners.append(scanner)
-        cls.scanner_keys.add(result_key)
+        cls.scanners.update({result_key: scanner})
 
     @classmethod
     def any(cls, result_key, func):
@@ -891,6 +904,7 @@ def any(cls, result_key, func):
         Sets the `result_key` to the output of `func` if `func` ever returns
         truthy
         """
+
         def scanner(self, obj):
             current_value = getattr(self, result_key, None)
             setattr(self, result_key, current_value or func(obj))
@@ -903,6 +917,7 @@ def collect(cls, result_key, func):
         Sets the `result_key` to an iterable of objects for which `func(obj)`
         returns True
         """
+
         def scanner(self, obj):
             if not getattr(self, result_key, None):
                 setattr(self, result_key, [])
@@ -922,13 +937,13 @@ def parse(self, content):
 
     def parse_content(self, content):
         for obj in self.parse(content):
-            for scanner in self.scanners:
+            for scanner in self.scanners.values():
                 scanner(self, obj)
 
 
 class TextFileOutput(six.with_metaclass(ScanMeta, Parser)):
     """
-    Class for parsing gerenal text file content.
+    Class for parsing general text file content.
 
     File content is stored in raw format in the ``lines`` attribute.
 
@@ -983,13 +998,14 @@ class TextFileOutput(six.with_metaclass(ScanMeta, Parser)):
         True
 
     """
+
     def parse_content(self, content):
         """
         Use all the defined scanners to search the log file, setting the
         properties defined in the scanner.
         """
         self.lines = content
-        for scanner in self.scanners:
+        for scanner in self.scanners.values():
             scanner(self)
 
     def __contains__(self, s):
@@ -1018,8 +1034,7 @@ def _valid_search(self, s, check=all):
         """
         if isinstance(s, six.string_types):
             return lambda l: s in l
-        elif (isinstance(s, list) and len(s) > 0 and
-              all(isinstance(w, six.string_types) for w in s)):
+        elif isinstance(s, list) and len(s) > 0 and all(isinstance(w, six.string_types) for w in s):
             return lambda l: check(w in l for w in s)
         elif s is not None:
             raise TypeError('Search items must be given as a string or a list of strings')
@@ -1065,15 +1080,14 @@ def scan(cls, result_key, func):
             ValueError: When `result_key` is already a registered scanner key.
         """
 
-        if result_key in cls.scanner_keys:
+        if result_key in cls.scanners:
             raise ValueError("'%s' is already a registered scanner key" % result_key)
 
         def scanner(self):
             result = func(self)
             setattr(self, result_key, result)
 
-        cls.scanners.append(scanner)
-        cls.scanner_keys.add(result_key)
+        cls.scanners.update({result_key: scanner})
 
     @classmethod
     def token_scan(cls, result_key, token, check=all):
@@ -1090,6 +1104,7 @@ def token_scan(cls, result_key, token, check=all):
             (bool): the property will contain True if a line contained (any
             or all) of the tokens given.
         """
+
         def _scan(self):
             search_by_expression = self._valid_search(token, check)
             return any(search_by_expression(l) for l in self.lines)
@@ -1112,6 +1127,7 @@ def keep_scan(cls, result_key, token, check=all, num=None, reverse=False):
         Returns:
             (list): list of dictionaries corresponding to the parsed lines contain the `token`.
         """
+
         def _scan(self):
             return self.get(token, check=check, num=num, reverse=reverse)
 
@@ -1131,6 +1147,7 @@ def last_scan(cls, result_key, token, check=all):
         Returns:
             (dict): dictionary corresponding to the last parsed line contains the `token`.
         """
+
         def _scan(self):
             ret = self.get(token, check=check, num=1, reverse=True)
             return ret[0] if ret else dict()
@@ -1247,12 +1264,15 @@ def get_after(self, timestamp, s=None):
         # character sets.  Note that we don't include time zone or other
         # outputs (e.g. day-of-year) that don't usually occur in time stamps.
         format_conversion_for = {
-            'a': r'\w{3}', 'A': r'\w+',  # Week day name
+            'a': r'\w{3}',
+            'A': r'\w+',  # Week day name
             'w': r'[0123456]',  # Week day number
             'd': r'([0 ][123456789]|[12]\d|3[01])',  # Day of month
-            'b': r'\w{3}', 'B': r'\w+',  # Month name
+            'b': r'\w{3}',
+            'B': r'\w+',  # Month name
             'm': r'([0 ]\d|1[012])',  # Month number
-            'y': r'\d{2}', 'Y': r'\d{4}',  # Year
+            'y': r'\d{2}',
+            'Y': r'\d{4}',  # Year
             'H': r'([01 ]\d|2[0123])',  # Hour - 24 hour format
             'I': r'([0 ]?\d|1[012])',  # Hour - 12 hour format
             'p': r'\w{2}',  # AM / PM
@@ -1269,9 +1289,7 @@ def replacer(match):
                 return format_conversion_for[match.group(1)]
             else:
                 raise ParseException(
-                    "get_after does not understand strptime format '{c}'".format(
-                        c=match.group(0)
-                    )
+                    "get_after does not understand strptime format '{c}'".format(c=match.group(0))
                 )
 
         # Please do not attempt to be tricky and put a regular expression
@@ -1284,18 +1302,19 @@ def replacer(match):
         if isinstance(time_format, dict):
             time_format = list(time_format.values())
         if isinstance(time_format, six.string_types):
-            logs_have_year = ('%Y' in time_format or '%y' in time_format)
+            logs_have_year = '%Y' in time_format or '%y' in time_format
             time_re = re.compile('(' + timefmt_re.sub(replacer, time_format) + ')')
 
             # Curry strptime with time_format string.
             def test_parser(logstamp):
                 return datetime.datetime.strptime(logstamp, time_format)
+
             parse_fn = test_parser
         elif isinstance(time_format, list):
             logs_have_year = all('%Y' in tf or '%y' in tf for tf in time_format)
-            time_re = re.compile('(' + '|'.join(
-                timefmt_re.sub(replacer, tf) for tf in time_format
-            ) + ')')
+            time_re = re.compile(
+                '(' + '|'.join(timefmt_re.sub(replacer, tf) for tf in time_format) + ')'
+            )
 
             def test_all_parsers(logstamp):
                 # One of these must match, because the regex has selected only
@@ -1306,12 +1325,11 @@ def test_all_parsers(logstamp):
                     except ValueError:
                         pass
                 return ts
+
             parse_fn = test_all_parsers
         else:
             raise ParseException(
-                "get_after does not recognise time formats of type {t}".format(
-                    t=type(time_format)
-                )
+                "get_after does not recognise time formats of type {t}".format(t=type(time_format))
             )
 
         # Most logs will appear in string format, but some logs (e.g.
@@ -1358,7 +1376,46 @@ def test_all_parsers(logstamp):
                     yield self._parse_line(line)
 
 
-class Syslog(LogFileOutput):
+class LazyLogFileOutput(LogFileOutput):
+    """
+    Class for parsing log file content.  Doesn't like the LogFileOutput,
+    the LazyLogFileOutput doesn't load the content during initialization,
+    its content can only be loaded via calling the `do_scan` or `lines`
+    explicitly.  It's useful for the cases which need to load thousands of
+    files belongs to one single Spec in one pass of running.
+    Other than the lazy content loading, it's the same as the LogFileOutput.
+    """
+
+    def _handle_content(self, context):
+        self._lines = None
+        self._context = context
+        self._scanned = set()
+
+    def do_scan(self, result_key=None):
+        """
+        Do the actual scanning operations as per the specified `result_key`.
+        When `result_key` is not specified, all registered scanners will be
+        executed.  Each registered scanner can only be executed once.
+        """
+        if result_key:
+            if result_key not in self._scanned and result_key in self.scanners:
+                self.scanners[result_key](self)
+                self._scanned.add(result_key)
+        else:
+            for key, scanner in self.scanners.items():
+                if key not in self._scanned:
+                    self._scanned.add(key)
+                    scanner(self)
+
+    @property
+    def lines(self):
+        if self._lines is None:
+            # one-shot load all content lines here
+            self._lines = self._context.content
+        return self._lines
+
+
+class Syslog(LazyLogFileOutput):
     """Class for parsing syslog file content.
 
     The important function is :func:`get(s)`, which finds all lines with the
@@ -1405,6 +1462,7 @@ class Syslog(LogFileOutput):
         the year of the logs will be inferred from the year in your timestamp.
         This will also work around December/January crossovers.
     """
+
     time_format = '%b %d %H:%M:%S'
 
     def _parse_line(self, line):
@@ -1501,8 +1559,11 @@ class IniConfigFile(ConfigParser):
         >>> my_config.has_option('logging', 'log')
         True
     """
+
     def parse_doc(self, content):
-        return iniparser.parse_doc("\n".join(content), self, return_defaults=True, return_booleans=False)
+        return iniparser.parse_doc(
+            "\n".join(content), self, return_defaults=True, return_booleans=False
+        )
 
     def parse_content(self, content, allow_no_value=False):
         super(IniConfigFile, self).parse_content(content)
@@ -1584,7 +1645,7 @@ def getboolean(self, section, option):
             'true': True,
             'false': False,
             'on': True,
-            'off': False
+            'off': False,
         }
 
         if val.lower() not in boolean_states:
@@ -1658,7 +1719,8 @@ def __contains__(self, section):
 
     def __repr__(self):
         return "INI file '{filename}' - sections:{sections}".format(
-            filename=self.file_name, sections=self.sections())
+            filename=self.file_name, sections=self.sections()
+        )
 
 
 class FileListing(Parser):
@@ -1743,7 +1805,9 @@ def __init__(self, context):
         # the directory name in the output).  Obviously if we don't have the
         # '-R' flag we should grab this but it's probably not worth parsing
         # the flags to ls for this.
-        deprecated(FileListing, "Please use the :class:`insights.parsers.ls.FileListing instead.", "3.5.0")
+        deprecated(
+            FileListing, "Please use the :class:`insights.parsers.ls.FileListing instead.", "3.5.0"
+        )
         self.first_path = None
         path_re = re.compile(r'ls_-\w+(?P<path>.*)$')
         match = path_re.search(context.path)
diff --git a/insights/parsers/lspci.py b/insights/parsers/lspci.py
index dff28da6c4..3753dba1ff 100644
--- a/insights/parsers/lspci.py
+++ b/insights/parsers/lspci.py
@@ -12,6 +12,7 @@
 -------------------------------------
 
 """
+
 import re
 
 from insights.core import CommandParser, TextFileOutput
@@ -73,11 +74,13 @@ class LsPci(CommandParser, TextFileOutput):
             values of `self.data`
 
     """
+
     def parse_content(self, content):
         # Use all the defined scanners to search the log file, setting the
         # properties defined in the scanner.
-        self.lines = [l for l in content if len(l) > 0 and l[0].isdigit()]
-        for scanner in self.scanners:
+        content = get_active_lines(content)
+        self.lines = [l for l in content if l and l[0].isdigit()]
+        for scanner in self.scanners.values():
             scanner(self)
         # Parse kernel driver lines
         self.data = {}
@@ -86,15 +89,13 @@ def parse_content(self, content):
 
         fields = ["Subsystem", "Kernel driver in use", "Kernel modules"]
 
-        for line in get_active_lines(content):
+        for line in content:
             parts = line.split()
 
             if slot_re.match(parts[0]):
                 slot = parts[0]
                 device_details = line.split(None, 1)[-1]  # keep the raw line
-                self.data[slot] = {
-                        'Slot': slot,
-                        'Dev_Details': device_details.lstrip()}
+                self.data[slot] = {'Slot': slot, 'Dev_Details': device_details.lstrip()}
             elif slot and (line.split(":")[0].strip() in fields):
                 parts = line.split(':')
                 self.data[slot][parts[0]] = parts[1].lstrip()
@@ -180,6 +181,7 @@ class LsPciVmmkn(CommandParser, list):
     Attributes:
 
     """
+
     def parse_content(self, content):
         # Remove the white-trailing of the output
         while content and not content[-1].strip():
diff --git a/insights/tests/test_lazylogfileoutput.py b/insights/tests/test_lazylogfileoutput.py
new file mode 100644
index 0000000000..7508d52b5e
--- /dev/null
+++ b/insights/tests/test_lazylogfileoutput.py
@@ -0,0 +1,362 @@
+# -*- coding: UTF-8 -*-
+import pytest
+
+from datetime import datetime
+
+from insights.core import LazyLogFileOutput
+from insights.core.exceptions import ParseException
+from insights.tests import context_wrap
+
+
+class FakeMessagesClass(LazyLogFileOutput):
+    time_format = '%b %d %H:%M:%S'
+
+    def bad_get_after(self, timestamp, lines=None):
+        self.time_format = '%q/%1 %z:%v:%j'
+        return super(FakeMessagesClass, self).get_after(timestamp, lines)
+
+    def superget(self, s):
+        for line in self.lines:
+            if s in line:
+                parts = line.split(None, 6)
+                yield {
+                    'timestamp': ' '.join(parts[0:3]),
+                    'hostname': parts[3],
+                    'service': parts[4][:-1],  # strip colon
+                    'message': parts[5],
+                }
+
+    def listget(self, s):
+        # Silly function to split lines containing s.
+        for line in self.lines:
+            if s in line:
+                yield s.split(None, 6)
+
+
+MESSAGES = """
+Mar 27 03:18:15 system rsyslogd: [origin software="rsyslogd" swVersion="5.8.10" x-pid="1870" x-info="http://www.rsyslog.com"] first
+Mar 27 03:18:16 system rsyslogd-2177: imuxsock lost 141 messages from pid 55082 due to rate-limiting
+Mar 27 03:18:19 system rsyslogd-2177: imuxsock begins to drop messages from pid 55082 due to rate-limiting
+Mar 27 03:18:21 system pulp: pulp.server.db.connection:INFO: Attempting Database connection with seeds = localhost:27017
+Mar 27 03:18:21 system pulp: pulp.server.db.connection:INFO: Connection Arguments: {'max_pool_size': 10}
+Mar 27 03:18:21 system pulp: pulp.server.db.connection:INFO: Database connection established with: seeds = localhost:27017, name = pulp_database
+Mar 27 03:18:22 system rsyslogd-2177: imuxsock lost 145 messages from pid 55082 due to rate-limiting
+Mar 27 03:18:24 system puppet-master[48226]: Setting manifest is deprecated in puppet.conf. See http://links.puppetlabs.com/env-settings-deprecations
+Mar 27 03:18:24 system puppet-master[48226]:    (at /usr/lib/ruby/site_ruby/1.8/puppet/settings.rb:1095:in `issue_deprecations')
+Mar 27 03:18:24 system puppet-master[48226]: Setting modulepath is deprecated in puppet.conf. See http://links.puppetlabs.com/env-settings-deprecations
+Mar 27 03:18:24 system puppet-master[48226]:    (at /usr/lib/ruby/site_ruby/1.8/puppet/settings.rb:1095:in `issue_deprecations')
+Mar 27 03:18:24 system puppet-master[48226]: Setting config_version is deprecated in puppet.conf. See http://links.puppetlabs.com/env-settings-deprecations
+Mar 27 03:18:24 system puppet-master[48226]:    (at /usr/lib/ruby/site_ruby/1.8/puppet/settings.rb:1095:in `issue_deprecations')
+Mar 27 03:18:25 system rsyslogd-2177: imuxsock begins to drop messages from pid 55082 due to rate-limiting
+Mar 27 03:39:43 system pulp: pulp.server.webservices.middleware.exception:ERROR:   File "/usr/lib/python2.6/site-packages/web/application.py", line 230, in handle
+     return self._delegate(fn, self.fvars, args)
+   File "/usr/lib/python2.6/site-packages/web/application.py", line 405, in _delegate
+     return handle_class(f)
+   File "/usr/lib/python2.6/site-packages/web/application.py", line 396, in handle_class
+     return tocall(*args)
+   File "/usr/lib/python2.6/site-packages/pulp/server/webservices/controllers/decorators.py", line 227, in _auth_decorator
+     value = method(self, *args, **kwargs)
+   File "/usr/lib/python2.6/site-packages/pulp/server/webservices/controllers/consumers.py", line 503, in GET
+     profile = manager.get_profile(consumer_id, content_type)
+   File "/usr/lib/python2.6/site-packages/pulp/server/managers/consumer/profile.py", line 120, in get_profile
+     raise MissingResource(profile_id=profile_id)
+MissingResource: Missing resource(s): profile_id={'content_type': u'rpm', 'consumer_id': u'1786cd7f-2ab2-4212-9798-c0a454e97900'}
+Mar 27 03:39:43 system rsyslogd-2177: imuxsock begins to drop messages from pid 55082 due to rate-limiting
+Mar 27 03:39:46 system rsyslogd-2177: imuxsock lost 165 messages from pid 55082 due to rate-limiting
+Mar 27 03:39:49 system rsyslogd-2177: imuxsock begins to drop messages from pid 55082 due to rate-limiting
+Mar 27 03:49:10 system pulp: pulp.server.webservices.middleware.exception:ERROR: Missing resource(s): profile_id={'content_type': u'rpm', 'consumer_id': u'79d5aed1-5631-4f40-b970-585ee974eb87'}
+"""
+
+
+def test_lazylogfileoutput_scanners():
+    # Messages that are present can be kept
+    FakeMessagesClass.keep_scan('puppet_master_logs', ' puppet-master')
+    # Messages that are absent still turn up as an empty list
+    FakeMessagesClass.keep_scan('kernel_logs', ' kernel')
+    # Token scan of something that's present
+    FakeMessagesClass.token_scan(
+        'middleware_exception_present', 'pulp.server.webservices.middleware.exception'
+    )
+    # Token scan of something that's absent
+    FakeMessagesClass.token_scan('cron_present', 'CRONTAB')
+
+    # Check that duplicate scanners raise a value exception
+    with pytest.raises(ValueError) as exc:
+        FakeMessagesClass.keep_scan('kernel_logs', ' kernel')
+    assert "'kernel_logs' is already a registered scanner key" in str(exc)
+
+    def count_lost_messages(self):
+        imuxsock_lines = 0
+        lost_messages = 0
+        for line in self.lines:
+            if 'imuxsock lost' in line:
+                parts = line.split(None)
+                if parts[7].isdigit():
+                    imuxsock_lines += 1
+                    lost_messages += int(parts[7])
+        return "lost {msgs} messages in {lines} lines".format(
+            msgs=lost_messages, lines=imuxsock_lines
+        )
+
+    FakeMessagesClass.scan('lost_messages', count_lost_messages)
+
+    ctx = context_wrap(MESSAGES, path='/var/log/messages')
+    log = FakeMessagesClass(ctx)
+    assert not hasattr(log, 'puppet_master_logs')
+    assert not hasattr(log, 'kernel_logs')
+    assert not hasattr(log, 'middleware_exception_present')
+    assert not hasattr(log, 'cron_present')
+    assert not hasattr(log, 'lost_messages')
+    log.do_scan('puppet_master_logs')
+    assert hasattr(log, 'puppet_master_logs')
+    log.do_scan('kernel_logs')
+    assert hasattr(log, 'kernel_logs')
+    log.do_scan('middleware_exception_present')
+    assert hasattr(log, 'middleware_exception_present')
+    log.do_scan('cron_present')
+    assert hasattr(log, 'cron_present')
+    log.do_scan('lost_messages')
+    assert hasattr(log, 'lost_messages')
+
+    assert len(log.puppet_master_logs) == 6
+    assert (
+        log.puppet_master_logs[0]['raw_message']
+        == 'Mar 27 03:18:24 system puppet-master[48226]: Setting manifest is deprecated in puppet.conf. See http://links.puppetlabs.com/env-settings-deprecations'
+    )
+    assert log.kernel_logs == []
+    assert log.middleware_exception_present
+    assert not log.cron_present
+    assert log.lost_messages == 'lost 451 messages in 3 lines'
+
+
+def test_lazylogfileoutput_scanners_list():
+    # Messages that are present can be kept
+    FakeMessagesClass.keep_scan('puppet_master_manifest_logs', ['puppet-master', 'manifest'])
+    FakeMessagesClass.keep_scan('puppet_master_first', ['puppet-master', 'first'])
+    # Messages that are present can be kept for any of the list
+    FakeMessagesClass.keep_scan('puppet_master_first_any', ['puppet-master', 'first'], check=any)
+    FakeMessagesClass.keep_scan(
+        'puppet_master_first_any_3', ['puppet-master', 'first'], num=3, check=any
+    )
+    # Token scan of something that's absent
+    FakeMessagesClass.token_scan('error_missing', ['ERROR', 'Missing'])
+    # Token scan of something that's absent
+    FakeMessagesClass.token_scan('error_info', ['ERROR', 'info'])
+    # Token scan of something that's absent for any of the list
+    FakeMessagesClass.token_scan('error_info_any', ['ERROR', 'info'], check=any)
+    # Get the last line
+    FakeMessagesClass.last_scan('puppet_master_manifest_last', ['puppet-master', 'manifest'])
+    # Get the last line for any of the list
+    FakeMessagesClass.last_scan(
+        'puppet_master_first_last_any', ['puppet-master', 'first'], check=any
+    )
+
+    ctx = context_wrap(MESSAGES, path='/var/log/messages')
+    log = FakeMessagesClass(ctx)
+
+    assert not hasattr(log, 'puppet_master_manifest_logs')
+    assert not hasattr(log, 'error_missing')
+    assert not hasattr(log, 'puppet_master_first')
+    assert not hasattr(log, 'error_info')
+    assert not hasattr(log, 'puppet_master_first_any')
+    assert not hasattr(log, 'puppet_master_first_any_3')
+    assert not hasattr(log, 'error_info_any')
+    assert not hasattr(log, 'puppet_master_manifest_last')
+    assert not hasattr(log, 'puppet_master_first_last_any')
+    # scan all and load content
+    log.do_scan()
+    assert hasattr(log, 'puppet_master_manifest_logs')
+    assert len(log.puppet_master_manifest_logs) == 1
+    assert hasattr(log, 'error_missing')
+    assert log.error_missing
+
+    assert hasattr(log, 'puppet_master_first')
+    assert len(log.puppet_master_first) == 0
+    assert hasattr(log, 'error_info')
+    assert log.error_info is False
+
+    assert hasattr(log, 'puppet_master_first_any')
+    assert len(log.puppet_master_first_any) == 7
+    assert hasattr(log, 'puppet_master_first_any_3')
+    assert len(log.puppet_master_first_any_3) == 3
+    assert hasattr(log, 'error_info_any')
+    assert log.error_info_any is True
+
+    assert hasattr(log, 'puppet_master_manifest_last')
+    assert 'puppet-master' in log.puppet_master_manifest_last['raw_message']
+    assert 'manifest' in log.puppet_master_manifest_last['raw_message']
+    assert hasattr(log, 'puppet_master_first_last_any')
+    assert 'puppet-master' in log.puppet_master_first_last_any['raw_message']
+
+
+def test_messages_get_after():
+    ctx = context_wrap(MESSAGES, path='/var/log/messages')
+    log = FakeMessagesClass(ctx)
+    assert len(log.lines) == 31
+
+    # Get lines after date
+    # Remember, logs with no date are assumed to be in year we give it
+    assert len(list(log.get_after(datetime(2017, 3, 27, 3, 39, 46)))) == 3
+    # Get subset of lines after date
+    pulp = log.get('pulp')  # includes /pulp/ in traceback
+    assert len(pulp) == 8
+    # Get lines after date, with one keyword
+    after = list(log.get_after(datetime(2017, 3, 27, 3, 20, 30), 'pulp'))
+    assert len(after) == 5
+    # Get lines after date, with one keyword
+    after = list(log.get_after(datetime(2017, 3, 27, 3, 40, 30), 'pulp'))
+    assert len(after) == 1
+
+    # Get lines after date, with one keywords list
+    after = list(log.get_after(datetime(2017, 3, 27, 3, 20, 30), ['pulp', 'ERROR']))
+    assert len(after) == 2
+    after = list(log.get_after(datetime(2017, 3, 27, 3, 40, 30), 'pulp'))
+    assert len(after) == 1
+    # No lines are found
+    assert list(log.get_after(datetime(2017, 3, 27, 3, 49, 46), 'pulp')) == []
+
+    tmp_time = datetime(2017, 3, 27, 3, 39, 46)
+    with pytest.raises(TypeError):
+        list(log.get_after(tmp_time, ['type=', False, 'AVC']))
+    with pytest.raises(TypeError):
+        list(log.get_after(tmp_time, set(['type=', 'AVC'])))
+
+
+def test_messages_get_after_bad_time_format():
+    ctx = context_wrap(MESSAGES, path='/var/log/messages')
+    log = FakeMessagesClass(ctx)
+    assert len(log.lines) == 31
+
+    # The timestamp format should only contain parts that get_after recognises
+    with pytest.raises(ParseException) as exc:
+        assert list(log.bad_get_after(datetime(2017, 3, 27, 3, 39, 46))) is None
+    assert 'get_after does not understand strptime format ' in str(exc)
+
+
+MESSAGES_ROLLOVER_YEAR = """
+Dec 31 21:43:00 duradm13 [CMA]: Logger failed to open catalog file
+Dec 31 22:03:05 duradm13 xinetd[21465]: START: bgssd pid=28021 from=10.20.40.7
+Dec 31 22:03:05 duradm13 xinetd[21465]: EXIT: bgssd status=0 pid=28021 duration=0(sec)
+Dec 31 23:03:07 duradm13 xinetd[21465]: START: bgssd pid=31307 from=10.20.40.7
+Dec 31 23:03:07 duradm13 xinetd[21465]: EXIT: bgssd status=0 pid=31307 duration=0(sec)
+Dec 31 23:07:00 duradm13 [CMA]: Logger failed to open catalog file
+Jan  1 00:00:00 duradm13 [CMA]: Logger failed to open catalog file
+Jan  1 00:03:09 duradm13 xinetd[21465]: START: bgssd pid=2203 from=10.20.40.7
+Jan  1 00:03:09 duradm13 xinetd[21465]: EXIT: bgssd status=0 pid=2203 duration=0(sec)
+Jan  1 00:11:45 duradm13 xinetd[21465]: START: vnetd pid=2670 from=10.20.40.36
+Jan  1 00:11:47 duradm13 xinetd[21465]: START: vnetd pid=2671 from=10.20.40.36
+Jan  1 00:11:48 duradm13 xinetd[21465]: EXIT: vnetd status=0 pid=2671 duration=1(sec)
+Jan  1 01:00:08 duradm13 xinetd[21465]: START: nrpe pid=6189 from=10.20.40.240
+Jan  1 01:00:08 duradm13 xinetd[21465]: EXIT: nrpe status=0 pid=6189 duration=0(sec)
+Jan  1 01:00:27 duradm13 xinetd[21465]: START: nrpe pid=6207 from=10.20.40.240
+Jan  1 01:00:27 duradm13 xinetd[21465]: EXIT: nrpe status=0 pid=6207 duration=0(sec)
+Jan  1 01:00:29 duradm13 xinetd[21465]: START: nrpe pid=6210 from=10.20.40.240
+Jan  1 01:00:29 duradm13 xinetd[21465]: EXIT: nrpe status=0 pid=6210 duration=0(sec)
+""".strip()
+
+
+def test_messages_log_time_wrap():
+    # Check rollover process for when dates in logs without a year go from
+    # December from January.
+    ctx = context_wrap(MESSAGES_ROLLOVER_YEAR, path='/var/log/messages')
+    log = FakeMessagesClass(ctx)
+    assert len(log.lines) == 18
+
+    # If sought date is in January, we should not get logs from December
+    found = list(log.get_after(datetime(2017, 1, 1, 1, 0, 0)))
+    assert len(found) == 6
+    # If sought date is in December, we should get logs from January
+    found = list(log.get_after(datetime(2017, 12, 31, 23, 0, 0)))
+    assert len(found) == 15
+
+
+HTTPD_ACCESS_LOG = """
+192.168.220.42 - - [14/Feb/2016:03:18:54 -0600] "POST /XMLRPC HTTP/1.1" 200 1381 "-" "rhn.rpclib.py/$Revision$"
+192.168.220.42 - - [14/Feb/2016:03:18:54 -0600] "POST /XMLRPC HTTP/1.1" 200 3282 "-" "rhn.rpclib.py/$Revision$"
+192.168.220.42 - - [14/Feb/2016:03:18:54 -0600] "POST /XMLRPC HTTP/1.1" 200 163 "-" "rhn.rpclib.py/$Revision$"
+192.168.220.42 - - [14/Feb/2016:03:18:54 -0600] "POST /XMLRPC HTTP/1.1" 200 138 "-" "rhn.rpclib.py/$Revision$"
+192.168.128.243 - - [14/Feb/2016:03:19:00 -0600] "POST /cobbler_api HTTP/1.1" 200 144 "-" "Java/1.6.0"
+192.168.128.243 - - [14/Feb/2016:03:19:00 -0600] "POST /cobbler_api HTTP/1.1" 200 129 "-" "Java/1.6.0"
+192.168.128.243 - - [14/Feb/2016:03:20:00 -0600] "POST /cobbler_api HTTP/1.1" 200 144 "-" "Java/1.6.0"
+192.168.128.243 - - [14/Feb/2016:03:20:00 -0600] "POST /cobbler_api HTTP/1.1" 200 129 "-" "Java/1.6.0"
+"""
+
+
+class FakeAccessLog(LazyLogFileOutput):
+    time_format = '%d/%b/%Y:%H:%M:%S'
+
+
+def test_logs_with_year():
+    ctx = context_wrap(HTTPD_ACCESS_LOG, path='/var/log/httpd/access_log')
+    log = FakeAccessLog(ctx)
+    assert len(log.lines) == 8
+
+    assert len(list(log.get_after(datetime(2016, 2, 14, 3, 18, 55)))) == 4
+
+
+DATE_CHANGE_MARIADB_LOG = """
+161109  9:25:42 [Warning] SSL error: SSL_CTX_set_default_verify_paths failed
+161109  9:25:42 [Note] WSREP: Service disconnected.
+161109  9:25:43 [Note] WSREP: Some threads may fail to exit.
+161109 14:28:24 InnoDB: Initializing buffer pool, size = 128.0M
+161109 14:28:24 InnoDB: Completed initialization of buffer pool
+2017-01-03 09:36:17 139651251140544 [Note] MariaDB 10.1.5 started successfully
+"""
+
+
+class FakeMariaDBLog(LazyLogFileOutput):
+    time_format = ['%y%m%d %H:%M:%S', '%Y-%m-%d %H:%M:%S']
+
+
+class DictMariaDBLog(LazyLogFileOutput):
+    time_format = {'old': '%y%m%d %H:%M:%S', 'new': '%Y-%m-%d %H:%M:%S'}
+
+
+class BadClassMariaDBLog(LazyLogFileOutput):
+    time_format = FakeAccessLog
+
+
+def test_logs_with_two_timestamps():
+    ctx = context_wrap(DATE_CHANGE_MARIADB_LOG, path='/var/log/mariadb/mariadb.log')
+    log = FakeMariaDBLog(ctx)
+    assert len(log.lines) == 6
+
+    new_ver = list(log.get_after(datetime(2017, 1, 1, 9, 0, 0)))
+    assert len(new_ver) == 1
+    assert (
+        new_ver[0]['raw_message']
+        == '2017-01-03 09:36:17 139651251140544 [Note] MariaDB 10.1.5 started successfully'
+    )
+
+    dict_log = DictMariaDBLog(ctx)
+    new_ver_d = list(dict_log.get_after(datetime(2017, 1, 1, 9, 0, 0)))
+    assert len(new_ver_d) == 1
+    assert (
+        new_ver_d[0]['raw_message']
+        == '2017-01-03 09:36:17 139651251140544 [Note] MariaDB 10.1.5 started successfully'
+    )
+
+    # get_after should only supply a string or list for time_format
+    with pytest.raises(ParseException) as exc:
+        logerr = BadClassMariaDBLog(ctx)
+        assert list(logerr.get_after(datetime(2017, 3, 27, 3, 39, 46))) is None
+    assert 'get_after does not recognise time formats of type ' in str(exc)
+
+
+MILLISECONDS_TOWER_LOG = """
+2020-05-28 19:25:36,892 WARNING  django.request Not Found: /api/v1/me/
+2020-05-28 19:25:46,944 INFO     awx.api.authentication User admin performed a GET to /api/v2/activity_stream/ through the API
+2020-05-28 19:33:03,125 INFO     awx.api.authentication User admin performed a GET to /api/v2/job_templates/7/survey_spec/ through the API
+2020-05-28 19:33:03,413 INFO     awx.api.authentication User admin performed a GET to /api/v2/projects/ through the API
+""".strip()
+
+
+class FakeTowerLog(LazyLogFileOutput):
+    time_format = '%Y-%m-%d %H:%M:%S,%f'
+
+
+def test_logs_with_milliseconds():
+    ctx = context_wrap(MILLISECONDS_TOWER_LOG, path='/var/log/tower/tower.log')
+    log = FakeTowerLog(ctx)
+    assert len(log.lines) == 4
+    assert len(list(log.get_after(datetime(2020, 5, 28, 19, 25, 46, 944)))) == 3