Skip to content

Commit

Permalink
feat: support line filter in spec Cleaner (#4299)
Browse files Browse the repository at this point in the history
- RHINENG-14668
  for each filter, keep 10k matched lines by default for now.
  don't remove the "grep -F" as it's faster and can be used
  to reduce the file size before cleaning
- Add test to verify the count of matched lines
  Move all spec relevant tests to tests/specs
  Move all core relevant tests to tests/core

Signed-off-by: Xiangce Liu <[email protected]>
(cherry picked from commit 55dc28d)
(cherry picked from commit c34edd3)
  • Loading branch information
xiangce committed Jan 2, 2025
1 parent 2e957a0 commit 78a64cd
Show file tree
Hide file tree
Showing 29 changed files with 733 additions and 242 deletions.
10 changes: 9 additions & 1 deletion insights/core/filters.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@
``INSIGHTS_FILTERS_ENABLED=False``. This means that no datasources will be
filtered even if filters are defined for them.
"""

import os
import pkgutil
import six
Expand All @@ -49,8 +50,13 @@
_CACHE = {}
FILTERS = defaultdict(set)
ENABLED = parse_bool(os.environ.get("INSIGHTS_FILTERS_ENABLED"), default=True)
MATCH_COUNT = 10000


# TODO:
# - support specifying the max match number of filtered lines
# add_filter(Messages, "Such an Error", 10)
# def add_filter(component, patterns, match_count=MATCH_COUNT):
def add_filter(component, patterns):
"""
Add a filter or list of filters to a component. When the component is
Expand All @@ -66,6 +72,7 @@ def add_filter(component, patterns):
patterns (str, [str]): A string, list of strings, or set of strings to
add to the datasource's filters.
"""

def inner(comp, patterns):
if comp in _CACHE:
del _CACHE[comp]
Expand All @@ -86,7 +93,7 @@ def inner(comp, patterns):
FILTERS[comp] |= patterns

def get_dependency_datasources(comp):
""""Get (all) the first depended datasource"""
"""Get (all) the first depended datasource"""
dep_ds = set()
if plugins.is_datasource(comp):
dep_ds.add(comp)
Expand Down Expand Up @@ -140,6 +147,7 @@ def get_filters(component):
Returns:
set: The set of filters defined for the datasource
"""

def inner(c, filters=None):
filters = filters or set()

Expand Down
89 changes: 60 additions & 29 deletions insights/core/spec_cleaner.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,14 +4,15 @@
The following processes will be applied to clean the collected specs:
- Redaction
This is a must-be-done operation to all the collected specs.
- Redaction
This is a must-be-done operation to all the collected specs.
- Obfuscation
Obfuscate the IP or Hostname appears in the spec content according to the
specs native requirement and user configuration.
- Obfuscation
Obfuscate the IP or Hostname appears in the spec content according to the
specs native requirement and user configuration.
"""

import logging
import hashlib
import json
Expand Down Expand Up @@ -55,8 +56,8 @@ class Cleaner(object):
def __init__(self, config, rm_conf, fqdn=None):
self.report_dir = '/tmp'
self.rhsm_facts_file = getattr(
config, 'rhsm_facts_file',
os.path.join(self.report_dir, 'insights-client.facts'))
config, 'rhsm_facts_file', os.path.join(self.report_dir, 'insights-client.facts')
)
# Obfuscation - set: ip and hostname only
self.obfuscate = set()
self.obfuscate.add('ip') if config and config.obfuscate else None
Expand All @@ -76,7 +77,7 @@ def __init__(self, config, rm_conf, fqdn=None):
# Keyword replacement does NOT depend on "obfuscate=True"
keywords = rm_conf.get('keywords')
self.kw_db = dict() # keyword database
self.kws = set() # keywords that have been replaced
self.kws = set() # keywords that have been replaced
self._keywords2db(keywords)

# Obfuscation
Expand All @@ -103,8 +104,8 @@ def __init__(self, config, rm_conf, fqdn=None):
if config and config.obfuscate_hostname and self.fqdn:
self._domains2db()
hashed_hostname = hashlib.sha1(
self.fqdn.encode('utf-8')
if six.PY3 else self.fqdn).hexdigest()[:12]
self.fqdn.encode('utf-8') if six.PY3 else self.fqdn
).hexdigest()[:12]
self.obfuscated_fqdn = '{0}.example.com'.format(hashed_hostname)
self.hostname_count += 1
self.hn_db[self.obfuscated_fqdn] = self.fqdn
Expand Down Expand Up @@ -137,9 +138,9 @@ def _ip2db(self, ip):
if v == ip_num:
ret_ip = self._int2ip(k)
ip_found = True
if ip_found: # the entry already existed
if ip_found: # the entry already existed
return ret_ip
else: # the entry did not already exist
else: # the entry did not already exist
if len(self.ip_db) > 0:
new_ip = max(db.keys()) + 1
else:
Expand All @@ -151,7 +152,7 @@ def _ip2db(self, ip):
def _sub_ip(self, line):
'''
This will substitute an obfuscated IP for each instance of a given IP in a file
This is called in the self._clean_line function, along with user _sub_* functions to scrub a given
This is called in the self.clean_* function, along with user _sub_* functions to scrub a given
line in a file.
It scans a given line and if an IP exists, it obfuscates the IP using _ip2db and returns the altered line
'''
Expand Down Expand Up @@ -213,7 +214,7 @@ def _sub_ip_netstat(self, line):
if idx == len(line):
break
c = line[idx]
line = line[0:idx] + line[(idx + numspaces):]
line = line[0:idx] + line[(idx + numspaces) :]

else:
line = line.replace(ip, new_ip)
Expand All @@ -232,7 +233,9 @@ def _domains2db(self):
# we will add the root domain for an FQDN as well.
if self.domain is not None:
self.dn_db[self.obfuscated_domain] = self.domain
logger.debug("Obfuscated Domain Created - %s -> %s" % (self.domain, self.obfuscated_domain))
logger.debug(
"Obfuscated Domain Created - %s -> %s" % (self.domain, self.obfuscated_domain)
)

self.domain_count = len(self.dn_db)
return True
Expand All @@ -252,7 +255,8 @@ def _hn2db(self, hn):
if hn_found:
return ret_hn
else:
self.hostname_count += 1 # we have a new hostname, so we increment the counter to get the host ID number
# we have a new hostname, so we increment the counter to get the host ID number
self.hostname_count += 1
o_domain = self.obfuscated_domain
for od, d in self.dn_db.items():
if d in hn: # never false
Expand All @@ -279,7 +283,8 @@ def _sub_hostname(self, line):
logger.debug("Obfuscating FQDN - %s > %s", hn, new_hn)
line = line.replace(hn, new_hn)
if self.hostname:
line = line.replace(self.hostname, self._hn2db(self.fqdn)) # catch any non-fqdn instances of the system hostname
# catch any non-fqdn instances of the system hostname
line = line.replace(self.hostname, self._hn2db(self.fqdn))
return line
except Exception as e: # pragma: no cover
logger.warning(e)
Expand Down Expand Up @@ -347,6 +352,22 @@ def _redact_line(self, line):
# 3. keyword replacement redaction
return self._sub_keywords(line)

def _filter_line_per_allowlist(self, line, allow_info):
# filter line as per the allow list specified by plugins
if not line:
return line
if allow_info:
for a_key in list(allow_info.keys()):
# keep line when any filter match
# FIXME:
# Considering performance, din't handle multiple filters in one same line
if a_key in line:
allow_info[a_key] -= 1
# stop checking it when enough lines contain the key were found
allow_info.pop(a_key) if allow_info[a_key] == 0 else None
return line
# discard line when none filters found

def get_obfuscate_functions(self, filename='', no_obfuscate=None):
"""
Return the list of required obfuscation function according to the
Expand All @@ -359,23 +380,32 @@ def get_obfuscate_functions(self, filename='', no_obfuscate=None):
# Get the actual obfuscate list setting for this file
obfs = set(self.obfuscate) - set(no_obfuscate or [])
# IP obfuscation entry
obf_funcs.append(self._sub_ip_netstat if filename.endswith("netstat_-neopa") else self._sub_ip) if "ip" in obfs else None
(
obf_funcs.append(
self._sub_ip_netstat if filename.endswith("netstat_-neopa") else self._sub_ip
)
if "ip" in obfs
else None
)
# Hostname obfuscation entry
obf_funcs.append(self._sub_hostname) if "hostname" in obfs else None
return obf_funcs

def clean_content(self, lines, obf_funcs=None, no_redact=False):
def clean_content(self, lines, obf_funcs=None, no_redact=False, allowlist=None):
"""
Clean lines one by one according to the configuration, the cleaned
lines will be returned.
"""

def _clean_line(_line):
# 1. Do Redaction by default, unless "no_redact=True"
if _line and not no_redact:
_line = self._redact_line(_line)
# 2. Do Obfuscation as per the "obf_funcs"
_line = self._obfuscate_line(_line, obf_funcs or [])
return _line
# 2. Do filtering as per allowlist got from "filters.yaml"
if _line and allowlist is not None:
_line = self._filter_line_per_allowlist(_line, allowlist)
# 3. Do Obfuscation as per the "obf_funcs"
return self._obfuscate_line(_line, obf_funcs or [])

# handle single string
if not isinstance(lines, list):
Expand All @@ -391,7 +421,7 @@ def _clean_line(_line):
# All lines blank
return []

def clean_file(self, _file, no_obfuscate=None, no_redact=False):
def clean_file(self, _file, no_obfuscate=None, no_redact=False, allowlist=None):
"""
Clean a file according to the configuration, the file will be updated
directly with the cleaned content.
Expand All @@ -405,7 +435,12 @@ def clean_file(self, _file, no_obfuscate=None, no_redact=False):
try:
with open(_file, 'r') as fh:
raw_data = fh.readlines()
content = self.clean_content(raw_data, obf_funcs, no_redact)
content = self.clean_content(
raw_data,
obf_funcs=obf_funcs,
no_redact=no_redact,
allowlist=allowlist,
)
except Exception as e: # pragma: no cover
logger.warning(e)
raise Exception("Error: Cannot Open File for Cleaning: %s" % _file)
Expand Down Expand Up @@ -437,11 +472,7 @@ def generate_rhsm_facts(self):

ip_block = []
for k, v in self.ip_db.items():
ip_block.append(
{
'original': self._int2ip(v),
'obfuscated': self._int2ip(k)
})
ip_block.append({'original': self._int2ip(v), 'obfuscated': self._int2ip(k)})

facts = {
'insights_client.hostname': self.fqdn,
Expand Down
61 changes: 37 additions & 24 deletions insights/core/spec_factory.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,8 @@ def __init__(self):
self.loaded = False
self._content = None
self._exception = None
self._filterable = False
self._filters = set()

def load(self):
raise NotImplementedError()
Expand All @@ -82,27 +84,34 @@ def _clean_content(self):
collection.
"""
content = self.content # load first for debugging info order
if isinstance(self.ctx, HostContext) and self.ds and self.cleaner:
if content and isinstance(self.ctx, HostContext) and self.ds and self.cleaner:
cleans = []
# Redacting?
no_red = getattr(self.ds, 'no_redact', False)
cleans.append("Redact") if not no_red else None
# Obfuscating?
no_obf = getattr(self.ds, 'no_obfuscate', [])
cleans.append("Obfuscate") if len(no_obf) < 2 else None
# Filtering?
allowlist = None
if self._filterable:
cleans.append("Filter")
allowlist = dict((f, filters.MATCH_COUNT) for f in self._filters)
# Cleaning - Entry
if cleans:
log.debug("Cleaning (%s) %s", "/".join(cleans), "/" + self.relative_path)
log.debug("Cleaning (%s) %s", "/".join(cleans), self.relative_path)
content = self.cleaner.clean_content(
content,
content[::-1], # Scan from bottom
allowlist=allowlist,
obf_funcs=self.cleaner.get_obfuscate_functions(self.relative_path, no_obf),
no_redact=no_red,
)
)[::-1]
# ^ Reverse to the right order then
if len(content) == 0:
log.debug("Skipping %s due to empty after cleaning", self.path)
raise ContentException("Empty after cleaning: %s" % self.path)
else:
log.debug("Skipping cleaning %s", "/" + self.relative_path)
log.debug("Skipping cleaning %s", self.relative_path)
return content

@property
Expand Down Expand Up @@ -195,6 +204,12 @@ def __init__(self, relative_path, root="/", save_as=None, ds=None, ctx=None, cle
self.relative_path = relative_path.lstrip("/")
self.save_as = save_as
self.file_name = os.path.basename(self.path)
self._filterable = (
any(s.filterable for s in dr.get_registry_points(self.ds))
if self.ds and filters.ENABLED
else False
)
self._filters = filters.get_filters(self.ds) if self.ds else set()

self.validate()

Expand All @@ -205,12 +220,7 @@ def validate(self):
# 2. Check only when collecting
if isinstance(self.ctx, HostContext):
# 2.1 No Filters for 'filterable=True' Specs
if (
self.ds
and filters.ENABLED
and any(s.filterable for s in dr.get_registry_points(self.ds))
and not filters.get_filters(self.ds)
):
if self._filterable and not self._filters:
raise NoFilterException("Skipping %s due to no filters." % dr.get_name(self.ds))
# 2.2 Customer Prohibits Collection
if not blacklist.allow_file("/" + self.relative_path):
Expand Down Expand Up @@ -298,10 +308,13 @@ class TextFileProvider(FileProvider):
"""

def create_args(self):
"""
The "grep" is faster and can be used to shrink the size of the file.
"""
args = []
_filters = "\n".join(filters.get_filters(self.ds))
if _filters:
args.append(["grep", "-F", _filters, self.path])
if self._filters:
log.debug("Pre-filtering %s", self.relative_path)
args.append(["grep", "-F", "\n".join(self._filters), self.path])

return args

Expand Down Expand Up @@ -390,6 +403,12 @@ def __init__(
self._misc_settings()
self._content = None
self._env = self.create_env()
self._filterable = (
any(s.filterable for s in dr.get_registry_points(self.ds))
if self.ds and filters.ENABLED
else False
)
self._filters = filters.get_filters(self.ds) if self.ds else set()

self.validate()

Expand All @@ -405,12 +424,7 @@ def validate(self):
# 2. Check only when collecting
if isinstance(self.ctx, HostContext):
# 2.1 No Filters for 'filterable=True' Specs
if (
self.ds
and filters.ENABLED
and any(s.filterable for s in dr.get_registry_points(self.ds))
and not filters.get_filters(self.ds)
):
if self._filterable and not self._filters:
raise NoFilterException("Skipping %s due to no filters." % dr.get_name(self.ds))
# 2.2 Customer Prohibits Collection
if not blacklist.allow_command(self.cmd):
Expand All @@ -420,10 +434,9 @@ def validate(self):
def create_args(self):
command = [shlex.split(self.cmd)]

if self.split:
_filters = "\n".join(filters.get_filters(self.ds))
if _filters:
command.append(["grep", "-F", _filters])
if self.split and self._filters:
log.debug("Pre-filtering %s", self.relative_path)
command.append(["grep", "-F", "\n".join(self._filters)])

return command

Expand Down
File renamed without changes.
Loading

0 comments on commit 78a64cd

Please sign in to comment.