feat: enhance add_filter to specify the max matched lines
- Added a new keyword argument "max_match=MAX_MATCH" to `add_filter`.
  When `max_match` is declared, at most `max_match` lines containing
  the patterns will be kept during collection. If the same pattern is
  declared more than once, the largest `max_match` is kept as the
  final scanning limit. If no `max_match` is passed to `add_filter`,
  filters.MAX_MATCH (=10000) is used as the default.
- Added a new keyword argument "with_matches=False" to `get_filters`.
  When "with_matches=True" is specified, `get_filters` returns a dict
  whose values are the max scanning numbers for each filter pattern.
- Updated the existing tests.
- RHINENG-14669

Signed-off-by: Xiangce Liu <[email protected]>
xiangce committed Dec 30, 2024
1 parent 56c600f commit 81aba8c
Showing 24 changed files with 415 additions and 200 deletions.
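
At a glance, filters change from plain pattern sets to pattern-to-budget mappings. A minimal usage sketch of the new API, following the commit message above (Specs.messages stands in for any filterable datasource):

from insights.core import filters
from insights.specs import Specs

# Keep at most 100 lines containing "ERROR"; "WARN" falls back to the
# default budget of filters.MAX_MATCH (10000).
filters.add_filter(Specs.messages, "ERROR", max_match=100)
filters.add_filter(Specs.messages, "WARN")

filters.get_filters(Specs.messages)                     # {'ERROR', 'WARN'}
filters.get_filters(Specs.messages, with_matches=True)  # {'ERROR': 100, 'WARN': 10000}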
70 changes: 41 additions & 29 deletions insights/core/filters.py
@@ -48,16 +48,12 @@
from insights.util import parse_bool

_CACHE = {}
FILTERS = defaultdict(set)
FILTERS = defaultdict(dict)
ENABLED = parse_bool(os.environ.get("INSIGHTS_FILTERS_ENABLED"), default=True)
MATCH_COUNT = 10000
MAX_MATCH = 10000


# TODO:
# - support specifying the max match number of filtered lines
# add_filter(Messages, "Such an Error", 10)
# def add_filter(component, patterns, match_count=MATCH_COUNT):
def add_filter(component, patterns):
def add_filter(component, patterns, max_match=MAX_MATCH):
"""
Add a filter or list of filters to a component. When the component is
a datasource, the filter will be directly added to that datasource.
@@ -71,8 +67,26 @@ def add_filter(component, patterns):
parser or combiner.
patterns (str, [str]): A string, list of strings, or set of strings to
add to the datasource's filters.
max_match (int): The maximum number of matched lines to keep.
MAX_MATCH by default.
"""

def get_dependency_datasources(comp):
"""Get (all) the first depended datasource"""
dep_ds = set()
if plugins.is_datasource(comp):
dep_ds.add(comp)
return dep_ds
for dep in dr.get_dependencies(comp):
dep_ds.update(get_dependency_datasources(dep))
return dep_ds

def none_max(a, b):
return a if b is None else b if a is None else max(a, b)

def max_matchs(da, db):
return dict((k, none_max(da.get(k), db.get(k))) for k in set(da.keys()).union(db.keys()))

def inner(comp, patterns):
if comp in _CACHE:
del _CACHE[comp]
@@ -82,25 +96,16 @@ def inner(comp, patterns):
raise TypeError("Filter patterns must be of type string, list, or set.")

if isinstance(patterns, six.string_types):
patterns = set([patterns])
patterns = {patterns: max_match}
elif isinstance(patterns, list):
patterns = set(patterns)
patterns = dict((pt, max_match) for pt in patterns)
# here patterns is a dict

for pat in patterns:
if not pat:
raise Exception("Filter patterns must not be empty.")

FILTERS[comp] |= patterns

def get_dependency_datasources(comp):
"""Get (all) the first depended datasource"""
dep_ds = set()
if plugins.is_datasource(comp):
dep_ds.add(comp)
return dep_ds
for dep in dr.get_dependencies(comp):
dep_ds.update(get_dependency_datasources(dep))
return dep_ds
FILTERS[comp].update(max_matchs(FILTERS[comp], patterns))

if not plugins.is_datasource(component):
deps = get_dependency_datasources(component)
@@ -127,7 +132,7 @@ def get_dependency_datasources(comp):
_add_filter = add_filter


def get_filters(component):
def get_filters(component, with_matches=False):
"""
Get the set of filters for the given datasource.
@@ -143,13 +148,19 @@
Args:
component (a datasource): The target datasource
with_matches (bool): Whether to also return the max match count of
each filter. False by default.
Returns:
set: The set of filters defined for the datasource
(set or dict): When `with_matches=False`, returns only the set of
filters defined for the datasource.
When `with_matches=True`, returns a dict that maps
each filter to the max match count specified by
`add_filter`.
"""

def inner(c, filters=None):
filters = filters or set()
filters = filters or dict()

if hasattr(c, 'filterable') and c.filterable is False:
return filters
@@ -161,20 +172,21 @@ def inner(c, filters=None):
return filters

if c in FILTERS:
filters |= FILTERS[c]
filters.update(FILTERS[c])

for d in dr.get_dependents(c):
filters |= inner(d, filters)
filters.update(inner(d, filters))

return filters

if not component:
# No filters for nothing
return set()
return dict() if with_matches else set()

if component not in _CACHE:
_CACHE[component] = inner(component)

return _CACHE[component]
return _CACHE[component] if with_matches else set(_CACHE[component].keys())


def apply_filters(target, lines):
@@ -202,7 +214,7 @@ def loads(string):
"""Loads the filters dictionary given a string."""
d = _loads(string)
for k, v in d.items():
FILTERS[dr.get_component(k) or k] = set(v)
FILTERS[dr.get_component(k) or k] = v


def load(stream=None):
@@ -222,7 +234,7 @@ def dumps():
"""Returns a string representation of the sorted FILTERS dictionary."""
d = {}
for k, v in FILTERS.items():
d[dr.get_name(k)] = sorted(v)
d[dr.get_name(k)] = dict(sorted(v.items()))
return _dumps(d)


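The none_max/max_matchs helpers above implement the "largest budget wins" merge rule from the commit message, and dumps()/loads() now serialize the per-pattern counts rather than bare pattern lists. A small sketch of both behaviors under those definitions (merge mirrors max_matchs):

def none_max(a, b):
    # max() that tolerates a None on either side
    return a if b is None else b if a is None else max(a, b)

def merge(da, db):
    return dict((k, none_max(da.get(k), db.get(k))) for k in set(da).union(db))

old = {"ERROR": 100}
new = {"ERROR": 10, "WARN": 200}
assert merge(old, new) == {"ERROR": 100, "WARN": 200}  # the larger count wins

# Serialized form: component name -> {pattern: max_match}, e.g.
# {"insights.specs.Specs.messages": {"ERROR": 100, "WARN": 200}}
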
4 changes: 3 additions & 1 deletion insights/core/spec_cleaner.py
@@ -11,6 +11,8 @@
Obfuscate the IP or Hostname appears in the spec content according to the
specs native requirement and user configuration.
- Filtering
Filter lines according to the allow list read from the "filters.yaml"
"""

import logging
@@ -360,7 +362,7 @@ def _filter_line_per_allowlist(self, line, allow_info):
for a_key in list(allow_info.keys()):
# keep the line when any filter matches
# FIXME:
# Considering performance, din't handle multiple filters in one same line
# Considering performance, didn't handle multiple filters in the same line
if a_key in line:
allow_info[a_key] -= 1
# stop checking this key once enough lines containing it were found
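The cleaner consumes that allowlist by counting each pattern's budget down on every match and dropping the pattern once its budget is spent. A simplified sketch of the loop (not the exact insights implementation; per the FIXME above, only the first matching filter on a line is counted):

def filter_lines(lines, allow_info):
    # allow_info: {pattern: remaining_budget}, e.g. the dict from get_filters
    kept = []
    for line in lines:
        for key in list(allow_info.keys()):
            if key in line:
                allow_info[key] -= 1
                if allow_info[key] == 0:
                    del allow_info[key]  # budget spent; stop checking this key
                kept.append(line)
                break  # one matching pattern is enough to keep the line
        if not allow_info:
            break  # every budget is exhausted; the rest is discarded
    return kept

assert filter_lines(["a x", "b x", "c x"], {"x": 2}) == ["a x", "b x"]
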
8 changes: 4 additions & 4 deletions insights/core/spec_factory.py
Expand Up @@ -61,7 +61,7 @@ def __init__(self):
self._content = None
self._exception = None
self._filterable = False
self._filters = set()
self._filters = dict()

def load(self):
raise NotImplementedError()
@@ -95,7 +95,7 @@ def _clean_content(self):
allowlist = None
if self._filterable:
cleans.append("Filter")
allowlist = dict((f, filters.MATCH_COUNT) for f in self._filters)
allowlist = self._filters
# Cleaning - Entry
if cleans:
log.debug("Cleaning (%s) %s", "/".join(cleans), self.relative_path)
@@ -208,7 +208,7 @@ def __init__(self, relative_path, root="/", save_as=None, ds=None, ctx=None, cle
if self.ds and filters.ENABLED
else False
)
self._filters = filters.get_filters(self.ds) if self.ds else set()
self._filters = filters.get_filters(self.ds, True) if self.ds else set()

self.validate()

@@ -361,7 +361,7 @@ def __init__(
if self.ds and filters.ENABLED
else False
)
self._filters = filters.get_filters(self.ds) if self.ds else set()
self._filters = filters.get_filters(self.ds, True) if self.ds else set()

self.validate()

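Because get_filters(self.ds, True) already returns pattern-to-budget pairs, the provider can hand self._filters to the cleaner as-is instead of pairing every filter with the global MATCH_COUNT. A condensed before/after sketch (the fallback uses dict() here to match the new default of _filters):

# Before: one global budget for every pattern.
allowlist = dict((f, filters.MATCH_COUNT) for f in self._filters)

# After: per-pattern budgets flow straight from the add_filter declarations.
self._filters = filters.get_filters(self.ds, True) if self.ds else dict()
allowlist = self._filters  # e.g. {"ERROR": 100, "WARN": 10000}
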
73 changes: 45 additions & 28 deletions insights/tests/__init__.py
@@ -36,11 +36,14 @@

def _intercept_add_filter(func):
@wraps(func)
def inner(component, pattern):
ret = add_filter(component, pattern)
def inner(component, pattern, max_match=filters.MAX_MATCH):
ret = add_filter(component, pattern, max_match=max_match)
calling_module = inspect.stack()[1][0].f_globals.get("__name__")
ADDED_FILTERS[calling_module] |= set(r for r in dr.get_registry_points(component) if r.filterable)
ADDED_FILTERS[calling_module] |= set(
r for r in dr.get_registry_points(component) if r.filterable
)
return ret

return inner
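
The test interceptor above just threads the new keyword through while recording which component each test module filtered. The same wraps-based pattern in isolation (names here are illustrative, not the insights API):

from functools import wraps

REGISTERED = []

def intercept(add_filter_func):
    @wraps(add_filter_func)
    def inner(component, pattern, max_match=10000):
        REGISTERED.append(component)  # remember which component got a filter
        return add_filter_func(component, pattern, max_match=max_match)

    return inner

Spelling max_match out in the wrapper's signature, rather than hiding it in **kwargs, keeps the default visible and lets callers omit it exactly as with the real add_filter.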


@@ -51,6 +54,7 @@ def inner(ds, pattern):
calling_module = inspect.stack()[1][0].f_globals.get("__name__")
ADDED_FILTERS[calling_module].add(ds)
return ret

return inner


@@ -92,8 +96,11 @@ def _beautify_deep_compare_diff(result, expected):
diff.append('\tkey "{0}" not in Result;'.format(k))
for k in common_keys:
if not eq(result[k], expected[k]):
diff.append('\tkey "{0}" unequal values:\n\t\tExpected: {1}\n\t\tResult : {2}'.format(
k, expected[k], result[k]))
diff.append(
'\tkey "{0}" unequal values:\n\t\tExpected: {1}\n\t\tResult : {2}'.format(
k, expected[k], result[k]
)
)
if not diff:
diff.append('\tUnrecognized unequal values in result layer one;')

@@ -118,7 +125,7 @@ def deep_compare(result, expected):
# This case ensures that when rules return a make_none() response, all of the older
# CI tests that are looking for None instead of make_none() will still pass
if result is None or (isinstance(result, dict) and result.get("type") == "none"):
assert (expected is None or expected == MAKE_NONE_RESULT), result
assert expected is None or expected == MAKE_NONE_RESULT, result
return

if isinstance(result, dict) and expected is None:
@@ -150,12 +157,11 @@ def run_input_data(component, input_data, store_skips=False):
'CloudInstance': ['insights.parsers.subscription_manager.SubscriptionManagerFacts'],
'CloudProvider': ['insights.parsers.rhsm_conf.RHSMConf'],
'OSRelease': ['insights.parsers.dmesg.DmesgLineList'],
'Sap': ['insights.parsers.saphostctrl.SAPHostCtrlInstances']
'Sap': ['insights.parsers.saphostctrl.SAPHostCtrlInstances'],
}


def run_test(component, input_data,
expected=_UNDEFINED, return_make_none=False, do_filter=True):
def run_test(component, input_data, expected=_UNDEFINED, return_make_none=False, do_filter=True):
"""
Arguments:
component: The insights component to test.
@@ -165,6 +171,7 @@ def run_test(component, input_data,
do_filter: Whether to check for dependency spec filter warnings
- checking the filters is not required for sosreport
"""

def get_filtered_specs(module):
filtered = set()
mods = dir(importlib.import_module(module))
@@ -183,7 +190,9 @@ def get_filtered_specs(module):
rps = dr.get_registry_points(component)
filtered = get_filtered_specs(mod)
filterable = set(d for d in rps if dr.get_delegate(d).filterable) - filtered
missing_filters = filterable - ADDED_FILTERS.get(mod, set()) - ADDED_FILTERS.get(sup_mod, set())
missing_filters = (
filterable - ADDED_FILTERS.get(mod, set()) - ADDED_FILTERS.get(sup_mod, set())
)
if missing_filters:
names = [dr.get_name(m) for m in missing_filters]
msg = "%s must add filters to %s"
@@ -204,16 +213,18 @@ def integrate(input_data, component):
return run_test(component, input_data)


def context_wrap(lines,
path="path",
hostname=DEFAULT_HOSTNAME,
release=DEFAULT_RELEASE,
version="-1.-1",
machine_id="machine_id",
strip=True,
split=True,
filtered_spec=None,
**kwargs):
def context_wrap(
lines,
path="path",
hostname=DEFAULT_HOSTNAME,
release=DEFAULT_RELEASE,
version="-1.-1",
machine_id="machine_id",
strip=True,
split=True,
filtered_spec=None,
**kwargs
):
if isinstance(lines, six.string_types):
if strip:
lines = lines.strip()
@@ -223,10 +234,16 @@ def context_wrap(lines,
if filtered_spec is not None and filtered_spec in filters.FILTERS:
lines = [l for l in lines if any([f in l for f in filters.FILTERS[filtered_spec]])]

return Context(content=lines,
path=path, hostname=hostname,
release=release, version=version.split("."),
machine_id=machine_id, relative_path=path, **kwargs)
return Context(
content=lines,
path=path,
hostname=hostname,
release=release,
version=version.split("."),
machine_id=machine_id,
relative_path=path,
**kwargs
)


input_data_cache = {}
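
Iterating a dict yields its keys, so the filtered_spec handling above keeps working unchanged now that FILTERS values are dicts. A quick check (the spec name is hypothetical):

spec_filters = {"ERROR": 10000, "WARN": 100}  # i.e. filters.FILTERS[filtered_spec]
lines = ["ERROR: disk full", "INFO: ok", "WARN: slow"]
kept = [l for l in lines if any(f in l for f in spec_filters)]
assert kept == ["ERROR: disk full", "WARN: slow"]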
@@ -236,10 +253,7 @@ def context_wrap(lines,


def create_metadata(system_id, product):
ctx_metadata = {
"system_id": system_id,
"links": []
}
ctx_metadata = {"system_id": system_id, "links": []}
ctx_metadata["type"] = product.role
ctx_metadata["product"] = product.__class__.__name__
return json.dumps(ctx_metadata)
@@ -265,6 +279,7 @@ class InputData(object):
contain the specified value in the context.path field. This is useful for
testing pattern-like file parsers.
"""

def __init__(self, name=None, hostname=None):
cnt = input_data_cache.get(name, 0)
self.name = "{0}-{1:0>5}".format(name, cnt)
@@ -421,6 +436,7 @@ def archive_provider(component, test_func=deep_compare, stride=1):
[1] insights.tests.deep_compare()
"""

def _wrap(func):
@six.wraps(func)
def __wrap(stride=stride):
@@ -430,4 +446,5 @@ def __wrap(stride=stride):
__wrap.stride = stride
ARCHIVE_GENERATORS.append(__wrap)
return __wrap

return _wrap