feat: enhance add_filter to specify the max matched lines
- added a new keyword argument "max_match=MAX_MATCH" to `add_filter`.
  When `max_match` is declared, at most `max_match` lines that contain
  the patterns will be filtered out in the collection.  For redundant
  declarations, the largest `max_match` is kept as the final scanning
  number.  If no `max_match` is declared in the `add_filter` call,
  `filters.MAX_MATCH` (=10000) is taken as the default value (see the
  usage sketch after this list).
- added a new keyword argument "with_matches=False" to `get_filters`.
  When "with_matches=True" is specified, `get_filters` returns a dict
  in which the max scanning number for each filter pattern is included
  as the dict value.
- updated the existing tests
- RHINENG-14669
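
A minimal usage sketch of the new arguments, assuming the behavior described in the bullets above; `Specs.messages` is used only as an illustrative filterable spec:

from insights.core import filters
from insights.specs import Specs

# Keep at most 10 matched lines for "ERROR"; redundant declarations keep the larger count.
filters.add_filter(Specs.messages, "ERROR", max_match=10)
# Without max_match, filters.MAX_MATCH (=10000) is used as the default.
filters.add_filter(Specs.messages, ["WARN", "FATAL"])

# Backward compatible: a set of pattern strings.
print(filters.get_filters(Specs.messages))
# With the counts: a dict of pattern -> max match count.
print(filters.get_filters(Specs.messages, with_matches=True))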

Signed-off-by: Xiangce Liu <[email protected]>
xiangce committed Dec 27, 2024
1 parent d5c77bb commit 864120a
Showing 27 changed files with 448 additions and 220 deletions.
70 changes: 41 additions & 29 deletions insights/core/filters.py
@@ -48,16 +48,12 @@
from insights.util import parse_bool

_CACHE = {}
FILTERS = defaultdict(set)
FILTERS = defaultdict(dict)
ENABLED = parse_bool(os.environ.get("INSIGHTS_FILTERS_ENABLED"), default=True)
MATCH_COUNT = 10000
MAX_MATCH = 10000


# TODO:
# - support specifying the max match number of filtered lines
# add_filter(Messages, "Such an Error", 10)
# def add_filter(component, patterns, match_count=MATCH_COUNT):
def add_filter(component, patterns):
def add_filter(component, patterns, max_match=MAX_MATCH):
"""
Add a filter or list of filters to a component. When the component is
a datasource, the filter will be directly added to that datasource.
@@ -71,8 +67,26 @@ def add_filter(component, patterns):
parser or combiner.
patterns (str, [str]): A string, list of strings, or set of strings to
add to the datasource's filters.
max_match (int): The maximum number of matched lines to filter out.
MAX_MATCH by default.
"""

def get_dependency_datasources(comp):
"""Get (all) the first depended datasource"""
dep_ds = set()
if plugins.is_datasource(comp):
dep_ds.add(comp)
return dep_ds
for dep in dr.get_dependencies(comp):
dep_ds.update(get_dependency_datasources(dep))
return dep_ds

def none_max(a, b):
return a if b is None else b if a is None else max(a, b)

def max_matchs(da, db):
return dict((k, none_max(da.get(k), db.get(k))) for k in set(da.keys()).union(db.keys()))

def inner(comp, patterns):
if comp in _CACHE:
del _CACHE[comp]
@@ -82,25 +96,16 @@ def inner(comp, patterns):
raise TypeError("Filter patterns must be of type string, list, or set.")

if isinstance(patterns, six.string_types):
patterns = set([patterns])
patterns = {patterns: max_match}
elif isinstance(patterns, list):
patterns = set(patterns)
patterns = dict((pt, max_match) for pt in patterns)
# here patterns is a dict

for pat in patterns:
if not pat:
raise Exception("Filter patterns must not be empty.")

FILTERS[comp] |= patterns

def get_dependency_datasources(comp):
"""Get (all) the first depended datasource"""
dep_ds = set()
if plugins.is_datasource(comp):
dep_ds.add(comp)
return dep_ds
for dep in dr.get_dependencies(comp):
dep_ds.update(get_dependency_datasources(dep))
return dep_ds
FILTERS[comp].update(max_matchs(FILTERS[comp], patterns))

if not plugins.is_datasource(component):
deps = get_dependency_datasources(component)
@@ -127,7 +132,7 @@ def get_dependency_datasources(comp):
_add_filter = add_filter


def get_filters(component):
def get_filters(component, with_matches=False):
"""
Get the set of filters for the given datasource.
@@ -143,13 +148,19 @@
Args:
component (a datasource): The target datasource
with_matches (bool): Whether to also return the max match counts.
False by default.
Returns:
set: The set of filters defined for the datasource
(set or dict): When `with_matches=False`, returns only the set of
filters defined for the datasource.
When `with_matches=True`, returns a dict that maps each filter
defined for the datasource to the max match count specified by
`add_filter`.
"""

def inner(c, filters=None):
filters = filters or set()
filters = filters or dict()

if hasattr(c, 'filterable') and c.filterable is False:
return filters
@@ -161,20 +172,21 @@ def inner(c, filters=None):
return filters

if c in FILTERS:
filters |= FILTERS[c]
filters.update(FILTERS[c])

for d in dr.get_dependents(c):
filters |= inner(d, filters)
filters.update(inner(d, filters))

return filters

if not component:
# No filters for nothing
return set()
return dict() if with_matches else set()

if component not in _CACHE:
_CACHE[component] = inner(component)

return _CACHE[component]
return _CACHE[component] if with_matches else set(_CACHE[component].keys())


def apply_filters(target, lines):
@@ -202,7 +214,7 @@ def loads(string):
"""Loads the filters dictionary given a string."""
d = _loads(string)
for k, v in d.items():
FILTERS[dr.get_component(k) or k] = set(v)
FILTERS[dr.get_component(k) or k] = v


def load(stream=None):
@@ -222,7 +234,7 @@ def dumps():
"""Returns a string representation of the sorted FILTERS dictionary."""
d = {}
for k, v in FILTERS.items():
d[dr.get_name(k)] = sorted(v)
d[dr.get_name(k)] = dict(sorted(v.items()))
return _dumps(d)


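To illustrate the merge semantics of the filters.py change above, here is a standalone copy of the `none_max`/`max_matchs` helpers with a made-up input, showing that the larger max count wins for duplicate patterns:

def none_max(a, b):
    # Treat a missing count as "no limit recorded yet" and keep the other one.
    return a if b is None else b if a is None else max(a, b)

def max_matchs(da, db):
    # Merge two pattern -> max_match dicts, keeping the larger count per pattern.
    return dict((k, none_max(da.get(k), db.get(k))) for k in set(da).union(db))

print(max_matchs({"ERROR": 10, "WARN": 20}, {"ERROR": 10000}))
# -> {'ERROR': 10000, 'WARN': 20}: the larger max_match is kept for "ERROR".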
6 changes: 4 additions & 2 deletions insights/core/spec_cleaner.py
@@ -11,6 +11,8 @@
Obfuscate the IP or Hostname that appears in the spec content according to
the spec's native requirements and user configuration.
- Filtering
Filter lines according to the allow list obtained from the "filters.yaml"
"""

import logging
@@ -360,13 +362,13 @@ def _filter_line_per_allowlist(self, line, allow_info):
for a_key in list(allow_info.keys()):
# keep the line when any filter matches
# FIXME:
# Considering performance, din't handle multiple filters in one same line
# For performance reasons, multiple filters in the same line are not handled
if a_key in line:
allow_info[a_key] -= 1
# stop checking this key once enough lines containing it have been found
allow_info.pop(a_key) if allow_info[a_key] == 0 else None
return line
# discard the line when all filters are enough matched
# discard the line when no filter matches

def get_obfuscate_functions(self, filename='', no_obfuscate=None):
"""
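A simplified standalone sketch of the allow-list counting used by `_filter_line_per_allowlist` above, assuming `allow_info` maps pattern -> remaining number of lines to keep; the driver loop and names here are illustrative, not the cleaner's actual API:

def filter_line(line, allow_info):
    for key in list(allow_info):
        if key in line:
            # Keep the line and count this pattern down.
            allow_info[key] -= 1
            if allow_info[key] == 0:
                # Enough lines containing this pattern were kept; stop checking it.
                del allow_info[key]
            return line
    # No pattern matched: return None so the line is discarded.

allow_info = {"ERROR": 2}
lines = ["ERROR one", "ok", "ERROR two", "ERROR three"]
print([l for l in lines if filter_line(l, allow_info)])
# -> ['ERROR one', 'ERROR two']; the third ERROR line exceeds its max_match.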
15 changes: 4 additions & 11 deletions insights/core/spec_factory.py
@@ -62,7 +62,7 @@ def __init__(self):
self._content = None
self._exception = None
self._filterable = False
self._filters = set()
self._filters = dict()

def load(self):
raise NotImplementedError()
@@ -96,11 +96,10 @@ def _clean_content(self):
allowlist = None
if self._filterable:
cleans.append("Filter")
allowlist = dict((f, filters.MATCH_COUNT) for f in self._filters)
allowlist = self._filters
# Cleaning - Entry
if cleans:
log.debug("Cleaning (%s) %s", "/".join(cleans), self.relative_path)
allowlist = None
content = self.cleaner.clean_content(
content[::-1], # Scan from bottom
allowlist=allowlist,
@@ -210,7 +209,7 @@ def __init__(self, relative_path, root="/", save_as=None, ds=None, ctx=None, cle
if self.ds and filters.ENABLED
else False
)
self._filters = filters.get_filters(self.ds) if self.ds else set()
self._filters = filters.get_filters(self.ds, True) if self.ds else set()

self.validate()

@@ -314,10 +313,7 @@ def create_args(self):
"""
args = []
if self._filters:
log.debug("Pre-filtering %s", self.relative_path)
args.append(["grep", "-F", "\n".join(self._filters), self.path])

return args
@@ -412,7 +408,7 @@ def __init__(
if self.ds and filters.ENABLED
else False
)
self._filters = filters.get_filters(self.ds) if self.ds else set()
self._filters = filters.get_filters(self.ds, True) if self.ds else set()

self.validate()

@@ -439,10 +435,7 @@ def create_args(self):
command = [shlex.split(self.cmd)]

if self.split and self._filters:
log.debug("Pre-filtering %s", self.relative_path)
command.append(["grep", "-F", "\n".join(self._filters)])

return command
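A small sketch of what the pre-filter command looks like now that `self._filters` is the dict returned by `get_filters(self.ds, True)`; joining a dict iterates its keys, so only the pattern strings reach `grep -F` (the path and patterns below are illustrative):

_filters = {"ERROR": 10, "WARN": 10000}  # pattern -> max_match
args = ["grep", "-F", "\n".join(_filters), "/var/log/messages"]
print(args[2])  # "ERROR" and "WARN", one fixed-string pattern per line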
