Skip to content

Commit

Permalink
feat: support line filter in spec Cleaner (#4299)
Browse files Browse the repository at this point in the history
- RHINENG-14668
  for each filter, keep 10k matched lines by default for now.
  don't remove the "grep -F" as it's faster and can be used
  to reduce the file size before cleaning
- Add test to verify the count of matched lines
  Move all spec relevant tests to tests/specs
  Move all core relevant tests to tests/core

Signed-off-by: Xiangce Liu <[email protected]>
(cherry picked from commit 55dc28d)
(cherry picked from commit c34edd3)
  • Loading branch information
xiangce committed Jan 2, 2025
1 parent 2e957a0 commit 78a64cd
Show file tree
Hide file tree
Showing 29 changed files with 733 additions and 242 deletions.
10 changes: 9 additions & 1 deletion insights/core/filters.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@
``INSIGHTS_FILTERS_ENABLED=False``. This means that no datasources will be
filtered even if filters are defined for them.
"""

import os
import pkgutil
import six
Expand All @@ -49,8 +50,13 @@
_CACHE = {}
FILTERS = defaultdict(set)
ENABLED = parse_bool(os.environ.get("INSIGHTS_FILTERS_ENABLED"), default=True)
MATCH_COUNT = 10000


# TODO:
# - support specifying the max match number of filtered lines
# add_filter(Messages, "Such an Error", 10)
# def add_filter(component, patterns, match_count=MATCH_COUNT):
def add_filter(component, patterns):
"""
Add a filter or list of filters to a component. When the component is
Expand All @@ -66,6 +72,7 @@ def add_filter(component, patterns):
patterns (str, [str]): A string, list of strings, or set of strings to
add to the datasource's filters.
"""

def inner(comp, patterns):
if comp in _CACHE:
del _CACHE[comp]
Expand All @@ -86,7 +93,7 @@ def inner(comp, patterns):
FILTERS[comp] |= patterns

def get_dependency_datasources(comp):
""""Get (all) the first depended datasource"""
"""Get (all) the first depended datasource"""
dep_ds = set()
if plugins.is_datasource(comp):
dep_ds.add(comp)
Expand Down Expand Up @@ -140,6 +147,7 @@ def get_filters(component):
Returns:
set: The set of filters defined for the datasource
"""

def inner(c, filters=None):
filters = filters or set()

Expand Down
89 changes: 60 additions & 29 deletions insights/core/spec_cleaner.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,14 +4,15 @@
The following processes will be applied to clean the collected specs:
- Redaction
This is a must-be-done operation to all the collected specs.
- Redaction
This is a must-be-done operation to all the collected specs.
- Obfuscation
Obfuscate the IP or Hostname appears in the spec content according to the
specs native requirement and user configuration.
- Obfuscation
Obfuscate the IP or Hostname appears in the spec content according to the
specs native requirement and user configuration.
"""

import logging
import hashlib
import json
Expand Down Expand Up @@ -55,8 +56,8 @@ class Cleaner(object):
def __init__(self, config, rm_conf, fqdn=None):
self.report_dir = '/tmp'
self.rhsm_facts_file = getattr(
config, 'rhsm_facts_file',
os.path.join(self.report_dir, 'insights-client.facts'))
config, 'rhsm_facts_file', os.path.join(self.report_dir, 'insights-client.facts')
)
# Obfuscation - set: ip and hostname only
self.obfuscate = set()
self.obfuscate.add('ip') if config and config.obfuscate else None
Expand All @@ -76,7 +77,7 @@ def __init__(self, config, rm_conf, fqdn=None):
# Keyword replacement does NOT depend on "obfuscate=True"
keywords = rm_conf.get('keywords')
self.kw_db = dict() # keyword database
self.kws = set() # keywords that have been replaced
self.kws = set() # keywords that have been replaced
self._keywords2db(keywords)

# Obfuscation
Expand All @@ -103,8 +104,8 @@ def __init__(self, config, rm_conf, fqdn=None):
if config and config.obfuscate_hostname and self.fqdn:
self._domains2db()
hashed_hostname = hashlib.sha1(
self.fqdn.encode('utf-8')
if six.PY3 else self.fqdn).hexdigest()[:12]
self.fqdn.encode('utf-8') if six.PY3 else self.fqdn
).hexdigest()[:12]
self.obfuscated_fqdn = '{0}.example.com'.format(hashed_hostname)
self.hostname_count += 1
self.hn_db[self.obfuscated_fqdn] = self.fqdn
Expand Down Expand Up @@ -137,9 +138,9 @@ def _ip2db(self, ip):
if v == ip_num:
ret_ip = self._int2ip(k)
ip_found = True
if ip_found: # the entry already existed
if ip_found: # the entry already existed
return ret_ip
else: # the entry did not already exist
else: # the entry did not already exist
if len(self.ip_db) > 0:
new_ip = max(db.keys()) + 1
else:
Expand All @@ -151,7 +152,7 @@ def _ip2db(self, ip):
def _sub_ip(self, line):
'''
This will substitute an obfuscated IP for each instance of a given IP in a file
This is called in the self._clean_line function, along with user _sub_* functions to scrub a given
This is called in the self.clean_* function, along with user _sub_* functions to scrub a given
line in a file.
It scans a given line and if an IP exists, it obfuscates the IP using _ip2db and returns the altered line
'''
Expand Down Expand Up @@ -213,7 +214,7 @@ def _sub_ip_netstat(self, line):
if idx == len(line):
break
c = line[idx]
line = line[0:idx] + line[(idx + numspaces):]
line = line[0:idx] + line[(idx + numspaces) :]

else:
line = line.replace(ip, new_ip)
Expand All @@ -232,7 +233,9 @@ def _domains2db(self):
# we will add the root domain for an FQDN as well.
if self.domain is not None:
self.dn_db[self.obfuscated_domain] = self.domain
logger.debug("Obfuscated Domain Created - %s -> %s" % (self.domain, self.obfuscated_domain))
logger.debug(
"Obfuscated Domain Created - %s -> %s" % (self.domain, self.obfuscated_domain)
)

self.domain_count = len(self.dn_db)
return True
Expand All @@ -252,7 +255,8 @@ def _hn2db(self, hn):
if hn_found:
return ret_hn
else:
self.hostname_count += 1 # we have a new hostname, so we increment the counter to get the host ID number
# we have a new hostname, so we increment the counter to get the host ID number
self.hostname_count += 1
o_domain = self.obfuscated_domain
for od, d in self.dn_db.items():
if d in hn: # never false
Expand All @@ -279,7 +283,8 @@ def _sub_hostname(self, line):
logger.debug("Obfuscating FQDN - %s > %s", hn, new_hn)
line = line.replace(hn, new_hn)
if self.hostname:
line = line.replace(self.hostname, self._hn2db(self.fqdn)) # catch any non-fqdn instances of the system hostname
# catch any non-fqdn instances of the system hostname
line = line.replace(self.hostname, self._hn2db(self.fqdn))
return line
except Exception as e: # pragma: no cover
logger.warning(e)
Expand Down Expand Up @@ -347,6 +352,22 @@ def _redact_line(self, line):
# 3. keyword replacement redaction
return self._sub_keywords(line)

def _filter_line_per_allowlist(self, line, allow_info):
# filter line as per the allow list specified by plugins
if not line:
return line
if allow_info:
for a_key in list(allow_info.keys()):
# keep line when any filter match
# FIXME:
# Considering performance, din't handle multiple filters in one same line
if a_key in line:
allow_info[a_key] -= 1
# stop checking it when enough lines contain the key were found
allow_info.pop(a_key) if allow_info[a_key] == 0 else None
return line
# discard line when none filters found

def get_obfuscate_functions(self, filename='', no_obfuscate=None):
"""
Return the list of required obfuscation function according to the
Expand All @@ -359,23 +380,32 @@ def get_obfuscate_functions(self, filename='', no_obfuscate=None):
# Get the actual obfuscate list setting for this file
obfs = set(self.obfuscate) - set(no_obfuscate or [])
# IP obfuscation entry
obf_funcs.append(self._sub_ip_netstat if filename.endswith("netstat_-neopa") else self._sub_ip) if "ip" in obfs else None
(
obf_funcs.append(
self._sub_ip_netstat if filename.endswith("netstat_-neopa") else self._sub_ip
)
if "ip" in obfs
else None
)
# Hostname obfuscation entry
obf_funcs.append(self._sub_hostname) if "hostname" in obfs else None
return obf_funcs

def clean_content(self, lines, obf_funcs=None, no_redact=False):
def clean_content(self, lines, obf_funcs=None, no_redact=False, allowlist=None):
"""
Clean lines one by one according to the configuration, the cleaned
lines will be returned.
"""

def _clean_line(_line):
# 1. Do Redaction by default, unless "no_redact=True"
if _line and not no_redact:
_line = self._redact_line(_line)
# 2. Do Obfuscation as per the "obf_funcs"
_line = self._obfuscate_line(_line, obf_funcs or [])
return _line
# 2. Do filtering as per allowlist got from "filters.yaml"
if _line and allowlist is not None:
_line = self._filter_line_per_allowlist(_line, allowlist)
# 3. Do Obfuscation as per the "obf_funcs"
return self._obfuscate_line(_line, obf_funcs or [])

# handle single string
if not isinstance(lines, list):
Expand All @@ -391,7 +421,7 @@ def _clean_line(_line):
# All lines blank
return []

def clean_file(self, _file, no_obfuscate=None, no_redact=False):
def clean_file(self, _file, no_obfuscate=None, no_redact=False, allowlist=None):
"""
Clean a file according to the configuration, the file will be updated
directly with the cleaned content.
Expand All @@ -405,7 +435,12 @@ def clean_file(self, _file, no_obfuscate=None, no_redact=False):
try:
with open(_file, 'r') as fh:
raw_data = fh.readlines()
content = self.clean_content(raw_data, obf_funcs, no_redact)
content = self.clean_content(
raw_data,
obf_funcs=obf_funcs,
no_redact=no_redact,
allowlist=allowlist,
)
except Exception as e: # pragma: no cover
logger.warning(e)
raise Exception("Error: Cannot Open File for Cleaning: %s" % _file)
Expand Down Expand Up @@ -437,11 +472,7 @@ def generate_rhsm_facts(self):

ip_block = []
for k, v in self.ip_db.items():
ip_block.append(
{
'original': self._int2ip(v),
'obfuscated': self._int2ip(k)
})
ip_block.append({'original': self._int2ip(v), 'obfuscated': self._int2ip(k)})

facts = {
'insights_client.hostname': self.fqdn,
Expand Down
61 changes: 37 additions & 24 deletions insights/core/spec_factory.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,8 @@ def __init__(self):
self.loaded = False
self._content = None
self._exception = None
self._filterable = False
self._filters = set()

def load(self):
raise NotImplementedError()
Expand All @@ -82,27 +84,34 @@ def _clean_content(self):
collection.
"""
content = self.content # load first for debugging info order
if isinstance(self.ctx, HostContext) and self.ds and self.cleaner:
if content and isinstance(self.ctx, HostContext) and self.ds and self.cleaner:
cleans = []
# Redacting?
no_red = getattr(self.ds, 'no_redact', False)
cleans.append("Redact") if not no_red else None
# Obfuscating?
no_obf = getattr(self.ds, 'no_obfuscate', [])
cleans.append("Obfuscate") if len(no_obf) < 2 else None
# Filtering?
allowlist = None
if self._filterable:
cleans.append("Filter")
allowlist = dict((f, filters.MATCH_COUNT) for f in self._filters)
# Cleaning - Entry
if cleans:
log.debug("Cleaning (%s) %s", "/".join(cleans), "/" + self.relative_path)
log.debug("Cleaning (%s) %s", "/".join(cleans), self.relative_path)
content = self.cleaner.clean_content(
content,
content[::-1], # Scan from bottom
allowlist=allowlist,
obf_funcs=self.cleaner.get_obfuscate_functions(self.relative_path, no_obf),
no_redact=no_red,
)
)[::-1]
# ^ Reverse to the right order then
if len(content) == 0:
log.debug("Skipping %s due to empty after cleaning", self.path)
raise ContentException("Empty after cleaning: %s" % self.path)
else:
log.debug("Skipping cleaning %s", "/" + self.relative_path)
log.debug("Skipping cleaning %s", self.relative_path)
return content

@property
Expand Down Expand Up @@ -195,6 +204,12 @@ def __init__(self, relative_path, root="/", save_as=None, ds=None, ctx=None, cle
self.relative_path = relative_path.lstrip("/")
self.save_as = save_as
self.file_name = os.path.basename(self.path)
self._filterable = (
any(s.filterable for s in dr.get_registry_points(self.ds))
if self.ds and filters.ENABLED
else False
)
self._filters = filters.get_filters(self.ds) if self.ds else set()

self.validate()

Expand All @@ -205,12 +220,7 @@ def validate(self):
# 2. Check only when collecting
if isinstance(self.ctx, HostContext):
# 2.1 No Filters for 'filterable=True' Specs
if (
self.ds
and filters.ENABLED
and any(s.filterable for s in dr.get_registry_points(self.ds))
and not filters.get_filters(self.ds)
):
if self._filterable and not self._filters:
raise NoFilterException("Skipping %s due to no filters." % dr.get_name(self.ds))
# 2.2 Customer Prohibits Collection
if not blacklist.allow_file("/" + self.relative_path):
Expand Down Expand Up @@ -298,10 +308,13 @@ class TextFileProvider(FileProvider):
"""

def create_args(self):
"""
The "grep" is faster and can be used to shrink the size of the file.
"""
args = []
_filters = "\n".join(filters.get_filters(self.ds))
if _filters:
args.append(["grep", "-F", _filters, self.path])
if self._filters:
log.debug("Pre-filtering %s", self.relative_path)
args.append(["grep", "-F", "\n".join(self._filters), self.path])

return args

Expand Down Expand Up @@ -390,6 +403,12 @@ def __init__(
self._misc_settings()
self._content = None
self._env = self.create_env()
self._filterable = (
any(s.filterable for s in dr.get_registry_points(self.ds))
if self.ds and filters.ENABLED
else False
)
self._filters = filters.get_filters(self.ds) if self.ds else set()

self.validate()

Expand All @@ -405,12 +424,7 @@ def validate(self):
# 2. Check only when collecting
if isinstance(self.ctx, HostContext):
# 2.1 No Filters for 'filterable=True' Specs
if (
self.ds
and filters.ENABLED
and any(s.filterable for s in dr.get_registry_points(self.ds))
and not filters.get_filters(self.ds)
):
if self._filterable and not self._filters:
raise NoFilterException("Skipping %s due to no filters." % dr.get_name(self.ds))
# 2.2 Customer Prohibits Collection
if not blacklist.allow_command(self.cmd):
Expand All @@ -420,10 +434,9 @@ def validate(self):
def create_args(self):
command = [shlex.split(self.cmd)]

if self.split:
_filters = "\n".join(filters.get_filters(self.ds))
if _filters:
command.append(["grep", "-F", _filters])
if self.split and self._filters:
log.debug("Pre-filtering %s", self.relative_path)
command.append(["grep", "-F", "\n".join(self._filters)])

return command

Expand Down
File renamed without changes.
Loading

0 comments on commit 78a64cd

Please sign in to comment.