Skip to content

Commit

Permalink
Add rule tree configuration validation (#736)
Browse files Browse the repository at this point in the history
* refactor: decouple rule tree configuration and refactor initialization

- Extracted RuleTree configuration into a dedicated `Config` class for improved modularity.
- Updated unit tests and references to reflect the refactoring changes in RuleTree initialization.
- Add documentation to rst docs


---------

Co-authored-by: ekneg54 <[email protected]>
  • Loading branch information
dtrai2 and ekneg54 authored Jan 3, 2025
1 parent 33f6cb2 commit c0004a5
Show file tree
Hide file tree
Showing 9 changed files with 144 additions and 125 deletions.
39 changes: 0 additions & 39 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -91,45 +91,6 @@ and secondly they specify how to process the message.
For example which fields should be deleted or to which IP-address the geolocation should be
retrieved.

For performance reasons on startup all rules per processor are aggregated to a rule tree.
Instead of evaluating all rules independently for each log message the message is checked against
the rule tree.
Each node in the rule tree represents a condition that has to be meet,
while the leafs represent changes that the processor should apply.
If no condition is met, the processor will just pass the log event to the next processor.

The following chart gives an example of such a rule tree:

```mermaid
flowchart TD
A[root]
A-->B[Condition 1]
A-->C[Condition 2]
A-->D[Condition 3]
B-->E[Condition 4]
B-->H(Rule 1)
C-->I(Rule 2)
D-->J(rule 3)
E-->G(Rule 4)
```

To further improve the performance, it is possible to prioritize specific nodes of the rule tree,
such that broader conditions are higher up in the tree.
And specific conditions can be moved further down.
Following json gives an example of such a rule tree configuration.
This configuration will lead to the prioritization of `tags` and `message` in the rule tree.

```json
{
"priority_dict": {
"category": "01",
"message": "02"
},
"tag_map": {
"check_field_name": "check-tag"
}
}
```

### Connectors

Expand Down
1 change: 1 addition & 0 deletions doc/source/configuration/rules.rst
Original file line number Diff line number Diff line change
Expand Up @@ -7,3 +7,4 @@ Rules

.. automodule:: logprep.processor.base.rule
.. automodule:: logprep.filter.lucene_filter
.. automodule:: logprep.framework.rule_tree.rule_tree
4 changes: 2 additions & 2 deletions logprep/abc/processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,7 @@ class Config(Component.Config):
tree_config: Optional[str] = field(
default=None, validator=[validators.optional(validators.instance_of(str))]
)
"""Path to a JSON file with a valid rule tree configuration.
"""Path to a JSON file with a valid :ref:`Rule Tree Configuration`.
For string format see :ref:`getters`."""
apply_multiple_times: Optional[bool] = field(
default=False, validator=[validators.optional(validators.instance_of(bool))]
Expand All @@ -123,7 +123,7 @@ class Config(Component.Config):

def __init__(self, name: str, configuration: "Processor.Config"):
super().__init__(name, configuration)
self._rule_tree = RuleTree(processor_name=self.name, processor_config=self._config)
self._rule_tree = RuleTree(config=self._config.tree_config)
self.load_rules(rules_targets=self._config.rules)
self.result = None
self._bypass_rule_tree = False
Expand Down
145 changes: 95 additions & 50 deletions logprep/framework/rule_tree/rule_tree.py
Original file line number Diff line number Diff line change
@@ -1,94 +1,140 @@
"""This module contains the rule tree functionality."""
"""
.. _Rule Tree:
from enum import Enum
from logging import Logger
from typing import TYPE_CHECKING, List, Optional, Union
RuleTree
========
For performance reasons on startup, all rules per processor are aggregated to a rule tree.
Instead of evaluating all rules independently for each log message, the message is checked against
the rule tree.
Each node in the rule tree represents a condition that has to be met,
while the leaves represent changes that the processor should apply.
If no condition is met, the processor will just pass the log event to the next processor.
.. _Rule Tree Configuration:
Rule Tree Configuration
^^^^^^^^^^^^^^^^^^^^^^^
To further improve the performance, it is possible to prioritize specific nodes of the rule tree,
such that broader conditions are higher up in the tree.
And specific conditions can be moved further down.
The following json gives an example of such a rule tree configuration.
This configuration will lead to the prioritization of `category` and `message` in the rule tree.
.. code-block:: json
:linenos:
{
"priority_dict": {
"category": "01",
"message": "02"
},
"tag_map": {
"check_field_name": "check-tag"
}
}
A path to a rule tree configuration can be set in any processor configuration under the key
:code:`tree_config`.
"""

from logging import getLogger
from typing import TYPE_CHECKING, List, Optional

from attrs import define, field, validators

from logprep.framework.rule_tree.node import Node
from logprep.framework.rule_tree.rule_parser import RuleParser
from logprep.util import getter

if TYPE_CHECKING: # pragma: no cover
from logprep.abc.processor import Processor
from logprep.processor.base.rule import Rule

logger = getLogger("RuleTree")


class RuleTree:
"""Represent a set of rules using a rule tree model."""

@define(kw_only=True, slots=False)
class Config:
"""Configuration for the RuleTree"""

priority_dict: dict[str] = field(
validator=[
validators.instance_of(dict),
validators.deep_mapping(
key_validator=validators.instance_of(str),
value_validator=validators.instance_of(str),
),
],
default=dict(),
)
"""Fields used in filter expressions can be prioritized by specifying
them in this priority dict. The key describes the field name used in
the filter expression and the value describes the priority in form
of string number. The higher the number, the higher the priority."""

tag_map: dict = field(
validator=[
validators.instance_of(dict),
validators.deep_mapping(
key_validator=validators.instance_of(str),
value_validator=validators.instance_of(str),
),
],
default=dict(),
)
"""tbd"""

__slots__ = (
"rule_parser",
"priority_dict",
"_rule_mapping",
"_processor_config",
"_processor_type",
"_processor_name",
"tree_config",
"_root",
"__dict__",
)

tree_config: Config

rule_parser: Optional[RuleParser]
priority_dict: dict
_rule_mapping: dict
_processor_name: str
_processor_config: "Processor.Config"
_processor_type: str
_root: Node

def __init__(
self,
root: Node = None,
processor_name: str = None,
processor_config: "Processor.Config" = None,
config: str = None,
):
"""Rule tree initialization function.
Initializes a new rule tree with a given root node and a path to the tree's optional config
file. If no root node is specified, a new node will be created and used as root node.
Also starts the further setup.
Parameters
----------
root: Node, optional
Node that should be used as the new rule tree's root node.
processor_config: Processor.Config, optional
Configuration of the processor that uses the rule tree.
config: str, optional
Path to a tree configuration.
"""
self.rule_parser = None
self._rule_mapping = {}
self._processor_config = processor_config
self._processor_name = processor_name if processor_name is not None else ""
self._processor_type = processor_config.type if processor_name is not None else ""
self._setup()
self.tree_config = RuleTree.Config()
if config:
config_data = getter.GetterFactory.from_string(config).get_json()
self.tree_config = RuleTree.Config(**config_data)
self.rule_parser = RuleParser(self.tree_config.tag_map)

self._root = Node(None)
if root:
self._root = root
else:
self._root = Node(None)

@property
def number_of_rules(self) -> int:
return len(self._rule_mapping)

def _setup(self):
"""Basic setup of rule tree.
Initiate the rule tree's priority dict, tag map and load the configuration from file.
"""
self.priority_dict = {}
tag_map = {}

if self._processor_config and self._processor_config.tree_config:
config_data = getter.GetterFactory.from_string(
self._processor_config.tree_config
).get_json()
self.priority_dict = config_data["priority_dict"]
tag_map = config_data["tag_map"]
self.rule_parser = RuleParser(tag_map)

def add_rule(self, rule: "Rule", logger: Logger = None):
def add_rule(self, rule: "Rule"):
"""Add rule to rule tree.
Add a new rule to the rule tree.
Expand All @@ -102,16 +148,15 @@ def add_rule(self, rule: "Rule", logger: Logger = None):
----------
rule: Rule
Rule to be added to the rule tree.
logger: Logger
Logger to use for logging.
"""
try:
parsed_rule = self.rule_parser.parse_rule(rule, self.priority_dict)
parsed_rule = self.rule_parser.parse_rule(rule, self.tree_config.priority_dict)
except Exception as error: # pylint: disable=broad-except
logger.warning(
f'Error parsing rule "{rule.file_name}.yml": {type(error).__name__}: {error}. '
f"Ignore and continue with next rule."
'Error parsing rule "%s.yml": %s: %s. Ignore and continue with next rule.',
rule.file_name,
type(error).__name__,
error,
)
return
for rule_segment in parsed_rule:
Expand Down
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -120,6 +120,7 @@ doc = [
"sphinx",
"sphinx_rtd_theme",
"sphinxcontrib.datatemplates",
"sphinxcontrib-mermaid",
"sphinx-copybutton",
"nbsphinx",
"ipython",
Expand Down
63 changes: 39 additions & 24 deletions tests/unit/framework/rule_tree/test_rule_tree.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,18 @@
# pylint: disable=protected-access
# pylint: disable=missing-docstring
# pylint: disable=line-too-long
import json
from copy import deepcopy
from unittest import mock

import pytest

from logprep.factory import Factory
from logprep.filter.expression.filter_expression import Exists, StringFilterExpression
from logprep.framework.rule_tree.node import Node
from logprep.framework.rule_tree.rule_parser import RuleParser
from logprep.framework.rule_tree.rule_tree import RuleTree
from logprep.processor.pre_detector.rule import PreDetectorRule
from logprep.util import getter


@pytest.fixture(name="rule_dict")
Expand All @@ -33,29 +34,42 @@ def rule_dict_fixture():
class TestRuleTree:
def test_init_without_specifying_parameters(self):
rule_tree = RuleTree()

assert isinstance(rule_tree.root, Node)
assert not rule_tree.rule_parser._rule_tagger._tag_map
assert not rule_tree.priority_dict
assert not rule_tree.tree_config.priority_dict
assert rule_tree.root.expression is None

def test_init_with_specifying_config(self):
processor = Factory.create(
{
"processor": {
"type": "dissector",
"rules": [],
"tree_config": "tests/testdata/unit/tree_config.json",
}
}
)
rule_tree = RuleTree(processor_config=processor._config)

rule_tree = RuleTree(config="tests/testdata/unit/tree_config.json")
assert isinstance(rule_tree.root, Node)
assert rule_tree.rule_parser._rule_tagger._tag_map == {
"field_name_to_check_for_in_rule": "TAG-TO-CHECK-IF-IN-EVENT"
}
assert rule_tree.priority_dict == {"field_name": "priority"}
assert rule_tree.tree_config.priority_dict == {"field_name": "priority"}

@pytest.mark.parametrize(
"tree_config",
[
{
"priority_dict": {
"winlog": 1, # should be string
},
"tag_map": {"field_name_to_check_for_in_rule": "TAG-TO-CHECK-IF-IN-EVENT"},
},
{
"priority_dict": {
"winlog": "1",
},
"tag_map": {"field_name_to_check_for_in_rule": 1}, # should be string
},
],
)
def test_init_with_invalid_tree_config(self, tmp_path, tree_config):
tree_config_path = tmp_path / "tree_config.json"
tree_config_path.write_text(json.dumps(tree_config))
config_data = getter.GetterFactory.from_string(str(tree_config_path)).get_json()
with pytest.raises(TypeError, match=r"must be \<class \'str\'\>"):
self.tree_config = RuleTree.Config(**config_data)

def test_add_rule(self, rule_dict):
rule_tree = RuleTree()
Expand All @@ -80,21 +94,22 @@ def test_add_rule(self, rule_dict):
rule
]

def test_add_rule_fails(self, rule_dict):
@mock.patch("logging.Logger.warning")
def test_add_rule_fails(self, mock_warning, rule_dict):
rule_tree = RuleTree()
rule = PreDetectorRule._create_from_dict(rule_dict)

mocked_logger = mock.MagicMock()
error = Exception("mocked error")
with mock.patch(
"logprep.framework.rule_tree.rule_parser.RuleParser.parse_rule",
side_effect=Exception("mocked error"),
side_effect=error,
):
rule_tree.add_rule(rule, logger=mocked_logger)
expected_call = mock.call.warning(
'Error parsing rule "None.yml": Exception: mocked error. '
"Ignore and continue with next rule."
rule_tree.add_rule(rule)
mock_warning.assert_called_with(
'Error parsing rule "%s.yml": %s: %s. Ignore and continue with next rule.',
None,
type(error).__name__,
error,
)
assert expected_call in mocked_logger.mock_calls

def test_get_rule_id(self, rule_dict):
rule_tree = RuleTree()
Expand Down
Loading

0 comments on commit c0004a5

Please sign in to comment.