Skip to content

Commit

Permalink
refactor(cli): cli deduplication in core (semgrep#9711)
Browse files Browse the repository at this point in the history
## What:
This PR moves the CLI deduplication (based on `cli_unique_key`) to the
Core engine.

## Why:
This makes the core output more in line with the cli output, and
prevents duplication of the logic across pysemgrep and osemgrep.

## How:
The cli unique key is based on the rule metadata, so I had to move that
to the pattern matches.

I also deleted the deduplication logic from the `pysemgrep` side.

## Test plan:
Tests still pass.

Note that I updated a bunch of snapshots. This is because of [this
line](https://github.com/semgrep/semgrep/blob/39f95450a7d4d70e54c9edbd109bed8210a36889/cli/src/semgrep/core_runner.py#L771)
in the pysemgrep core runner, which reorders the fields of the rule JSON
before it enters the core engine.

Previously, when the core engine did not handle metadata, `pysemgrep`
would insert the metadata itself, in a non-sorted order. Now the core
engine reports the sorted metadata, which is correct, since that is the
metadata it actually receives. This changes the field order in the
snapshot data, so the snapshots must be updated to reflect the data the
core engine now returns.
  • Loading branch information
brandonspark authored Mar 6, 2024
1 parent d073c16 commit 752b48d
Show file tree
Hide file tree
Showing 76 changed files with 1,051 additions and 1,018 deletions.
15 changes: 5 additions & 10 deletions cli/src/semgrep/core_output.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,9 +22,8 @@
from semgrep.error import SemgrepCoreError
from semgrep.error import TARGET_PARSE_FAILURE_EXIT_CODE
from semgrep.rule import Rule
from semgrep.rule_match import CliUniqueKey
from semgrep.rule_match import RuleMatch
from semgrep.rule_match import RuleMatchSet
from semgrep.rule_match import RuleMatches
from semgrep.verbose_logging import getLogger

logger = getLogger(__name__)
Expand Down Expand Up @@ -122,16 +121,12 @@ def convert_to_rule_match(match: out.CoreMatch) -> RuleMatch:
fix=fix,
)

by_unique_key: Dict[CliUniqueKey, RuleMatch] = {}
# TODO: Dict[out.RuleId, RuleMatches]
# We used to deduplicate by `cli_unique_key` here, but now no longer need to,
# because it is deduplicated in semgrep-core as core_unique_key!
findings: Dict[Rule, RuleMatches] = {rule: RuleMatches(rule) for rule in rules}
for match in res.results:
rule_match = convert_to_rule_match(match)
curr = by_unique_key.setdefault(rule_match.cli_unique_key, rule_match)
if rule_match.should_report_instead(curr):
by_unique_key[rule_match.cli_unique_key] = rule_match

# TODO: Dict[out.RuleId, RuleMatchSet]
findings: Dict[Rule, RuleMatchSet] = {rule: RuleMatchSet(rule) for rule in rules}
for rule_match in by_unique_key.values():
rule = rule_table[rule_match.rule_id]
findings[rule].add(rule_match)

Expand Down
88 changes: 7 additions & 81 deletions cli/src/semgrep/rule_match.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,6 @@ class RuleMatch:
lines: List[str] = field(init=False, repr=False)
previous_line: str = field(init=False, repr=False)
syntactic_context: str = field(init=False, repr=False)
cli_unique_key: CliUniqueKey = field(init=False, repr=False)
ci_unique_key: Tuple = field(init=False, repr=False)
ordering_key: Tuple = field(init=False, repr=False)
match_based_key: Tuple = field(init=False, repr=False)
Expand Down Expand Up @@ -222,74 +221,12 @@ def get_syntactic_context(self) -> str:
code = code.strip()
return code

@cli_unique_key.default
def get_cli_unique_key(self) -> CliUniqueKey:
"""
A unique key designed with data-completeness & correctness in mind.
Results in more unique findings than ci_unique_key.
Used for deduplication in the CLI before writing output.
"""
return (
# NOTE: We include the previous scan's rules in the config for
# consistent fixed status work. For unique hashing/grouping,
# previous and current scan rules must have distinct check IDs.
# Hence, previous scan rules are annotated with a unique check ID,
# while the original ID is kept in metadata. As check_id is used
# for cli_unique_key, this patch fetches the check ID from metadata
# for previous scan findings.
# TODO: Once the fixed status work is stable, all findings should
# fetch the check ID from metadata. This fallback prevents breaking
# current scan results if an issue arises.
self.annotated_rule_name if self.from_transient_scan else self.rule_id,
str(self.git_blob.value if self.git_blob else self.path),
self.start.offset,
self.end.offset,
self.message,
# TODO: Bring this back.
# This is necessary so we don't deduplicate taint findings which
# have different sources.
#
# self.match.extra.dataflow_trace.to_json_string
# if self.match.extra.dataflow_trace
# else None,
None,
# NOTE: previously, we considered self.match.extra.validation_state
# here, but since in some cases (e.g., with `anywhere`) we generate
# many matches in certain cases, we want to consider secrets
# matches unique under the above set of things, but with a priority
# associated with the validation state; i.e., a match with a
# confirmed valid state should replace all matches equal under the
# above key. We can't do that just by not considering validation
# state since we would pick one arbitrarily, and if we added it
# below then we would report _both_ valid and invalid (but we only
# want to report valid, if a valid one is present and unique per
# above fields). See also `should_report_instead`.
)

def should_report_instead(self, other: "RuleMatch") -> bool:
"""
Returns True iff we should report `self` in lieu of reporting `other`.
This is currently only used for the following items:
- secrets: a valid finding is reported over an invalid one
Assumes that self.cli_unique_key == other.cli_unique_key
"""
if self.validation_state is None:
return False
if other.validation_state is None:
return True
return isinstance(
self.validation_state.value, out.ConfirmedValid
) and not isinstance(other.validation_state.value, out.ConfirmedValid)

@ci_unique_key.default
def get_ci_unique_key(self) -> Tuple:
"""
A unique key designed with notification user experience in mind.
Results in fewer unique findings than cli_unique_key.
Results in fewer unique findings than core_unique_key.
"""
try:
path = self.path.relative_to(Path.cwd())
Expand All @@ -314,7 +251,7 @@ def get_path_changed_ci_unique_key(self, rename_dict: Dict[str, Path]) -> Tuple:
"""
A unique key that accounts for filepath renames.
Results in fewer unique findings than cli_unique_key.
Results in fewer unique findings than core_unique_key.
"""
try:
path = str(self.path.relative_to(Path.cwd()))
Expand Down Expand Up @@ -621,24 +558,13 @@ def from_transient_scan(self) -> bool:
def annotated_rule_name(self) -> str:
return self.metadata.get("semgrep.dev", {}).get("rule", {}).get("rule_name")

def __hash__(self) -> int:
"""
We use the "data-correctness" key to prevent keeping around duplicates.
"""
return hash(self.cli_unique_key)

def __eq__(self, other: object) -> bool:
if not isinstance(other, type(self)):
return False
return self.cli_unique_key == other.cli_unique_key

def __lt__(self, other: "RuleMatch") -> bool:
if not isinstance(other, type(self)):
return NotImplemented
return self.ordering_key < other.ordering_key


class RuleMatchSet(Iterable[RuleMatch]):
class RuleMatches(Iterable[RuleMatch]):
"""
A custom set type which is aware when findings are the same.
Expand All @@ -652,9 +578,9 @@ def __init__(
self._ci_key_counts: CounterType[Tuple] = Counter()
self._rule = rule
if __iterable is None:
self._set = set()
self._store = []
else:
self._set = set(__iterable)
self._store = list(__iterable)

def add(self, match: RuleMatch) -> None:
"""
Expand All @@ -674,7 +600,7 @@ def add(self, match: RuleMatch) -> None:
match,
match_based_index=self._match_based_counts[match.get_match_based_key()] - 1,
)
self._set.add(match)
self._store.append(match)

def update(self, *rule_match_iterables: Iterable[RuleMatch]) -> None:
"""
Expand All @@ -689,7 +615,7 @@ def update(self, *rule_match_iterables: Iterable[RuleMatch]) -> None:
self.add(rule_match)

def __iter__(self) -> Iterator[RuleMatch]:
return iter(self._set)
return iter(self._store)


# Our code orders findings at one point and then just assumes they're in order.
Expand Down
4 changes: 2 additions & 2 deletions cli/src/semgrep/run_scan.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,8 +59,8 @@
from semgrep.output_extra import OutputExtra
from semgrep.profile_manager import ProfileManager
from semgrep.rule import Rule
from semgrep.rule_match import RuleMatches
from semgrep.rule_match import RuleMatchMap
from semgrep.rule_match import RuleMatchSet
from semgrep.semgrep_interfaces.semgrep_metrics import Any_ as AnySecretsOrigin
from semgrep.semgrep_interfaces.semgrep_metrics import CodeConfig
from semgrep.semgrep_interfaces.semgrep_metrics import SecretsConfig
Expand Down Expand Up @@ -234,7 +234,7 @@ def run_rules(
join_rule_matches, join_rule_errors = join_rule.run_join_rule(
rule.raw, [target.path for target in target_manager.targets]
)
join_rule_matches_set = RuleMatchSet(rule)
join_rule_matches_set = RuleMatches(rule)
for m in join_rule_matches:
join_rule_matches_set.add(m)
join_rule_matches_by_rule = {
Expand Down
32 changes: 16 additions & 16 deletions cli/tests/e2e-pro/snapshots/test_ci/test_dryrun/None/results.txt
Original file line number Diff line number Diff line change
Expand Up @@ -124,9 +124,9 @@ Would have sent findings and ignores blob: {
"semgrep.dev": {
"rule": {
"rule_id": "abcf",
"version_id": "version1",
"shortlink": "https://sg.run/abcd",
"url": "https://semgrep.dev/r/python.eqeq-five",
"shortlink": "https://sg.run/abcd"
"version_id": "version1"
},
"src": "new-rule"
}
Expand Down Expand Up @@ -264,10 +264,10 @@ Would have sent findings and ignores blob: {
"semgrep.dev": {
"rule": {
"rule_id": "abce",
"version_id": "version1",
"url": "https://semgrep.dev/r/python.eqeq-five",
"rule_name": "eqeq-four",
"shortlink": "https://sg.run/abcd",
"rule_name": "eqeq-four"
"url": "https://semgrep.dev/r/python.eqeq-five",
"version_id": "version1"
},
"src": "previous-scan"
}
Expand Down Expand Up @@ -302,9 +302,9 @@ Would have sent findings and ignores blob: {
"semgrep.dev": {
"rule": {
"rule_id": "abce",
"version_id": "version2",
"shortlink": "https://sg.run/abcd",
"url": "https://semgrep.dev/r/python.eqeq-five",
"shortlink": "https://sg.run/abcd"
"version_id": "version2"
},
"src": "new-version"
}
Expand Down Expand Up @@ -337,9 +337,9 @@ Would have sent findings and ignores blob: {
"semgrep.dev": {
"rule": {
"rule_id": "abcd",
"version_id": "version1",
"shortlink": "https://sg.run/abcd",
"url": "https://semgrep.dev/r/python.eqeq-five",
"shortlink": "https://sg.run/abcd"
"version_id": "version1"
},
"src": "unchanged"
}
Expand Down Expand Up @@ -524,10 +524,10 @@ Would have sent findings and ignores blob: {
"semgrep.dev": {
"rule": {
"rule_id": "abce",
"version_id": "version1",
"url": "https://semgrep.dev/r/python.eqeq-five",
"rule_name": "eqeq-four",
"shortlink": "https://sg.run/abcd",
"rule_name": "eqeq-four"
"url": "https://semgrep.dev/r/python.eqeq-five",
"version_id": "version1"
},
"src": "previous-scan"
}
Expand Down Expand Up @@ -562,9 +562,9 @@ Would have sent findings and ignores blob: {
"semgrep.dev": {
"rule": {
"rule_id": "abce",
"version_id": "version2",
"shortlink": "https://sg.run/abcd",
"url": "https://semgrep.dev/r/python.eqeq-five",
"shortlink": "https://sg.run/abcd"
"version_id": "version2"
},
"src": "new-version"
}
Expand Down Expand Up @@ -597,9 +597,9 @@ Would have sent findings and ignores blob: {
"semgrep.dev": {
"rule": {
"rule_id": "abcd",
"version_id": "version1",
"shortlink": "https://sg.run/abcd",
"url": "https://semgrep.dev/r/python.eqeq-five",
"shortlink": "https://sg.run/abcd"
"version_id": "version1"
},
"src": "unchanged"
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,9 +20,9 @@
"semgrep.dev": {
"rule": {
"rule_id": "abcf",
"version_id": "version1",
"shortlink": "https://sg.run/abcd",
"url": "https://semgrep.dev/r/python.eqeq-five",
"shortlink": "https://sg.run/abcd"
"version_id": "version1"
},
"src": "new-rule"
}
Expand Down Expand Up @@ -160,10 +160,10 @@
"semgrep.dev": {
"rule": {
"rule_id": "abce",
"version_id": "version1",
"url": "https://semgrep.dev/r/python.eqeq-five",
"rule_name": "eqeq-four",
"shortlink": "https://sg.run/abcd",
"rule_name": "eqeq-four"
"url": "https://semgrep.dev/r/python.eqeq-five",
"version_id": "version1"
},
"src": "previous-scan"
}
Expand Down Expand Up @@ -198,9 +198,9 @@
"semgrep.dev": {
"rule": {
"rule_id": "abce",
"version_id": "version2",
"shortlink": "https://sg.run/abcd",
"url": "https://semgrep.dev/r/python.eqeq-five",
"shortlink": "https://sg.run/abcd"
"version_id": "version2"
},
"src": "new-version"
}
Expand Down Expand Up @@ -233,9 +233,9 @@
"semgrep.dev": {
"rule": {
"rule_id": "abcd",
"version_id": "version1",
"shortlink": "https://sg.run/abcd",
"url": "https://semgrep.dev/r/python.eqeq-five",
"shortlink": "https://sg.run/abcd"
"version_id": "version1"
},
"src": "unchanged"
}
Expand Down Expand Up @@ -423,10 +423,10 @@
"semgrep.dev": {
"rule": {
"rule_id": "abce",
"version_id": "version1",
"url": "https://semgrep.dev/r/python.eqeq-five",
"rule_name": "eqeq-four",
"shortlink": "https://sg.run/abcd",
"rule_name": "eqeq-four"
"url": "https://semgrep.dev/r/python.eqeq-five",
"version_id": "version1"
},
"src": "previous-scan"
}
Expand Down Expand Up @@ -461,9 +461,9 @@
"semgrep.dev": {
"rule": {
"rule_id": "abce",
"version_id": "version2",
"shortlink": "https://sg.run/abcd",
"url": "https://semgrep.dev/r/python.eqeq-five",
"shortlink": "https://sg.run/abcd"
"version_id": "version2"
},
"src": "new-version"
}
Expand Down Expand Up @@ -496,9 +496,9 @@
"semgrep.dev": {
"rule": {
"rule_id": "abcd",
"version_id": "version1",
"shortlink": "https://sg.run/abcd",
"url": "https://semgrep.dev/r/python.eqeq-five",
"shortlink": "https://sg.run/abcd"
"version_id": "version1"
},
"src": "unchanged"
}
Expand Down
Loading

0 comments on commit 752b48d

Please sign in to comment.