Skip to content

Commit

Permalink
refactor(cli): cli deduplication in core (semgrep#9711)
Browse files Browse the repository at this point in the history
## What:
This PR moves the CLI deduplication (based on `cli_unique_key`) to the
Core engine.

## Why:
This makes the core output more in line with the cli output, and
prevents duplication of the logic across pysemgrep and osemgrep.

## How:
The cli unique key is based on the rule metadata, so I had to move that
to the pattern matches.

I also deleted the deduplication logic from the `pysemgrep` side.

## Test plan:
Tests still pass.

Note that I updated a bunch of snapshots. This is because of [this
line](https://github.com/semgrep/semgrep/blob/39f95450a7d4d70e54c9edbd109bed8210a36889/cli/src/semgrep/core_runner.py#L771)
in the pysemgrep core runner, which reorders the fields of the rule JSON
before it enters the core engine.

Previously, when the core engine did not handle metadata, `pysemgrep`
would insert the metadata itself, in a non-sorted order. Now the core
engine reports the sorted metadata, which is correct, since that is the
metadata it actually receives. This changes the field order in the
snapshot data, so the snapshots must be updated to reflect the data the
core engine now returns.
  • Loading branch information
brandonspark authored Mar 6, 2024
1 parent d073c16 commit 752b48d
Show file tree
Hide file tree
Showing 76 changed files with 1,051 additions and 1,018 deletions.
15 changes: 5 additions & 10 deletions cli/src/semgrep/core_output.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,9 +22,8 @@
from semgrep.error import SemgrepCoreError
from semgrep.error import TARGET_PARSE_FAILURE_EXIT_CODE
from semgrep.rule import Rule
from semgrep.rule_match import CliUniqueKey
from semgrep.rule_match import RuleMatch
from semgrep.rule_match import RuleMatchSet
from semgrep.rule_match import RuleMatches
from semgrep.verbose_logging import getLogger

logger = getLogger(__name__)
Expand Down Expand Up @@ -122,16 +121,12 @@ def convert_to_rule_match(match: out.CoreMatch) -> RuleMatch:
fix=fix,
)

by_unique_key: Dict[CliUniqueKey, RuleMatch] = {}
# TODO: Dict[out.RuleId, RuleMatches]
# We used to deduplicate by `cli_unique_key` here, but now no longer need to,
# because it is deduplicated in semgrep-core as core_unique_key!
findings: Dict[Rule, RuleMatches] = {rule: RuleMatches(rule) for rule in rules}
for match in res.results:
rule_match = convert_to_rule_match(match)
curr = by_unique_key.setdefault(rule_match.cli_unique_key, rule_match)
if rule_match.should_report_instead(curr):
by_unique_key[rule_match.cli_unique_key] = rule_match

# TODO: Dict[out.RuleId, RuleMatchSet]
findings: Dict[Rule, RuleMatchSet] = {rule: RuleMatchSet(rule) for rule in rules}
for rule_match in by_unique_key.values():
rule = rule_table[rule_match.rule_id]
findings[rule].add(rule_match)

Expand Down
88 changes: 7 additions & 81 deletions cli/src/semgrep/rule_match.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,6 @@ class RuleMatch:
lines: List[str] = field(init=False, repr=False)
previous_line: str = field(init=False, repr=False)
syntactic_context: str = field(init=False, repr=False)
cli_unique_key: CliUniqueKey = field(init=False, repr=False)
ci_unique_key: Tuple = field(init=False, repr=False)
ordering_key: Tuple = field(init=False, repr=False)
match_based_key: Tuple = field(init=False, repr=False)
Expand Down Expand Up @@ -222,74 +221,12 @@ def get_syntactic_context(self) -> str:
code = code.strip()
return code

@cli_unique_key.default
def get_cli_unique_key(self) -> CliUniqueKey:
"""
A unique key designed with data-completeness & correctness in mind.
Results in more unique findings than ci_unique_key.
Used for deduplication in the CLI before writing output.
"""
return (
# NOTE: We include the previous scan's rules in the config for
# consistent fixed status work. For unique hashing/grouping,
# previous and current scan rules must have distinct check IDs.
# Hence, previous scan rules are annotated with a unique check ID,
# while the original ID is kept in metadata. As check_id is used
# for cli_unique_key, this patch fetches the check ID from metadata
# for previous scan findings.
# TODO: Once the fixed status work is stable, all findings should
# fetch the check ID from metadata. This fallback prevents breaking
# current scan results if an issue arises.
self.annotated_rule_name if self.from_transient_scan else self.rule_id,
str(self.git_blob.value if self.git_blob else self.path),
self.start.offset,
self.end.offset,
self.message,
# TODO: Bring this back.
# This is necessary so we don't deduplicate taint findings which
# have different sources.
#
# self.match.extra.dataflow_trace.to_json_string
# if self.match.extra.dataflow_trace
# else None,
None,
# NOTE: previously, we considered self.match.extra.validation_state
# here, but since in some cases (e.g., with `anywhere`) we generate
# many matches in certain cases, we want to consider secrets
# matches unique under the above set of things, but with a priority
# associated with the validation state; i.e., a match with a
# confirmed valid state should replace all matches equal under the
# above key. We can't do that just by not considering validation
# state since we would pick one arbitrarily, and if we added it
# below then we would report _both_ valid and invalid (but we only
# want to report valid, if a valid one is present and unique per
# above fields). See also `should_report_instead`.
)

def should_report_instead(self, other: "RuleMatch") -> bool:
"""
Returns True iff we should report `self` in lieu of reporting `other`.
This is currently only used for the following items:
- secrets: a valid finding is reported over an invalid one
Assumes that self.cli_unique_key == other.cli_unique_key
"""
if self.validation_state is None:
return False
if other.validation_state is None:
return True
return isinstance(
self.validation_state.value, out.ConfirmedValid
) and not isinstance(other.validation_state.value, out.ConfirmedValid)

@ci_unique_key.default
def get_ci_unique_key(self) -> Tuple:
"""
A unique key designed with notification user experience in mind.
Results in fewer unique findings than cli_unique_key.
Results in fewer unique findings than core_unique_key.
"""
try:
path = self.path.relative_to(Path.cwd())
Expand All @@ -314,7 +251,7 @@ def get_path_changed_ci_unique_key(self, rename_dict: Dict[str, Path]) -> Tuple:
"""
A unique key that accounts for filepath renames.
Results in fewer unique findings than cli_unique_key.
Results in fewer unique findings than core_unique_key.
"""
try:
path = str(self.path.relative_to(Path.cwd()))
Expand Down Expand Up @@ -621,24 +558,13 @@ def from_transient_scan(self) -> bool:
def annotated_rule_name(self) -> str:
return self.metadata.get("semgrep.dev", {}).get("rule", {}).get("rule_name")

def __hash__(self) -> int:
"""
We use the "data-correctness" key to prevent keeping around duplicates.
"""
return hash(self.cli_unique_key)

def __eq__(self, other: object) -> bool:
if not isinstance(other, type(self)):
return False
return self.cli_unique_key == other.cli_unique_key

def __lt__(self, other: "RuleMatch") -> bool:
if not isinstance(other, type(self)):
return NotImplemented
return self.ordering_key < other.ordering_key


class RuleMatchSet(Iterable[RuleMatch]):
class RuleMatches(Iterable[RuleMatch]):
"""
A custom set type which is aware when findings are the same.
Expand All @@ -652,9 +578,9 @@ def __init__(
self._ci_key_counts: CounterType[Tuple] = Counter()
self._rule = rule
if __iterable is None:
self._set = set()
self._store = []
else:
self._set = set(__iterable)
self._store = list(__iterable)

def add(self, match: RuleMatch) -> None:
"""
Expand All @@ -674,7 +600,7 @@ def add(self, match: RuleMatch) -> None:
match,
match_based_index=self._match_based_counts[match.get_match_based_key()] - 1,
)
self._set.add(match)
self._store.append(match)

def update(self, *rule_match_iterables: Iterable[RuleMatch]) -> None:
"""
Expand All @@ -689,7 +615,7 @@ def update(self, *rule_match_iterables: Iterable[RuleMatch]) -> None:
self.add(rule_match)

def __iter__(self) -> Iterator[RuleMatch]:
return iter(self._set)
return iter(self._store)


# Our code orders findings at one point and then just assumes they're in order.
Expand Down
4 changes: 2 additions & 2 deletions cli/src/semgrep/run_scan.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,8 +59,8 @@
from semgrep.output_extra import OutputExtra
from semgrep.profile_manager import ProfileManager
from semgrep.rule import Rule
from semgrep.rule_match import RuleMatches
from semgrep.rule_match import RuleMatchMap
from semgrep.rule_match import RuleMatchSet
from semgrep.semgrep_interfaces.semgrep_metrics import Any_ as AnySecretsOrigin
from semgrep.semgrep_interfaces.semgrep_metrics import CodeConfig
from semgrep.semgrep_interfaces.semgrep_metrics import SecretsConfig
Expand Down Expand Up @@ -234,7 +234,7 @@ def run_rules(
join_rule_matches, join_rule_errors = join_rule.run_join_rule(
rule.raw, [target.path for target in target_manager.targets]
)
join_rule_matches_set = RuleMatchSet(rule)
join_rule_matches_set = RuleMatches(rule)
for m in join_rule_matches:
join_rule_matches_set.add(m)
join_rule_matches_by_rule = {
Expand Down
32 changes: 16 additions & 16 deletions cli/tests/e2e-pro/snapshots/test_ci/test_dryrun/None/results.txt
Original file line number Diff line number Diff line change
Expand Up @@ -124,9 +124,9 @@ Would have sent findings and ignores blob: {
"semgrep.dev": {
"rule": {
"rule_id": "abcf",
"version_id": "version1",
"shortlink": "https://sg.run/abcd",
"url": "https://semgrep.dev/r/python.eqeq-five",
"shortlink": "https://sg.run/abcd"
"version_id": "version1"
},
"src": "new-rule"
}
Expand Down Expand Up @@ -264,10 +264,10 @@ Would have sent findings and ignores blob: {
"semgrep.dev": {
"rule": {
"rule_id": "abce",
"version_id": "version1",
"url": "https://semgrep.dev/r/python.eqeq-five",
"rule_name": "eqeq-four",
"shortlink": "https://sg.run/abcd",
"rule_name": "eqeq-four"
"url": "https://semgrep.dev/r/python.eqeq-five",
"version_id": "version1"
},
"src": "previous-scan"
}
Expand Down Expand Up @@ -302,9 +302,9 @@ Would have sent findings and ignores blob: {
"semgrep.dev": {
"rule": {
"rule_id": "abce",
"version_id": "version2",
"shortlink": "https://sg.run/abcd",
"url": "https://semgrep.dev/r/python.eqeq-five",
"shortlink": "https://sg.run/abcd"
"version_id": "version2"
},
"src": "new-version"
}
Expand Down Expand Up @@ -337,9 +337,9 @@ Would have sent findings and ignores blob: {
"semgrep.dev": {
"rule": {
"rule_id": "abcd",
"version_id": "version1",
"shortlink": "https://sg.run/abcd",
"url": "https://semgrep.dev/r/python.eqeq-five",
"shortlink": "https://sg.run/abcd"
"version_id": "version1"
},
"src": "unchanged"
}
Expand Down Expand Up @@ -524,10 +524,10 @@ Would have sent findings and ignores blob: {
"semgrep.dev": {
"rule": {
"rule_id": "abce",
"version_id": "version1",
"url": "https://semgrep.dev/r/python.eqeq-five",
"rule_name": "eqeq-four",
"shortlink": "https://sg.run/abcd",
"rule_name": "eqeq-four"
"url": "https://semgrep.dev/r/python.eqeq-five",
"version_id": "version1"
},
"src": "previous-scan"
}
Expand Down Expand Up @@ -562,9 +562,9 @@ Would have sent findings and ignores blob: {
"semgrep.dev": {
"rule": {
"rule_id": "abce",
"version_id": "version2",
"shortlink": "https://sg.run/abcd",
"url": "https://semgrep.dev/r/python.eqeq-five",
"shortlink": "https://sg.run/abcd"
"version_id": "version2"
},
"src": "new-version"
}
Expand Down Expand Up @@ -597,9 +597,9 @@ Would have sent findings and ignores blob: {
"semgrep.dev": {
"rule": {
"rule_id": "abcd",
"version_id": "version1",
"shortlink": "https://sg.run/abcd",
"url": "https://semgrep.dev/r/python.eqeq-five",
"shortlink": "https://sg.run/abcd"
"version_id": "version1"
},
"src": "unchanged"
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,9 +20,9 @@
"semgrep.dev": {
"rule": {
"rule_id": "abcf",
"version_id": "version1",
"shortlink": "https://sg.run/abcd",
"url": "https://semgrep.dev/r/python.eqeq-five",
"shortlink": "https://sg.run/abcd"
"version_id": "version1"
},
"src": "new-rule"
}
Expand Down Expand Up @@ -160,10 +160,10 @@
"semgrep.dev": {
"rule": {
"rule_id": "abce",
"version_id": "version1",
"url": "https://semgrep.dev/r/python.eqeq-five",
"rule_name": "eqeq-four",
"shortlink": "https://sg.run/abcd",
"rule_name": "eqeq-four"
"url": "https://semgrep.dev/r/python.eqeq-five",
"version_id": "version1"
},
"src": "previous-scan"
}
Expand Down Expand Up @@ -198,9 +198,9 @@
"semgrep.dev": {
"rule": {
"rule_id": "abce",
"version_id": "version2",
"shortlink": "https://sg.run/abcd",
"url": "https://semgrep.dev/r/python.eqeq-five",
"shortlink": "https://sg.run/abcd"
"version_id": "version2"
},
"src": "new-version"
}
Expand Down Expand Up @@ -233,9 +233,9 @@
"semgrep.dev": {
"rule": {
"rule_id": "abcd",
"version_id": "version1",
"shortlink": "https://sg.run/abcd",
"url": "https://semgrep.dev/r/python.eqeq-five",
"shortlink": "https://sg.run/abcd"
"version_id": "version1"
},
"src": "unchanged"
}
Expand Down Expand Up @@ -423,10 +423,10 @@
"semgrep.dev": {
"rule": {
"rule_id": "abce",
"version_id": "version1",
"url": "https://semgrep.dev/r/python.eqeq-five",
"rule_name": "eqeq-four",
"shortlink": "https://sg.run/abcd",
"rule_name": "eqeq-four"
"url": "https://semgrep.dev/r/python.eqeq-five",
"version_id": "version1"
},
"src": "previous-scan"
}
Expand Down Expand Up @@ -461,9 +461,9 @@
"semgrep.dev": {
"rule": {
"rule_id": "abce",
"version_id": "version2",
"shortlink": "https://sg.run/abcd",
"url": "https://semgrep.dev/r/python.eqeq-five",
"shortlink": "https://sg.run/abcd"
"version_id": "version2"
},
"src": "new-version"
}
Expand Down Expand Up @@ -496,9 +496,9 @@
"semgrep.dev": {
"rule": {
"rule_id": "abcd",
"version_id": "version1",
"shortlink": "https://sg.run/abcd",
"url": "https://semgrep.dev/r/python.eqeq-five",
"shortlink": "https://sg.run/abcd"
"version_id": "version1"
},
"src": "unchanged"
}
Expand Down
Loading

0 comments on commit 752b48d

Please sign in to comment.