Export results transformation (subtext and hashed) (#582)
* square bracket workaround in keyword regex

* path filter

* BM score fix

* ValueStringTypeCheck workaround for heterogeneous source

* wrap added to filter array definitions

* TOML format sanitizer

* YAML case

* BM fix

* BM scores fix

* [skip actions] [subhashtext] 2024-08-12T21:32:30+03:00

* variable is hashed too

* hash & subtext test

* testBM

* updBMscor

* refactoring

* skip f* in BM experiment

* keep 0*-3* meta for experiment

* less repos in test

* refactoring2

* read_text.cache_clear()

* --subtext in benchmark

* [skip actions] [subhashtext] 2024-08-13T12:52:11+03:00

* [skip actions] [subhashtext] 2024-08-13T12:55:14+03:00

* fix

* subtext

* experiment ml rollback

* BM scores with hashes

* some rollbacks
babenek authored Aug 14, 2024
1 parent 45e0643 commit 5e2bf59
Showing 20 changed files with 1,243 additions and 1,135 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/benchmark.yml
@@ -121,7 +121,7 @@ jobs:
- name: Run CredSweeper tool
run: |
-credsweeper --banner --log info --jobs $(nproc) --path data --save-json report.${{ github.event.pull_request.head.sha }}.json | tee credsweeper.${{ github.event.pull_request.head.sha }}.log
+credsweeper --banner --log info --jobs $(nproc) --subtext --path data --save-json report.${{ github.event.pull_request.head.sha }}.json | tee credsweeper.${{ github.event.pull_request.head.sha }}.log
- name: Run Benchmark
run: |
62 changes: 32 additions & 30 deletions cicd/benchmark.txt

Large diffs are not rendered by default.

12 changes: 11 additions & 1 deletion credsweeper/__main__.py
@@ -8,7 +8,7 @@

from credsweeper import __version__
from credsweeper.app import APP_PATH, CredSweeper
-from credsweeper.common.constants import ThresholdPreset, Severity, RuleType, DiffRowType
+from credsweeper.common.constants import ThresholdPreset, Severity, RuleType, DiffRowType, ML_HUNK
from credsweeper.file_handler.abstract_provider import AbstractProvider
from credsweeper.file_handler.files_provider import FilesProvider
from credsweeper.file_handler.patches_provider import PatchesProvider
@@ -215,6 +215,14 @@ def get_arguments() -> Namespace:
const="output.xlsx",
dest="xlsx_filename",
metavar="PATH")
parser.add_argument("--hashed",
help="line, variable, value will be hashed in output",
action="store_const",
const=True)
parser.add_argument("--subtext",
help=f"line text will be stripped in {2 * ML_HUNK} symbols but value and variable are kept",
action="store_const",
const=True)
parser.add_argument("--sort", help="enable output sorting", dest="sort_output", action="store_true")
parser.add_argument("--log",
"-l",
@@ -282,6 +290,8 @@ def scan(args: Namespace, content_provider: AbstractProvider, json_filename: Opt
api_validation=args.api_validation,
json_filename=json_filename,
xlsx_filename=xlsx_filename,
hashed=args.hashed,
subtext=args.subtext,
sort_output=args.sort_output,
use_filters=args.no_filters,
pool_count=args.jobs,
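A note on the argparse pattern used above: with action="store_const" and const=True, each flag is None when omitted (falsy) and True when passed, so downstream code can treat the attributes as plain booleans. A minimal sketch of that behavior:

    from argparse import ArgumentParser

    parser = ArgumentParser()
    parser.add_argument("--hashed", action="store_const", const=True)
    parser.add_argument("--subtext", action="store_const", const=True)

    assert parser.parse_args([]).hashed is None  # omitted flag stays None
    assert parser.parse_args(["--hashed", "--subtext"]).subtext is True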
13 changes: 10 additions & 3 deletions credsweeper/app.py
@@ -42,6 +42,8 @@ def __init__(self,
api_validation: bool = False,
json_filename: Union[None, str, Path] = None,
xlsx_filename: Union[None, str, Path] = None,
hashed: bool = False,
subtext: bool = False,
sort_output: bool = False,
use_filters: bool = True,
pool_count: int = 1,
@@ -70,6 +72,8 @@ def __init__(self,
to json
xlsx_filename: optional string variable, path to save result
to xlsx
hashed: use hash of line, value and variable instead of plain text
subtext: use a subtext of the line near the variable-value pair, as is done for ML input
use_filters: boolean variable, specifying the need of rule filters
pool_count: int value, number of parallel processes to use
ml_batch_size: int value, size of the batch for model inference
@@ -104,6 +108,8 @@ def __init__(self,
self.credential_manager = CredentialManager()
self.json_filename: Union[None, str, Path] = json_filename
self.xlsx_filename: Union[None, str, Path] = xlsx_filename
self.hashed = hashed
self.subtext = subtext
self.sort_output = sort_output
self.ml_batch_size = ml_batch_size if ml_batch_size and 0 < ml_batch_size else 16
self.ml_threshold = ml_threshold
@@ -400,16 +406,17 @@ def export_results(self) -> None:

if self.json_filename:
is_exported = True
-Util.json_dump([credential.to_json() for credential in credentials], file_path=self.json_filename)
+Util.json_dump([credential.to_json(hashed=self.hashed, subtext=self.subtext) for credential in credentials],
+               file_path=self.json_filename)

if self.xlsx_filename:
is_exported = True
data_list = []
for credential in credentials:
-data_list.extend(credential.to_dict_list())
+data_list.extend(credential.to_dict_list(hashed=self.hashed, subtext=self.subtext))
df = pd.DataFrame(data=data_list)
df.to_excel(self.xlsx_filename, index=False)

if is_exported is False:
for credential in credentials:
-print(credential)
+print(credential.to_str(hashed=self.hashed, subtext=self.subtext))
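With export_results() now threading the options through to_json(), to_dict_list() and to_str(), the same two flags govern all three output channels: JSON, XLSX and stdout. A minimal sketch of driving this through the Python API, assuming the provider/run() wiring that the CLI uses; the scanned path is hypothetical:

    from credsweeper.app import CredSweeper
    from credsweeper.file_handler.files_provider import FilesProvider

    provider = FilesProvider(["tests/samples"])  # hypothetical input path
    sweeper = CredSweeper(json_filename="report.json", hashed=True, subtext=True)
    sweeper.run(provider)  # export_results() then writes hashed, stripped findings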
3 changes: 3 additions & 0 deletions credsweeper/common/constants.py
@@ -1,4 +1,5 @@
import re
import typing
from enum import Enum
from typing import Optional, Union

@@ -167,6 +168,8 @@ class DiffRowType(Enum):
DELETED = "deleted"


StartEnd = typing.NamedTuple("StartEnd", [("start", int), ("end", int)])

MIN_VARIABLE_LENGTH = 1
MIN_SEPARATOR_LENGTH = 1
MIN_VALUE_LENGTH = 4
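StartEnd is a functional-style NamedTuple, so a pair of positions travels as a single typed value with named fields instead of a bare tuple. A tiny illustration with made-up positions:

    import typing

    StartEnd = typing.NamedTuple("StartEnd", [("start", int), ("end", int)])

    pos = StartEnd(start=4, end=20)
    assert pos.start == 4 and pos.end - pos.start == 16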
18 changes: 11 additions & 7 deletions credsweeper/credentials/candidate.py
@@ -88,18 +88,22 @@ def is_api_validation_available(self) -> bool:
"""
return len(self.validations) > 0

-def __str__(self) -> str:
+def to_str(self, subtext: bool = False, hashed: bool = False) -> str:
"""Represent candidate with subtext or|and hashed values"""
return f"rule: {self.rule_name}" \
f" | severity: {self.severity.value}" \
f" | confidence: {self.confidence.value}" \
f" | line_data_list: {self.line_data_list}" \
f" | line_data_list: [{', '.join([x.to_str(subtext, hashed) for x in self.line_data_list])}]" \
f" | api_validation: {self.api_validation.name}" \
f" | ml_validation: {self.ml_validation.name}"

def __str__(self):
return self.to_str()

def __repr__(self):
-return str(self)
+return self.to_str(subtext=True)

-def to_json(self) -> Dict:
+def to_json(self, hashed: bool, subtext: bool) -> Dict:
"""Convert credential candidate object to dictionary.
Return:
@@ -116,23 +120,23 @@ def to_json(self) -> Dict:
"confidence": self.confidence.value,
"use_ml": self.use_ml,
# put the array to end to make json more readable
"line_data_list": [line_data.to_json() for line_data in self.line_data_list],
"line_data_list": [line_data.to_json(hashed, subtext) for line_data in self.line_data_list],
}
if self.config is not None:
reported_output = {k: v for k, v in full_output.items() if k in self.config.candidate_output}
else:
reported_output = full_output
return reported_output

-def to_dict_list(self) -> List[dict]:
+def to_dict_list(self, hashed: bool, subtext: bool) -> List[dict]:
"""Convert credential candidate object to List[dict].
Return:
List[dict] object generated from current credential candidate
"""
reported_output = []
-json_output = self.to_json()
+json_output = self.to_json(hashed, subtext)
refined_data = copy.deepcopy(json_output)
del refined_data["line_data_list"]
for line_data in json_output["line_data_list"]:
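The candidate itself transforms nothing: it only forwards hashed and subtext to each LineData, which renders its own fields. A simplified stand-in (not the real classes) showing that delegation:

    import hashlib
    from typing import Dict, List

    class DemoLineData:
        def __init__(self, value: str) -> None:
            self.value = value

        def to_json(self, hashed: bool, subtext: bool) -> Dict:
            # hash only on request; subtext handling is omitted for brevity
            value = hashlib.sha256(self.value.encode()).hexdigest() if hashed else self.value
            return {"value": value}

    class DemoCandidate:
        def __init__(self, line_data_list: List[DemoLineData]) -> None:
            self.line_data_list = line_data_list

        def to_json(self, hashed: bool, subtext: bool) -> Dict:
            return {"line_data_list": [x.to_json(hashed, subtext) for x in self.line_data_list]}

    print(DemoCandidate([DemoLineData("secret")]).to_json(hashed=True, subtext=False))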
66 changes: 57 additions & 9 deletions credsweeper/credentials/line_data.py
@@ -1,10 +1,11 @@
import contextlib
import hashlib
import re
import string
from functools import cached_property
from typing import Any, Dict, Optional, Tuple

-from credsweeper.common.constants import MAX_LINE_LENGTH
+from credsweeper.common.constants import MAX_LINE_LENGTH, UTF_8, StartEnd, ML_HUNK
from credsweeper.config import Config
from credsweeper.utils import Util
from credsweeper.utils.entropy_validator import EntropyValidator
@@ -300,34 +301,81 @@ def is_source_file_with_quotes(self) -> bool:
return True
return False

@staticmethod
def get_hash_or_subtext(
text: Optional[str], #
hashed: bool, #
cut_pos: Optional[StartEnd] = None, #
) -> Optional[str]:
"""Represent not empty text with hash or a "beauty" subtext if required
Args:
text: str - input string
hashed: bool - whether the text will be hashed and returned
cut_pos: Optional[StartEnd] - start, end positions which text must be kept in output
Return:
sha256 hash in hex representation of input text with UTF-8 encodings
or
subtext from start to end, or original text as is
"""
if text:
if hashed:
text = hashlib.sha256(text.encode(UTF_8, errors="strict")).hexdigest()
elif cut_pos is not None:
if 2 * ML_HUNK < cut_pos.end - cut_pos.start:
# subtext positions exceed the limit
text = text[cut_pos.start:cut_pos.end]
else:
strip_text = text.strip()
if 2 * ML_HUNK >= len(strip_text):
# stripped text length meets the limit
text = strip_text
else:
offset = len(text) - len(text.lstrip())
center = (cut_pos.end + cut_pos.start - offset) >> 1
text = Util.subtext(strip_text, center, ML_HUNK)
return text

def to_str(self, subtext: bool = False, hashed: bool = False) -> str:
"""Represent line_data with subtext or|and hashed values"""
cut_pos = StartEnd(self.variable_start, self.value_end) if subtext else None
return f"line: '{self.get_hash_or_subtext(self.line, hashed, cut_pos)}'" \
f" | line_num: {self.line_num} | path: {self.path}" \
f" | value: '{self.get_hash_or_subtext(self.value, hashed)}'" \
f" | entropy_validation: {EntropyValidator(self.value)}"

def __str__(self):
return f"line: '{self.line}' | line_num: {self.line_num} | path: {self.path}" \
f" | value: '{self.value}' | entropy_validation: {EntropyValidator(self.value)}"
return self.to_str()

def __repr__(self):
-return str(self)
+return self.to_str(subtext=True)

-def to_json(self) -> Dict:
+def to_json(self, hashed: bool, subtext: bool) -> Dict:
"""Convert line data object to dictionary.
Return:
Dictionary object generated from current line data
"""
cut_pos = StartEnd(self.variable_start if 0 <= self.variable_start else self.value_start,
self.value_end) if subtext else None
full_output = {
"key": self.key,
"line": self.line,
"line": self.get_hash_or_subtext(self.line, hashed, cut_pos),
"line_num": self.line_num,
"path": self.path,
"info": self.info,
# info may contain variable name - so let it be hashed if requested
"info": self.get_hash_or_subtext(self.info, hashed),
"pattern": self.pattern.pattern,
"separator": self.separator,
"separator_start": self.separator_start,
"separator_end": self.separator_end,
"value": self.value,
"value": self.get_hash_or_subtext(self.value, hashed),
"value_start": self.value_start,
"value_end": self.value_end,
"variable": self.variable,
"variable": self.get_hash_or_subtext(self.variable, hashed),
"variable_start": self.variable_start,
"variable_end": self.variable_end,
"value_leftquote": self.value_leftquote,
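A worked example of the two branches of get_hash_or_subtext(), assuming ML_HUNK = 80 (the "160 symbols" in the --subtext help equals 2 * ML_HUNK); the sample line and positions are hypothetical:

    import hashlib

    ML_HUNK = 80

    line = "password = '" + "x" * 200 + "'"  # 213 characters
    cut_start, cut_end = 0, 213  # variable_start..value_end span

    # hashed branch: the text becomes a stable 64-character sha256 hex digest
    digest = hashlib.sha256(line.encode("utf-8")).hexdigest()
    assert len(digest) == 64

    # subtext branch: a span longer than 2 * ML_HUNK is kept exactly as given
    if 2 * ML_HUNK < cut_end - cut_start:
        line = line[cut_start:cut_end]
    assert len(line) == 213

    # a shorter line would instead be stripped and, if still over the limit,
    # cut to a 2 * ML_HUNK window centered between variable start and value end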
4 changes: 2 additions & 2 deletions credsweeper/utils/util.py
@@ -226,8 +226,8 @@ def decode_bytes(content: bytes, encodings: Optional[List[str]] = None) -> List[
text = content.decode(encoding, errors="strict")
if content != text.encode(encoding, errors="strict"):
raise UnicodeError
-# windows style workaround
-lines = text.replace('\r\n', '\n').replace('\r', '\n').split("\n")
+# windows & macos styles workaround
+lines = text.replace('\r\n', '\n').replace('\r', '\n').split('\n')
break
except UnicodeError:
binary_suggest = True
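The change is cosmetic (comment wording and quote style), but it documents what the code already did: both Windows "\r\n" and classic macOS "\r" line endings normalize to "\n" before splitting. A quick check:

    text = "first\r\nsecond\rthird\n"
    lines = text.replace('\r\n', '\n').replace('\r', '\n').split('\n')
    assert lines == ["first", "second", "third", ""]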
4 changes: 3 additions & 1 deletion docs/source/guide.rst
@@ -15,7 +15,7 @@ Get all argument list:
usage: python -m credsweeper [-h] (--path PATH [PATH ...] | --diff_path PATH [PATH ...] | --export_config [PATH] | --export_log_config [PATH]) [--rules [PATH]] [--severity SEVERITY] [--config [PATH]]
[--log_config [PATH]] [--denylist PATH] [--find-by-ext] [--depth POSITIVE_INT] [--no-filters] [--doc] [--ml_threshold FLOAT_OR_STR] [--ml_batch_size POSITIVE_INT]
-[--azure | --cuda] [--api_validation] [--jobs POSITIVE_INT] [--skip_ignored] [--save-json [PATH]] [--save-xlsx [PATH]] [--sort] [--log LOG_LEVEL] [--size_limit SIZE_LIMIT]
+[--azure | --cuda] [--api_validation] [--jobs POSITIVE_INT] [--skip_ignored] [--save-json [PATH]] [--save-xlsx [PATH]] [--hashed] [--subtext] [--sort] [--log LOG_LEVEL] [--size_limit SIZE_LIMIT]
[--banner] [--version]
options:
-h, --help show this help message and exit
@@ -49,6 +49,8 @@ Get all argument list:
--skip_ignored parse .gitignore files and skip credentials from ignored objects
--save-json [PATH] save result to json file (default: output.json)
--save-xlsx [PATH] save result to xlsx file (default: output.xlsx)
--hashed line, variable, value will be hashed in output
--subtext line text will be stripped to 160 symbols, but value and variable are kept
--sort enable output sorting
--log LOG_LEVEL, -l LOG_LEVEL
provide logging level of ['DEBUG', 'INFO', 'WARN', 'WARNING', 'ERROR', 'FATAL', 'CRITICAL', 'SILENCE'](default: 'warning', case insensitive)
4 changes: 2 additions & 2 deletions experiment/main.py
@@ -20,7 +20,7 @@
from experiment.src.features import prepare_data
from experiment.src.lstm_model import get_model
from experiment.src.model_config_preprocess import model_config_preprocess
-from experiment.src.prepare_data import prepare_train_data, meta_checksum
+from experiment.src.prepare_data import prepare_train_data, data_checksum


def evaluate_model(thresholds: dict, keras_model: Model, x_data: List[np.ndarray], y_label: np.ndarray):
@@ -59,7 +59,7 @@ def main(cred_data_location: str, jobs: int) -> str:
prepare_train_data(_cred_data_location, jobs)

# detected data means which data is passed to ML validator of credsweeper after filters with RuleName
-detected_data = read_detected_data(f"results/detected_data.{meta_checksum(cred_data_location)}.json")
+detected_data = read_detected_data(f"results/detected_data.{data_checksum(cred_data_location)}.json")
print(f"CredSweeper detected {len(detected_data)} credentials without ML")
# all markup data
meta_data = read_metadata(f"{cred_data_location}/meta")
4 changes: 2 additions & 2 deletions experiment/main.sh
@@ -18,7 +18,7 @@ if [ 0 -ne ${error_code} ]; then exit ${error_code}; fi

cd ${CREDSWEEPER_DIR}
report_file=${RESULT_DIR}/${now}.json
-${CREDSWEEPER_DIR}/.venv/bin/python -m credsweeper --sort --path ~/q/DataCred/auxiliary/data/ --log info --job $(nproc) --save-json ${report_file}
+${CREDSWEEPER_DIR}/.venv/bin/python -m credsweeper --sort --path ~/q/DataCred/abspos/data/ --log info --job $(nproc) --subtext --save-json ${report_file}

-cd ~/q/DataCred/auxiliary/
+cd ~/q/DataCred/abspos/
.venv/bin/python -m benchmark --scanner credsweeper --load ${report_file} | tee ${report_file}.log
32 changes: 16 additions & 16 deletions experiment/src/data_loader.py
@@ -3,6 +3,7 @@
import os
import pathlib
from copy import deepcopy
from functools import cache
from typing import Tuple, Dict, Set, Any

import numpy as np
@@ -38,14 +39,7 @@ def read_detected_data(file_path: str) -> Dict[identifier, Dict]:
line_data = deepcopy(cred["line_data_list"][0])
line_data.pop("entropy_validation")
line_data.pop("info")
line = line_data["line"].lstrip()
offset = len(line_data["line"]) - len(line)
line_data["line"] = line.rstrip()
line_data["value_start"] -= offset
line_data["value_end"] -= offset
line_data["variable_start"] -= offset
line_data["variable_end"] -= offset
assert line_data["value"] == line_data["line"][line_data["value_start"]:line_data["value_end"]], line_data
line_data["line"] = None # will be read during join_label with data for ML input only
meta_path = transform_to_meta_path(line_data["path"])
line_data["path"] = meta_path
line_data["RuleName"] = [rule_name]
@@ -143,11 +137,20 @@ def get_colored_line(line_data: Dict[str, Any]) -> str:

def join_label(detected_data: Dict[identifier, Dict], meta_data: Dict[identifier, Dict],
cred_data_location: str) -> pd.DataFrame:

@cache
def read_text(path) -> list[str]:
with open(path, "r", encoding="utf8") as f:
return f.read().replace("\r\n", '\n').replace('\r', '\n').split('\n')

values = []
detected_rules: Set[str] = set()
for index, line_data in detected_data.items():
for i in line_data["RuleName"]:
detected_rules.add(i)
text = read_text(f'{cred_data_location}/{line_data["path"]}')
line = text[line_data["line_num"] - 1]
line_data["line"] = line
if not line_data["value"]:
print(f"WARNING: empty value\n{line_data}")
continue
@@ -184,11 +187,9 @@ def join_label(detected_data: Dict[identifier, Dict], meta_data: Dict[identifier
f"\nvariable:'{line_data['variable']}' value:'{line_data['value']}'"
f"\nsub_line:'{get_colored_line(line_data)}'")
continue
line = line_data["line"]
# the line in detected data must be striped
assert line == line.strip(), line_data
# check the value in detected data
assert line[line_data["value_start"]:line_data["value_end"]] == line_data["value"]
assert line[line_data["value_start"]:line_data["value_end"]] == line_data["value"], (
line_data, line[line_data["value_start"]:line_data["value_end"]], line_data["value"])
# todo: variable input has to be markup in meta too, or/and new feature "VariableExists" created ???
line_data["GroundTruth"] = label
line_data["ext"] = Util.get_extension(line_data["path"])
@@ -206,18 +207,17 @@ def join_label(detected_data: Dict[identifier, Dict], meta_data: Dict[identifier
print(','.join(markup.keys()))
all_meta_found = False
print(','.join(str(x) for x in markup.values()))
-text = Util.read_file(f'{cred_data_location}/{markup["FilePath"]}')
-line = text[markup["LineStart"] - 1].strip()
+text = read_text(f'{cred_data_location}/{markup["FilePath"]}')
+line = text[markup["LineStart"] - 1]
if 0 <= markup["ValueStart"] and 0 <= markup["ValueEnd"]:
line = line[:markup["ValueStart"]] \
+ Fore.LIGHTGREEN_EX \
+ line[markup["ValueStart"]:markup["ValueEnd"]] \
+ Style.RESET_ALL \
+ line[markup["ValueEnd"]:]
print(line)
-# print(Util.subtext(line, markup['ValueStart'], ML_HUNK))
break

read_text.cache_clear()
df = pd.DataFrame(values)
return df

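The functools.cache decorator memoizes read_text() per path, so a file referenced by many markup rows is read once, and the explicit read_text.cache_clear() releases the cached contents once the join is finished. A simplified standalone version of the pattern:

    from functools import cache
    from typing import List

    @cache
    def read_text(path: str) -> List[str]:
        with open(path, "r", encoding="utf8") as f:
            return f.read().replace("\r\n", '\n').replace('\r', '\n').split('\n')

    def count_lines(paths: List[str]) -> int:
        total = sum(len(read_text(p)) for p in paths)  # repeated paths hit the cache
        read_text.cache_clear()  # free the cached file contents afterwards
        return total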
The diffs for the remaining changed files are not rendered.
