Merge branch 'main' into babenek-patch-1

Samsung · Aug 21, 2024 · 4fb5b0c
2 parents 42a7e1e + 548e2cc
commit 4fb5b0c
Show file tree

Hide file tree

Showing 22 changed files with 1,804 additions and 996 deletions.
diff --git a/cicd/README.md → .ci/README.md b/cicd/README.md → .ci/README.md
diff --git a/cicd/benchmark.txt → .ci/benchmark.txt b/cicd/benchmark.txt → .ci/benchmark.txt
diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml
@@ -150,7 +150,7 @@ jobs:
 
       - name: Verify benchmark scores of the PR
         run: |
-          diff --ignore-all-space --ignore-blank-lines temp/CredSweeper/cicd/benchmark.txt benchmark.${{ github.event.pull_request.head.sha }}.log
+          diff --unified=3 --ignore-all-space --ignore-blank-lines temp/CredSweeper/.ci/benchmark.txt benchmark.${{ github.event.pull_request.head.sha }}.log
 
   # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
 
@@ -422,7 +422,7 @@ jobs:
           # crc32 should be changed
           python -m credsweeper --banner
           # run quick scan
-          python -m credsweeper --log debug --path ../tests/samples --save-json
+          python -m credsweeper --ml_providers AzureExecutionProvider,CPUExecutionProvider --log debug --path ../tests/samples --save-json
           NEW_MODEL_FOUND_SAMPLES=$(jq '.|length' output.json)
           if [ 10 -gt ${NEW_MODEL_FOUND_SAMPLES} ]; then
             echo "Failure: found ${NEW_MODEL_FOUND_SAMPLES} credentials"

diff --git a/.github/workflows/check.yml b/.github/workflows/check.yml
@@ -27,6 +27,15 @@ jobs:
         fetch-depth: 0
         ref: ${{ github.event.pull_request.head.sha }}
 
+    # # # ml_config & ml_model integrity
+
+    - name: Check ml_model.onnx integrity
+      if: ${{ always() && steps.code_checkout.conclusion == 'success' }}
+      run: |
+        md5sum --binary credsweeper/ml_model/ml_config.json | grep 2b29c5e1aa199d14b788652bd542c7c0
+        md5sum --binary credsweeper/ml_model/ml_model.onnx | grep 88f37978fc0599ac8d1bf732ad40c077
+
+
     # # # line ending
 
     - name: Check for text file ending
@@ -53,13 +62,6 @@ jobs:
         done
         exit ${n}
 
-    # # # ml_model integrity
-
-    - name: Check ml_model.onnx integrity
-      if: ${{ always() && steps.code_checkout.conclusion == 'success' }}
-      run: |
-        md5sum --binary credsweeper/ml_model/ml_model.onnx | grep 62d92ab2f91a18e861d846a7b8a0c3a7
-
     # # # Python setup
 
     - name: Set up Python

diff --git a/credsweeper/__main__.py b/credsweeper/__main__.py
@@ -117,7 +117,6 @@ def get_arguments() -> Namespace:
                        dest="export_log_config",
                        metavar="PATH")
     parser.add_argument("--rules",
-                        nargs="?",
                         help="path of rule config file (default: credsweeper/rules/config.yaml). "
                         f"severity:{[i.value for i in Severity]} "
                         f"type:{[i.value for i in RuleType]}",
@@ -131,13 +130,11 @@ def get_arguments() -> Namespace:
                         dest="severity",
                         type=severity_levels)
     parser.add_argument("--config",
-                        nargs="?",
                         help="use custom config (default: built-in)",
                         default=None,
                         dest="config_path",
                         metavar="PATH")
     parser.add_argument("--log_config",
-                        nargs="?",
                         help="use custom log config (default: built-in)",
                         default=None,
                         dest="log_config_path",
@@ -178,15 +175,27 @@ def get_arguments() -> Namespace:
                         default=16,
                         required=False,
                         metavar="POSITIVE_INT")
-    ml_provider_group = parser.add_mutually_exclusive_group()
-    ml_provider_group.add_argument("--azure",
-                                   help="enable AzureExecutionProvider for onnx",
-                                   dest="azure",
-                                   action="store_true")
-    ml_provider_group.add_argument("--cuda",
-                                   help="enable CUDAExecutionProvider for onnx",
-                                   dest="cuda",
-                                   action="store_true")
+    parser.add_argument("--ml_config",
+                        help="use external config for ml model",
+                        type=str,
+                        default=None,
+                        dest="ml_config",
+                        required=False,
+                        metavar="PATH")
+    parser.add_argument("--ml_model",
+                        help="use external ml model",
+                        type=str,
+                        default=None,
+                        dest="ml_model",
+                        required=False,
+                        metavar="PATH")
+    parser.add_argument("--ml_providers",
+                        help="comma separated list of providers for onnx (CPUExecutionProvider is used by default)",
+                        type=str,
+                        default=None,
+                        dest="ml_providers",
+                        required=False,
+                        metavar="STR")
     parser.add_argument("--api_validation",
                         help="add credential api validation option to credsweeper pipeline. "
                         "External API is used to reduce FP for some rule types.",
@@ -297,8 +306,9 @@ def scan(args: Namespace, content_provider: AbstractProvider, json_filename: Opt
                                   pool_count=args.jobs,
                                   ml_batch_size=args.ml_batch_size,
                                   ml_threshold=args.ml_threshold,
-                                  azure=args.azure,
-                                  cuda=args.cuda,
+                                  ml_config=args.ml_config,
+                                  ml_model=args.ml_model,
+                                  ml_providers=args.ml_providers,
                                   find_by_ext=args.find_by_ext,
                                   depth=args.depth,
                                   doc=args.doc,

diff --git a/credsweeper/app.py b/credsweeper/app.py
@@ -49,8 +49,9 @@ def __init__(self,
                  pool_count: int = 1,
                  ml_batch_size: Optional[int] = None,
                  ml_threshold: Union[float, ThresholdPreset] = ThresholdPreset.medium,
-                 azure: bool = False,
-                 cuda: bool = False,
+                 ml_config: Union[None, str, Path] = None,
+                 ml_model: Union[None, str, Path] = None,
+                 ml_providers: Optional[str] = None,
                  find_by_ext: bool = False,
                  depth: int = 0,
                  doc: bool = False,
@@ -78,6 +79,9 @@ def __init__(self,
             pool_count: int value, number of parallel processes to use
             ml_batch_size: int value, size of the batch for model inference
             ml_threshold: float or string value to specify threshold for the ml model
+            ml_config: str or Path to set custom config of ml model
+            ml_model: str or Path to set custom ml model
+            ml_providers: str - comma separated list with providers
             find_by_ext: boolean - files will be reported by extension
             depth: int - how deep container files will be scanned
             doc: boolean - document-specific scanning
@@ -113,8 +117,9 @@ def __init__(self,
         self.sort_output = sort_output
         self.ml_batch_size = ml_batch_size if ml_batch_size and 0 < ml_batch_size else 16
         self.ml_threshold = ml_threshold
-        self.azure = azure
-        self.cuda = cuda
+        self.ml_config = ml_config
+        self.ml_model = ml_model
+        self.ml_providers = ml_providers
         self.ml_validator = None
         self.__log_level = log_level
 
@@ -187,7 +192,12 @@ def ml_validator(self) -> MlValidator:
         """ml_validator getter"""
         from credsweeper.ml_model import MlValidator
         if not self.__ml_validator:
-            self.__ml_validator: MlValidator = MlValidator(threshold=self.ml_threshold)
+            self.__ml_validator: MlValidator = MlValidator(
+                threshold=self.ml_threshold,  #
+                ml_config=self.ml_config,  #
+                ml_model=self.ml_model,  #
+                ml_providers=self.ml_providers,  #
+            )
         assert self.__ml_validator, "self.__ml_validator was not initialized"
         return self.__ml_validator
 

diff --git a/credsweeper/filters/value_string_type_check.py b/credsweeper/filters/value_string_type_check.py
@@ -41,7 +41,7 @@ def run(self, line_data: LineData, target: AnalysisTarget) -> bool:
         not_comment = not line_data.is_comment()
 
         if line_data.is_source_file_with_quotes() and not_comment and not_quoted and not line_data.is_quoted \
-                and '=' in line_data.separator:
+                and line_data.separator and '=' in line_data.separator:
             # heterogeneous code e.g. YAML in Python uses colon sign instead equals
             return True
 

diff --git a/credsweeper/ml_model/model_config.json → credsweeper/ml_model/ml_config.json b/credsweeper/ml_model/model_config.json → credsweeper/ml_model/ml_config.json
@@ -373,12 +373,12 @@
                     ".cjs",
                     ".cljc",
                     ".cmd",
+                    ".cmm",
                     ".cnf",
                     ".coffee",
                     ".conf",
                     ".config",
                     ".cpp",
-                    ".creds",
                     ".crt",
                     ".cs",
                     ".csp",
@@ -417,7 +417,6 @@
                     ".json",
                     ".jsp",
                     ".jsx",
-                    ".jwt",
                     ".kt",
                     ".las",
                     ".ldif",
@@ -428,7 +427,6 @@
                     ".log",
                     ".lua",
                     ".m",
-                    ".map",
                     ".markerb",
                     ".md",
                     ".mdx",
@@ -445,7 +443,6 @@
                     ".patch",
                     ".php",
                     ".pl",
-                    ".pm",
                     ".po",
                     ".pod",
                     ".postinst",
@@ -457,7 +454,6 @@
                     ".purs",
                     ".pxd",
                     ".py",
-                    ".pyi",
                     ".pyx",
                     ".r",
                     ".rb",
@@ -469,6 +465,7 @@
                     ".rs",
                     ".rsp",
                     ".rst",
+                    ".rules",
                     ".sample",
                     ".sbt",
                     ".scala",
@@ -478,7 +475,6 @@
                     ".sql",
                     ".storyboard",
                     ".strings",
-                    ".swift",
                     ".t",
                     ".td",
                     ".tdf",
@@ -498,7 +494,6 @@
                     ".vue",
                     ".xaml",
                     ".xib",
-                    ".xml",
                     ".yaml",
                     ".yml",
                     ".zsh",
@@ -515,7 +510,6 @@
                     "Certificate",
                     "Credential",
                     "Github Old Token",
-                    "JSON Web Token",
                     "Key",
                     "Nonce",
                     "Password",

diff --git a/credsweeper/ml_model/ml_model.onnx b/credsweeper/ml_model/ml_model.onnx
diff --git a/credsweeper/ml_model/ml_validator.py b/credsweeper/ml_model/ml_validator.py
@@ -1,7 +1,8 @@
+import hashlib
 import logging
-import os
 import string
-from typing import List, Tuple, Union
+from pathlib import Path
+from typing import List, Tuple, Union, Optional
 
 import numpy as np
 import onnxruntime as ort
@@ -21,35 +22,56 @@ class MlValidator:
     CHAR_INDEX = {char: index for index, char in enumerate('\0' + string.printable + NON_ASCII)}
     NUM_CLASSES = len(CHAR_INDEX)
 
-    def __init__(self, threshold: Union[float, ThresholdPreset], azure: bool = False, cuda: bool = False) -> None:
+    def __init__(
+            self,  #
+            threshold: Union[float, ThresholdPreset],  #
+            ml_config: Union[None, str, Path] = None,  #
+            ml_model: Union[None, str, Path] = None,  #
+            ml_providers: Optional[str] = None) -> None:
         """Init
 
         Args:
             threshold: decision threshold
+            ml_config: path to ml config
+            ml_model: path to ml model
+            ml_providers: comma separated list of providers https://onnxruntime.ai/docs/execution-providers/
         """
-        dir_path = os.path.dirname(os.path.realpath(__file__))
-        model_file_path = os.path.join(dir_path, "ml_model.onnx")
-        if azure:
-            provider = "AzureExecutionProvider"
-        elif cuda:
-            provider = "CUDAExecutionProvider"
+        dir_path = Path(__file__).parent
+
+        if ml_config:
+            ml_config_path = Path(ml_config)
+        else:
+            ml_config_path = dir_path / "ml_config.json"
+        with open(ml_config_path, "rb") as f:
+            md5_config = hashlib.md5(f.read()).hexdigest()
+
+        if ml_model:
+            ml_model_path = Path(ml_model)
+        else:
+            ml_model_path = dir_path / "ml_model.onnx"
+        with open(ml_model_path, "rb") as f:
+            md5_model = hashlib.md5(f.read()).hexdigest()
+
+        if ml_providers:
+            providers = ml_providers.split(',')
         else:
-            provider = "CPUExecutionProvider"
-        self.model_session = ort.InferenceSession(model_file_path, providers=[provider])
+            providers = ["CPUExecutionProvider"]
+        self.model_session = ort.InferenceSession(ml_model_path, providers=providers)
 
-        model_details = Util.json_load(os.path.join(dir_path, "model_config.json"))
+        model_config = Util.json_load(ml_config_path)
         if isinstance(threshold, float):
             self.threshold = threshold
-        elif isinstance(threshold, ThresholdPreset) and "thresholds" in model_details:
-            self.threshold = model_details["thresholds"][threshold.value]
+        elif isinstance(threshold, ThresholdPreset) and "thresholds" in model_config:
+            self.threshold = model_config["thresholds"][threshold.value]
         else:
             self.threshold = 0.5
 
         self.common_feature_list = []
         self.unique_feature_list = []
-        logger.info("Init ML validator, model file path: %s", model_file_path)
-        logger.debug("ML validator details: %s", model_details)
-        for feature_definition in model_details["features"]:
+        logger.info("Init ML validator with %s provider; config:'%s' md5:%s model:'%s' md5:%s", providers,
+                    ml_config_path, md5_config, ml_model_path, md5_model)
+        logger.debug("ML validator details: %s", model_config)
+        for feature_definition in model_config["features"]:
             feature_class = feature_definition["type"]
             kwargs = feature_definition.get("kwargs", {})
             feature_constructor = getattr(features, feature_class, None)

diff --git a/docs/source/guide.rst b/docs/source/guide.rst
@@ -13,9 +13,13 @@ Get all argument list:
 
 .. code-block:: text
 
-    usage: python -m credsweeper [-h] (--path PATH [PATH ...] | --diff_path PATH [PATH ...] | --export_config [PATH] | --export_log_config [PATH]) [--rules [PATH]] [--severity SEVERITY] [--config [PATH]]
-                             [--log_config [PATH]] [--denylist PATH] [--find-by-ext] [--depth POSITIVE_INT] [--no-filters] [--doc] [--ml_threshold FLOAT_OR_STR] [--ml_batch_size POSITIVE_INT]
-                             [--azure | --cuda] [--api_validation] [--jobs POSITIVE_INT] [--skip_ignored] [--save-json [PATH]] [--save-xlsx [PATH]] [--hashed] [--subtext] [--sort] [--log LOG_LEVEL] [--size_limit SIZE_LIMIT]
+    usage: python -m credsweeper [-h] (--path PATH [PATH ...] | --diff_path PATH [PATH ...] | --export_config [PATH] | --export_log_config [PATH])
+                             [--rules PATH] [--severity SEVERITY] [--config PATH] [--log_config PATH] [--denylist PATH]
+                             [--find-by-ext] [--depth POSITIVE_INT] [--no-filters] [--doc] [--ml_threshold FLOAT_OR_STR]
+                             [--ml_batch_size POSITIVE_INT] [--ml_config PATH] [--ml_model PATH] [--ml_providers STR]
+                             [--api_validation] [--jobs POSITIVE_INT] [--skip_ignored] [--save-json [PATH]]
+                             [--save-xlsx [PATH]] [--hashed] [--subtext] [--sort] [--log LOG_LEVEL]
+                             [--size_limit SIZE_LIMIT]
                              [--banner] [--version]
     options:
       -h, --help            show this help message and exit
@@ -27,10 +31,10 @@ Get all argument list:
                             exporting default config to file (default: config.json)
       --export_log_config [PATH]
                             exporting default logger config to file (default: log.yaml)
-      --rules [PATH]        path of rule config file (default: credsweeper/rules/config.yaml). severity:['critical', 'high', 'medium', 'low', 'info'] type:['keyword', 'pattern', 'pem_key', 'multi']
+      --rules PATH          path of rule config file (default: credsweeper/rules/config.yaml). severity:['critical', 'high', 'medium', 'low', 'info'] type:['keyword', 'pattern', 'pem_key', 'multi']
       --severity SEVERITY   set minimum level for rules to apply ['critical', 'high', 'medium', 'low', 'info'](default: 'Severity.INFO', case insensitive)
-      --config [PATH]       use custom config (default: built-in)
-      --log_config [PATH]   use custom log config (default: built-in)
+      --config PATH         use custom config (default: built-in)
+      --log_config PATH     use custom log config (default: built-in)
       --denylist PATH       path to a plain text file with lines or secrets to ignore
       --find-by-ext         find files by predefined extension
       --depth POSITIVE_INT  additional recursive search in data (experimental)
@@ -41,8 +45,9 @@ Get all argument list:
                             'highest'] (default: medium)
       --ml_batch_size POSITIVE_INT, -b POSITIVE_INT
                             batch size for model inference (default: 16)
-      --azure               enable AzureExecutionProvider for onnx
-      --cuda                enable CUDAExecutionProvider for onnx
+      --ml_config PATH      use external config for ml model
+      --ml_model PATH       use external ml model
+      --ml_providers STR    comma separated list of providers for onnx (CPUExecutionProvider is used by default)
       --api_validation      add credential api validation option to credsweeper pipeline. External API is used to reduce FP for some rule types.
       --jobs POSITIVE_INT, -j POSITIVE_INT
                             number of parallel processes to use (default: 1)