Pattern enhancement #581

Closed
wants to merge 61 commits
671abb2  [skip actions] [main] 2024-07-09T18:38:57+03:00  (babenek, Jul 9, 2024)
ec785fa  URL Cred regex fix  (babenek, Jul 10, 2024)
2457ee7  custom_ref  (babenek, Jul 10, 2024)
983bfea  bmfix  (babenek, Jul 10, 2024)
6a5fd3e  [skip actions] [main] 2024-07-10T17:41:42+03:00  (babenek, Jul 10, 2024)
ac0c557  imap_unordered -> map  (babenek, Jul 11, 2024)
6cde957  [skip actions] [main] 2024-07-11T15:36:30+03:00  (babenek, Jul 11, 2024)
74c03b1  ttt  (babenek, Jul 11, 2024)
205a1ec  fix  (babenek, Jul 11, 2024)
390e291  reduce debug  (babenek, Jul 11, 2024)
aed2a8e  fix rollback  (babenek, Jul 11, 2024)
a79ad7a  providers_map  (babenek, Jul 11, 2024)
94d6448  Rollback extra cond.dbg  (babenek, Jul 11, 2024)
d3cc1eb  some tests  (babenek, Jul 11, 2024)
97cdc6e  filter path  (babenek, Jul 12, 2024)
68ea464  hashed values in report  (babenek, Jul 12, 2024)
7063dce  fix warnings  (babenek, Jul 12, 2024)
643dc1c  sanitize value for \n \r  (babenek, Jul 12, 2024)
a9cf4cc  testfix  (babenek, Jul 12, 2024)
020a3c8  rollback  (babenek, Jul 12, 2024)
9089bf2  update BM scores  (babenek, Jul 12, 2024)
d47026a  rollback extra changes  (babenek, Jul 12, 2024)
b7ff354  custom BM  (babenek, Jul 12, 2024)
052da8a  dbg for windows  (babenek, Jul 12, 2024)
8067a76  dbg for windows  (babenek, Jul 12, 2024)
77620c7  dbg for windows 2  (babenek, Jul 12, 2024)
20972e8  dbg for windows 3  (babenek, Jul 12, 2024)
6edde59  'utf-8'  (babenek, Jul 12, 2024)
908c820  str  (babenek, Jul 12, 2024)
01f2e95  test4  (babenek, Jul 12, 2024)
2363f27  upd  (babenek, Jul 12, 2024)
b4717b6  fix  (babenek, Jul 13, 2024)
87083d2  fix 2  (babenek, Jul 13, 2024)
c11784f  [skip actions] [main] 2024-07-13T17:53:24+03:00  (babenek, Jul 13, 2024)
e7dd957  [skip actions] [main] 2024-07-14T08:17:27+03:00  (babenek, Jul 14, 2024)
45a0712  [skip actions] [main] 2024-07-14T08:18:51+03:00  (babenek, Jul 14, 2024)
31e74e2  --subtext and --hashed arguments to reduce size of report  (babenek, Jul 14, 2024)
450c3ac  fix 2  (babenek, Jul 14, 2024)
33a82a2  merge  (babenek, Jul 14, 2024)
ac1ee59  [skip actions] [multiproc] 2024-07-14T08:59:49+03:00  (babenek, Jul 14, 2024)
e01370d  Merge branch 'multiproc' into main  (babenek, Jul 14, 2024)
8f50cd0  multiproc  (babenek, Jul 14, 2024)
ddf4b59  Merge branch 'multiproc' into main  (babenek, Jul 14, 2024)
4e8fcf0  [skip actions] [multiproc] 2024-07-14T09:25:19+03:00  (babenek, Jul 14, 2024)
84008ad  Merge branch 'multiproc' into main  (babenek, Jul 14, 2024)
6f9c21d  [skip actions] [main] 2024-07-14T09:29:50+03:00  (babenek, Jul 14, 2024)
3e47a61  tests  (babenek, Jul 14, 2024)
9953e2a  [skip actions] [main] 2024-07-14T09:48:11+03:00  (babenek, Jul 14, 2024)
23e45b0  Merge branch 'subhashtext' into main  (babenek, Jul 14, 2024)
148f2ab  --subtext and --hashed arguments to reduce size of report  (babenek, Jul 14, 2024)
ea27448  tests  (babenek, Jul 14, 2024)
c41d3f3  Merge branch 'subhashtext' into main  (babenek, Jul 14, 2024)
b122bf3  --subtext and --hashed arguments to reduce size of report  (babenek, Jul 14, 2024)
4fcd8e5  Merge branch 'subhashtext' into main  (babenek, Jul 14, 2024)
109b0df  failured testcase  (babenek, Jul 14, 2024)
29dedf5  bugfix applied  (babenek, Jul 14, 2024)
6adb2bb  Merge branch 'bugfix' into main  (babenek, Jul 14, 2024)
983f247  Merge branch 'main' into multiproc  (babenek, Jul 14, 2024)
83e2409  yapf style fix  (babenek, Jul 14, 2024)
2bd908b  Merge branch 'multiproc' into main  (babenek, Jul 14, 2024)
859271c  upd whatthepatch-1.0.6  (babenek, Jul 14, 2024)
14 changes: 9 additions & 5 deletions .github/workflows/benchmark.yml
@@ -22,7 +22,8 @@ jobs:
- name: Checkout CredData
uses: actions/checkout@v4
with:
repository: Samsung/CredData
repository: babenek/CredData
ref: valsanitizer

- name: Markup hashing
run: |
@@ -72,7 +73,8 @@ jobs:
- name: Checkout CredData
uses: actions/checkout@v4
with:
repository: Samsung/CredData
repository: babenek/CredData
ref: valsanitizer

- name: Markup hashing
run: |
@@ -126,7 +128,7 @@ jobs:

- name: Run CredSweeper tool
run: |
credsweeper --banner --jobs $(nproc) --path data --save-json report.${{ github.event.pull_request.head.sha }}.json | tee credsweeper.${{ github.event.pull_request.head.sha }}.log
credsweeper --banner --log info --jobs $(nproc) --path data --save-json report.${{ github.event.pull_request.head.sha }}.json | tee credsweeper.${{ github.event.pull_request.head.sha }}.log

- name: Run Benchmark
run: |
@@ -174,7 +176,8 @@ jobs:
- name: Checkout CredData
uses: actions/checkout@v4
with:
repository: Samsung/CredData
repository: babenek/CredData
ref: valsanitizer

- name: Markup hashing
run: |
@@ -355,7 +358,8 @@ jobs:
- name: Checkout CredData
uses: actions/checkout@v4
with:
repository: Samsung/CredData
repository: babenek/CredData
ref: valsanitizer

- name: Markup hashing
run: |
7 changes: 7 additions & 0 deletions credsweeper/__main__.py
@@ -215,6 +215,11 @@ def get_arguments() -> Namespace:
const="output.xlsx",
dest="xlsx_filename",
metavar="PATH")
parser.add_argument("--subtext", help="only part of text will be outputted", action="store_const", const=True)
parser.add_argument("--hashed",
help="line, variable, value will be hashed in output",
action="store_const",
const=True)
parser.add_argument("--sort", help="enable output sorting", dest="sort_output", action="store_true")
parser.add_argument("--log",
"-l",
@@ -282,6 +287,8 @@ def scan(args: Namespace, content_provider: AbstractProvider, json_filename: Opt
api_validation=args.api_validation,
json_filename=json_filename,
xlsx_filename=xlsx_filename,
subtext=args.subtext,
hashed=args.hashed,
sort_output=args.sort_output,
use_filters=args.no_filters,
pool_count=args.jobs,
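The diff registers both new report options with `action="store_const"` rather than the more common `store_true`, so an omitted flag parses as `None` instead of `False`. A minimal standalone sketch of just these two arguments (the rest of the parser is trimmed):

```python
from argparse import ArgumentParser

parser = ArgumentParser(prog="credsweeper")
parser.add_argument("--subtext", help="only part of text will be outputted", action="store_const", const=True)
parser.add_argument("--hashed",
                    help="line, variable, value will be hashed in output",
                    action="store_const",
                    const=True)

args = parser.parse_args(["--hashed"])
print(args.hashed)   # True
print(args.subtext)  # None: store_const defaults to None, not False, when the flag is absent
```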
49 changes: 33 additions & 16 deletions credsweeper/app.py
@@ -1,8 +1,6 @@
import itertools
import logging
import multiprocessing
import signal
import sys
from pathlib import Path
from typing import Any, List, Optional, Union, Dict, Sequence, Tuple

@@ -44,6 +42,8 @@ def __init__(self,
api_validation: bool = False,
json_filename: Union[None, str, Path] = None,
xlsx_filename: Union[None, str, Path] = None,
subtext: bool = False,
hashed: bool = False,
sort_output: bool = False,
use_filters: bool = True,
pool_count: int = 1,
@@ -72,6 +72,8 @@ def __init__(self,
to json
xlsx_filename: optional string variable, path to save result
to xlsx
subtext: use subtext of line near value like it performed in ML
hashed: use hash of line, value and variable instead plain text
use_filters: boolean variable, specifying the need of rule filters
pool_count: int value, number of parallel processes to use
ml_batch_size: int value, size of the batch for model inference
@@ -106,6 +108,8 @@ def __init__(self,
self.credential_manager = CredentialManager()
self.json_filename: Union[None, str, Path] = json_filename
self.xlsx_filename: Union[None, str, Path] = xlsx_filename
self.subtext = subtext
self.hashed = hashed
self.sort_output = sort_output
self.ml_batch_size = ml_batch_size if ml_batch_size and 0 < ml_batch_size else 16
self.ml_threshold = ml_threshold
@@ -253,10 +257,7 @@ def scan(self, content_providers: Sequence[Union[DiffContentProvider, TextConten

def __single_job_scan(self, content_providers: Sequence[Union[DiffContentProvider, TextContentProvider]]) -> None:
"""Performs scan in main thread"""
all_cred: List[Candidate] = []
for i in content_providers:
candidates = self.file_scan(i)
all_cred.extend(candidates)
all_cred = self.files_scan(content_providers)
if self.config.api_validation:
api_validation = ApplyValidation()
for cred in all_cred:
@@ -278,24 +279,39 @@ def __multi_jobs_scan(self, content_providers: Sequence[Union[DiffContentProvide
if "SILENCE" == self.__log_level:
logging.addLevelName(60, "SILENCE")
log_kwargs["level"] = self.__log_level
# providers_map: List[Sequence[Union[DiffContentProvider, TextContentProvider]]] = \
# [content_providers[x::self.pool_count] for x in range(self.pool_count)]
with multiprocessing.get_context("spawn").Pool(processes=self.pool_count,
initializer=self.pool_initializer,
initargs=(log_kwargs, )) as pool:
try:
# Get list credentials for each file
scan_results_per_file = pool.map(self.file_scan, content_providers)
# Join all sublist into a single list
scan_results = list(itertools.chain(*scan_results_per_file))
for cred in scan_results:
self.credential_manager.add_credential(cred)
for scan_results in pool.imap_unordered(self.files_scan, (content_providers[x::self.pool_count]
for x in range(self.pool_count))):
for cred in scan_results:
self.credential_manager.add_credential(cred)
if self.config.api_validation:
logger.info("Run API Validation")
api_validation = ApplyValidation()
api_validation.validate_credentials(pool, self.credential_manager)
except KeyboardInterrupt:
pool.terminate()
pool.join()
sys.exit()
raise
pool.close()
pool.join()

# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #

def files_scan(
self, #
content_providers: Sequence[Union[DiffContentProvider, TextContentProvider]]) -> List[Candidate]:
"""Auxiliary method for scan one sequence"""
all_cred: List[Candidate] = []
for i in content_providers:
candidates = self.file_scan(i)
all_cred.extend(candidates)
logger.info(f"Completed: processed {len(content_providers)} providers with {len(all_cred)} candidates")
return all_cred

# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
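Both the commented-out `providers_map` hint and the `imap_unordered` call above rely on the same trick: the extended slice `content_providers[x::self.pool_count]` deals providers out round-robin, so each worker process receives one whole chunk for `files_scan` instead of one provider per task. A small illustration with strings standing in for content providers:

```python
# Round-robin chunking as fed to pool.imap_unordered(self.files_scan, ...)
providers = ["p0", "p1", "p2", "p3", "p4"]
pool_count = 2

chunks = [providers[x::pool_count] for x in range(pool_count)]
print(chunks)  # [['p0', 'p2', 'p4'], ['p1', 'p3']]

# every provider lands in exactly one chunk, so no work is lost or duplicated
flat = sorted(p for chunk in chunks for p in chunk)
print(flat == sorted(providers))  # True
```

This produces exactly `pool_count` tasks, which cuts inter-process serialization overhead compared with mapping over thousands of individual providers.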

@@ -390,16 +406,17 @@ def export_results(self) -> None:

if self.json_filename:
is_exported = True
Util.json_dump([credential.to_json() for credential in credentials], file_path=self.json_filename)
Util.json_dump([credential.to_json(subtext=self.subtext, hashed=self.hashed) for credential in credentials],
file_path=self.json_filename)

if self.xlsx_filename:
is_exported = True
data_list = []
for credential in credentials:
data_list.extend(credential.to_dict_list())
data_list.extend(credential.to_dict_list(subtext=self.subtext, hashed=self.hashed))
df = pd.DataFrame(data=data_list)
df.to_excel(self.xlsx_filename, index=False)

if is_exported is False:
for credential in credentials:
print(credential)
print(credential.to_str(subtext=self.subtext, hashed=self.hashed))
2 changes: 1 addition & 1 deletion credsweeper/common/constants.py
@@ -5,7 +5,7 @@

class KeywordPattern:
"""Pattern set of keyword types"""
key_left = r"(?P<variable>(([`'\"]+[^:='\"`}<>\\/&?]*|[^:='\"`}<>\s()\\/&?]*)" \
key_left = r"(\\[nrt])?(?P<variable>(([`'\"]+[^:='\"`}<>\\/&?]*|[^:='\"`}<>\s()\\/&?]*)" \
r"(?P<keyword>"
# there will be inserted a keyword
key_right = r")" \
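The added `(\\[nrt])?` group lets the keyword pattern consume a literal `\n`, `\r`, or `\t` escape sequence glued in front of the variable, as happens in single-line logs and escaped strings. A toy analogue showing only that prefix behavior (the real `key_left`/`key_right` parts are far stricter; this simplified pattern is illustrative only):

```python
import re

# toy analogue: optional escaped line-ending, then a variable containing the keyword,
# a separator, and a value
pattern = re.compile(r"(\\[nrt])?(?P<variable>\w*password\w*)\s*=\s*(?P<value>\w+)")

m = pattern.search(r"...\npassword = qwerty123")  # literal backslash-n, not a newline
print(m.group(1))     # \n  -- the consumed escape sequence
print(m["variable"])  # password
print(m["value"])     # qwerty123
```

Capturing the escape explicitly keeps it out of the `variable` group, so downstream position bookkeeping starts at the real variable name.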
19 changes: 13 additions & 6 deletions credsweeper/credentials/candidate.py
@@ -88,15 +88,22 @@ def is_api_validation_available(self) -> bool:
"""
return len(self.validations) > 0

def __str__(self) -> str:
def to_str(self, subtext: bool = False, hashed: bool = False) -> str:
"""Represent candidate with subtext or|and hashed values"""
return f"rule: {self.rule_name}" \
f" | severity: {self.severity.value}" \
f" | confidence: {self.confidence.value}" \
f" | line_data_list: {self.line_data_list}" \
f" | line_data_list: [{', '.join([x.to_str(subtext, hashed) for x in self.line_data_list])}]" \
f" | api_validation: {self.api_validation.name}" \
f" | ml_validation: {self.ml_validation.name}"

def to_json(self) -> Dict:
def __str__(self):
return self.to_str()

def __repr__(self):
return self.to_str(subtext=True)

def to_json(self, subtext: bool, hashed: bool) -> Dict:
"""Convert credential candidate object to dictionary.

Return:
@@ -113,23 +120,23 @@ def to_json(self) -> Dict:
"confidence": self.confidence.value,
"use_ml": self.use_ml,
# put the array to end to make json more readable
"line_data_list": [line_data.to_json() for line_data in self.line_data_list],
"line_data_list": [line_data.to_json(subtext, hashed) for line_data in self.line_data_list],
}
if self.config is not None:
reported_output = {k: v for k, v in full_output.items() if k in self.config.candidate_output}
else:
reported_output = full_output
return reported_output

def to_dict_list(self) -> List[dict]:
def to_dict_list(self, subtext: bool, hashed: bool) -> List[dict]:
"""Convert credential candidate object to List[dict].

Return:
List[dict] object generated from current credential candidate

"""
reported_output = []
json_output = self.to_json()
json_output = self.to_json(subtext, hashed)
refined_data = copy.deepcopy(json_output)
del refined_data["line_data_list"]
for line_data in json_output["line_data_list"]:
45 changes: 35 additions & 10 deletions credsweeper/credentials/line_data.py
@@ -1,10 +1,11 @@
import contextlib
import hashlib
import re
import string
from functools import cached_property
from typing import Any, Dict, Optional, Tuple

from credsweeper.common.constants import MAX_LINE_LENGTH
from credsweeper.common.constants import MAX_LINE_LENGTH, UTF_8, ML_HUNK
from credsweeper.config import Config
from credsweeper.utils import Util
from credsweeper.utils.entropy_validator import EntropyValidator
@@ -31,6 +32,7 @@ class LineData:
quotation_marks = ('"', "'", '`')
comment_starts = ("//", "* ", "#", "/*", "<!––", "%{", "%", "...", "(*", "--", "--[[", "#=")
bash_param_split = re.compile("\\s+(\\-|\\||\\>|\\w+?\\>|\\&)")
line_endings = re.compile(r"\\{1,8}[nr]")
url_param_split = re.compile(r"(%|\\u(00){0,2})(26|3f)", flags=re.IGNORECASE)
# some symbols e.g. double quotes cannot be in URL string https://www.ietf.org/rfc/rfc1738.txt
# \ - was added for case of url in escaped string \u0026amp; - means escaped & in HTML
@@ -180,6 +182,10 @@ def clean_bash_parameters(self) -> None:
# and value can be split by bash special characters
if len(value_spl) > 1:
self.value = value_spl[0]
if ' ' not in self.value and ("\\n" in self.value or "\\r" in self.value):
value_whsp = self.line_endings.split(self.value)
if len(value_whsp) > 1:
self.value = value_whsp[0]

def sanitize_variable(self) -> None:
"""Remove trailing spaces, dashes and quotations around the variable. Correct position."""
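The new `line_endings` branch in `clean_bash_parameters` trims a value onto which a literal `\n` or `\r` escape has glued trailing shell text; note the production code only applies the split when the value contains no spaces but does contain such an escape. The regex in isolation:

```python
import re

line_endings = re.compile(r"\\{1,8}[nr]")  # literal \n or \r, possibly multiply-escaped

value = r"qwerty123\nexport"  # secret captured together with the next shell token
parts = line_endings.split(value)
print(parts[0])  # qwerty123
```

The `\\{1,8}` quantifier also catches values that went through several rounds of escaping (e.g. `\\\\n`).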
@@ -282,11 +288,29 @@ def is_source_file_with_quotes(self) -> bool:
return True
return False

def __repr__(self) -> str:
return f"line: '{self.line}' | line_num: {self.line_num} | path: {self.path}" \
f" | value: '{self.value}' | entropy_validation: {EntropyValidator(self.value)}"

def to_json(self) -> Dict:
@staticmethod
def get_subtext_or_hash(text: Optional[str], pos: int, subtext: bool, hashed: bool) -> Optional[str]:
"""Represent a text with subtext or|and hash if required"""
text = Util.subtext(text, pos, ML_HUNK) if subtext and text is not None else text
if hashed:
# text = hashlib.sha256(text.encode(UTF_8, errors="replace")).hexdigest() if text is not None else None
text = hashlib.sha256(text.encode(UTF_8, errors="strict")).hexdigest() if text is not None else None
return text

def to_str(self, subtext: bool = False, hashed: bool = False) -> str:
"""Represent line_data with subtext or|and hashed values"""
return f"line: '{self.get_subtext_or_hash(self.line, self.value_start, subtext, hashed)}'" \
f" | line_num: {self.line_num} | path: {self.path}" \
f" | value: '{self.get_subtext_or_hash(self.value, 0, subtext, hashed)}'" \
f" | entropy_validation: {EntropyValidator(self.value)}"

def __str__(self):
return self.to_str()

def __repr__(self):
return self.to_str(subtext=True)

def to_json(self, subtext: bool, hashed: bool) -> Dict:
"""Convert line data object to dictionary.

Return:
@@ -295,18 +319,19 @@ def to_json(self) -> Dict:
"""
full_output = {
"key": self.key,
"line": self.line,
"line": self.get_subtext_or_hash(self.line, self.value_start, subtext, hashed),
"line_num": self.line_num,
"path": self.path,
"info": self.info,
# info may contain variable name - so let it be hashed if requested
"info": hashlib.sha256(self.info.encode(UTF_8)).hexdigest() if hashed and self.info else self.info,
"pattern": self.pattern.pattern,
"separator": self.separator,
"separator_start": self.separator_start,
"separator_end": self.separator_end,
"value": self.value,
"value": self.get_subtext_or_hash(self.value, 0, subtext, hashed),
"value_start": self.value_start,
"value_end": self.value_end,
"variable": self.variable,
"variable": self.get_subtext_or_hash(self.variable, 0, subtext, hashed),
"variable_start": self.variable_start,
"variable_end": self.variable_end,
"value_leftquote": self.value_leftquote,
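`get_subtext_or_hash` drives both report options: `--subtext` trims the line around the value position via `Util.subtext`, and `--hashed` replaces line, variable, and value with SHA-256 hex digests so a report can be shared without leaking the secrets themselves. The hashing branch is easy to reproduce standalone (a sketch: `UTF_8` is assumed to be the constant `"utf-8"`, and `hash_text` is an illustrative name):

```python
import hashlib
from typing import Optional

UTF_8 = "utf-8"  # assumed value of credsweeper.common.constants.UTF_8

def hash_text(text: Optional[str]) -> Optional[str]:
    # mirrors the hashed branch of LineData.get_subtext_or_hash
    if text is None:
        return None
    return hashlib.sha256(text.encode(UTF_8, errors="strict")).hexdigest()

digest = hash_text("qwerty123")
print(len(digest))      # 64 hex characters
print(hash_text(None))  # None is passed through, matching optional fields like variable
```

Equal inputs always hash to the same digest, so markup tooling can still correlate findings across reports without seeing plaintext values.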
9 changes: 6 additions & 3 deletions credsweeper/deep_scanner/bzip2_scanner.py
@@ -1,6 +1,7 @@
import bz2
import logging
from abc import ABC
from pathlib import Path
from typing import List

from credsweeper.credentials import Candidate
@@ -22,10 +23,12 @@ def data_scan(
"""Extracts data from bzip2 archive and launches data_scan"""
candidates = []
try:
new_path = data_provider.file_path if ".bz2" != Util.get_extension(
data_provider.file_path) else data_provider.file_path[:-4]
file_path = Path(data_provider.file_path)
new_path = file_path.as_posix()
if ".bz2" == file_path.suffix:
new_path = new_path[:-4]
bzip2_content_provider = DataContentProvider(data=bz2.decompress(data_provider.data),
file_path=data_provider.file_path,
file_path=new_path,
file_type=Util.get_extension(new_path),
info=f"{data_provider.info}|BZIP2|{new_path}")
new_limit = recursive_limit_size - len(bzip2_content_provider.data)
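The bzip2 scanner now derives the inner file name by normalizing the path with `pathlib` and dropping a trailing `.bz2` suffix, so the decompressed content is scanned under the inner file's extension (e.g. `report.json.bz2` is treated as JSON). That path logic in isolation (the helper name is illustrative):

```python
from pathlib import Path

def inner_bz2_path(file_path: str) -> str:
    # normalize separators to POSIX style and strip a trailing ".bz2"
    path = Path(file_path)
    new_path = path.as_posix()
    if path.suffix == ".bz2":
        new_path = new_path[:-4]
    return new_path

print(inner_bz2_path("data/report.json.bz2"))  # data/report.json
print(inner_bz2_path("data/plain.txt"))        # data/plain.txt (unrelated suffix untouched)
```

Using `Path.suffix` instead of string slicing on the raw path also fixes the case where the extension check and the truncation disagreed on separators across platforms.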
3 changes: 2 additions & 1 deletion credsweeper/deep_scanner/deep_scanner.py
@@ -1,5 +1,6 @@
import datetime
import logging
from pathlib import Path
from typing import List, Optional, Any, Tuple, Union

from credsweeper.common.constants import RECURSIVE_SCAN_LIMITATION
@@ -136,7 +137,7 @@ def scan(self,
data_provider = DataContentProvider(data=data,
file_path=content_provider.file_path,
file_type=content_provider.file_type,
info=content_provider.file_path)
info=Path(content_provider.file_path).as_posix())
# iterate for all possibly scanner methods WITHOUT ByteContentProvider for TextContentProvider
scanner_classes = self.get_deep_scanners(data, content_provider.file_type)
for scan_class in scanner_classes: