improvements

Samsung · Jul 16, 2024 · d5c64c4 · d5c64c4
1 parent 0f86c96
commit d5c64c4
Show file tree

Hide file tree

Showing 9 changed files with 60 additions and 28 deletions.
diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml
@@ -22,7 +22,8 @@ jobs:
       - name: Checkout CredData
         uses: actions/checkout@v4
         with:
-          repository: Samsung/CredData
+          repository: babenek/CredData
+          ref: valsanitizer
 
       - name: Markup hashing
         run: |
@@ -72,7 +73,8 @@ jobs:
       - name: Checkout CredData
         uses: actions/checkout@v4
         with:
-          repository: Samsung/CredData
+          repository: babenek/CredData
+          ref: valsanitizer
 
       - name: Markup hashing
         run: |
@@ -169,7 +171,8 @@ jobs:
       - name: Checkout CredData
         uses: actions/checkout@v4
         with:
-          repository: Samsung/CredData
+          repository: babenek/CredData
+          ref: valsanitizer
 
       - name: Markup hashing
         run: |
@@ -350,7 +353,8 @@ jobs:
       - name: Checkout CredData
         uses: actions/checkout@v4
         with:
-          repository: Samsung/CredData
+          repository: babenek/CredData
+          ref: valsanitizer
 
       - name: Markup hashing
         run: |

diff --git a/credsweeper/common/constants.py b/credsweeper/common/constants.py
@@ -5,7 +5,7 @@
 
 class KeywordPattern:
     """Pattern set of keyword types"""
-    key_left = r"(?P<variable>(([`'\"]+[^:='\"`}<>\\/&?]*|[^:='\"`}<>\s()\\/&?]*)" \
+    key_left = r"(\\[nrt])?(?P<variable>(([`'\"]+[^:='\"`}<>\\/&?]*|[^:='\"`}<>\s()\\/&?]*)" \
                r"(?P<keyword>"
     # there will be inserted a keyword
     key_right = r")" \

diff --git a/credsweeper/credentials/line_data.py b/credsweeper/credentials/line_data.py
@@ -32,6 +32,7 @@ class LineData:
     quotation_marks = ('"', "'", '`')
     comment_starts = ("//", "* ", "#", "/*", "<!––", "%{", "%", "...", "(*", "--", "--[[", "#=")
     bash_param_split = re.compile("\\s+(\\-|\\||\\>|\\w+?\\>|\\&)")
+    line_endings = re.compile(r"\\{1,8}[nr]")
     url_param_split = re.compile(r"(%|\\u(00){0,2})(26|3f)", flags=re.IGNORECASE)
     # some symbols e.g. double quotes cannot be in URL string https://www.ietf.org/rfc/rfc1738.txt
     # \ - was added for case of url in escaped string \u0026amp; - means escaped & in HTML
@@ -181,6 +182,10 @@ def clean_bash_parameters(self) -> None:
             #  and value can be split by bash special characters
             if len(value_spl) > 1:
                 self.value = value_spl[0]
+        if ' ' not in self.value and ("\\n" in self.value or "\\r" in self.value):
+            value_whsp = self.line_endings.split(self.value)
+            if len(value_whsp) > 1:
+                self.value = value_whsp[0]
 
     def sanitize_variable(self) -> None:
         """Remove trailing spaces, dashes and quotations around the variable. Correct position."""

diff --git a/credsweeper/filters/value_file_path_check.py b/credsweeper/filters/value_file_path_check.py
@@ -12,6 +12,9 @@ class ValueFilePathCheck(Filter):
     Check if a value contains either '/' or ':\' separators (but not both)
     and do not have any special characters ( !$@`&*()+)
     """
+    base64_possible_set = set(Chars.BASE64_CHARS.value) | set(Chars.BASE64URL_CHARS.value)
+    unusual_windows_symbols_in_path = "\t\n\r !$@`&*()[]{}<>+=;,~"
+    unusual_linux_symbols_in_path = unusual_windows_symbols_in_path + ":\\"
 
     def __init__(self, config: Config = None) -> None:
         pass
@@ -30,25 +33,32 @@ def run(self, line_data: LineData, target: AnalysisTarget) -> bool:
         value = line_data.value
         contains_unix_separator = '/' in value
         if contains_unix_separator:
+            if "://" in value or value.startswith("~/") or value.startswith("./") or "../" in value or "/.." in value:
+                # common case for url definition or aliases
+                return True
             # base64 encoded data might look like linux path
             min_entropy = ValueEntropyBase64Check.get_min_data_entropy(len(value))
             # get minimal entropy to compare with shannon entropy of found value
             # min_entropy == 0 means that the value cannot be checked with the entropy due high variance
-            if 0 == min_entropy or min_entropy > Util.get_shannon_entropy(value, Chars.BASE64STD_CHARS.value):
-                for i in value:
-                    if i not in Chars.BASE64STD_CHARS.value:
-                        # value contains wrong BASE64STD_CHARS symbols
-                        break
-                else:
-                    # all symbols are from base64 alphabet
-                    contains_unix_separator = 1 < value.count('/')
+            for i in value:
+                if i not in self.base64_possible_set:
+                    # value contains a symbol outside base64_possible_set (base64 and base64url alphabets), e.g. '.'
+                    break
             else:
-                # high entropy means base64 encoded data
-                contains_unix_separator = False
+                # all symbols are from base64 alphabet
+                entropy = Util.get_shannon_entropy(value, Chars.BASE64STD_CHARS.value)
+                if 0 == min_entropy or min_entropy > entropy:
+                    contains_unix_separator = 1 < value.count('/')
+                else:
+                    # high entropy means base64 encoded data
+                    contains_unix_separator = False
+
             # low shannon entropy points that the value maybe not a high randomized value in base64
         contains_windows_separator = ':\\' in value
         if contains_unix_separator or contains_windows_separator:
-            for i in " !$@`&*()[]{}+=;,":
+            unusual_symbols_in_path = self.unusual_linux_symbols_in_path if contains_unix_separator \
+                else self.unusual_windows_symbols_in_path
+            for i in unusual_symbols_in_path:
                 if i in value:
                     # the symbols which not passed in a path usually
                     break

diff --git a/credsweeper/ml_model/ml_validator.py b/credsweeper/ml_model/ml_validator.py
@@ -220,8 +220,9 @@ def validate_groups(self, group_list: List[Tuple[CandidateKey, List[Candidate]]]
             probability[head:tail] = self._batch_call_model(line_input_list, variable_input_list, value_input_list,
                                                             features_list)
         is_cred = probability > self.threshold
-        for i in range(len(is_cred)):
-            logger.debug("ML decision: %s with prediction: %s for value: %s", is_cred[i], round(probability[i], 8),
-                         group_list[i][0])
+        if logger.isEnabledFor(logging.DEBUG):
+            for i in range(len(is_cred)):
+                logger.debug("ML decision: %s with prediction: %s for value: %s", is_cred[i], probability[i],
+                             group_list[i][0])
         # apply cast to float to avoid json export issue
         return is_cred, probability.astype(float)
diff --git a/requirements.txt b/requirements.txt
@@ -13,7 +13,7 @@ PyYAML==6.0.1
 python-docx==1.1.0
 requests==2.32.0
 typing_extensions==4.9.0
-whatthepatch==1.0.5
+whatthepatch==1.0.6
 pdfminer.six==20231228
 password-strength==0.0.3.post2
 python-dateutil==2.8.2

diff --git a/tests/data/depth_3.json b/tests/data/depth_3.json
@@ -7718,7 +7718,7 @@
     {
         "api_validation": "NOT_AVAILABLE",
         "ml_validation": "VALIDATED_KEY",
-        "ml_probability": 0.78,
+        "ml_probability": 0.756,
         "rule": "Github Old Token",
         "severity": "high",
         "confidence": "moderate",
@@ -7745,7 +7745,7 @@
     {
         "api_validation": "NOT_AVAILABLE",
         "ml_validation": "VALIDATED_KEY",
-        "ml_probability": 0.78,
+        "ml_probability": 0.756,
         "rule": "Token",
         "severity": "medium",
         "confidence": "moderate",
@@ -7758,8 +7758,8 @@
                 "value": "gireogicracklecrackle1231567190113413981",
                 "value_start": 15,
                 "value_end": 55,
-                "variable": "ngit_token",
-                "variable_start": 1,
+                "variable": "git_token",
+                "variable_start": 2,
                 "variable_end": 11,
                 "entropy_validation": {
                     "iterator": "BASE36_CHARS",

diff --git a/tests/filters/test_value_file_path_check.py b/tests/filters/test_value_file_path_check.py
@@ -7,18 +7,24 @@
 
 class TestValueFilePathCheck:
 
-    def test_value_file_path_check_p(self, file_path: pytest.fixture, success_line: pytest.fixture) -> None:
-        line_data = get_line_data(file_path, line=success_line, pattern=LINE_VALUE_PATTERN)
+    @pytest.mark.parametrize("line", [
+        "5//0KCPafDhZvtCwqrsyiKFeDGT_0ZGHiI-E0ClIWrLC7tZ1WE5vHc4-Y2qi1IhPy3Pz5fmCe9OPIxEZUONUg7SWJF9nwQ_j2lIdXU0",
+    ])
+    def test_value_file_path_check_p(self, file_path: pytest.fixture, line: str) -> None:
+        line_data = get_line_data(file_path, line=line, pattern=LINE_VALUE_PATTERN)
         assert ValueFilePathCheck().run(line_data, DUMMY_ANALYSIS_TARGET) is False
 
     @pytest.mark.parametrize(
         "line",
         [
+            "crackle/filepath.txt",
             "/home/user/tmp",  # simple path
             "../..",  # path
+            "dir/..",  # path
+            "../dir",  # path
             "file:///Crackle/filepath/",  # path from browser url
             "~/.custompass",  # path with synonym
-            "crackle/filepath_txt",
+            "./sshpass.sh",  # path with synonym
             "crackle/file.path",  #
             "C:\\Crackle\\filepath",  #
         ])

diff --git a/tests/test_main.py b/tests/test_main.py
@@ -1,5 +1,4 @@
 import io
-import io
 import os
 import random
 import shutil
@@ -806,6 +805,13 @@ def test_param_n(self) -> None:
     def test_param_p(self) -> None:
         # internal parametrized tests for quick debug
         items = [  #
+            ("slt.py", b'\\t\\tsalt = "\\x187bhgerjhqw\\n iKa\\tW_R~0/8"', "salt", "\\x187bhgerjhqw\\n iKa\\tW_R~0/8"),
+            ("log.txt",
+             b'json\\nAuthorization: Basic jfhlksadjiu9813ryiuhdfskadjlkjh34\\n\\u003c/code\\u003e\\u003c/pre\\u003e"',
+             "Authorization", "jfhlksadjiu9813ryiuhdfskadjlkjh34"),
+            ("pwd.py", b'password = "ji3_8iKgaW_R~0/8"', "password", "ji3_8iKgaW_R~0/8"),
+            ("pwd.py", b'password = "/_tcTz<D8sWXsW<E"', "password", "/_tcTz<D8sWXsW<E"),
+            ("pwd.py", b'password = "I:FbCnXQc/9E02Il"', "password", "I:FbCnXQc/9E02Il"),
             ("url_part.py", b'39084?token=3487263-2384579834-234732875-345&key=DnBeiGdgy6253fytfdDHGg&hasToBeFound=2',
              'token', '3487263-2384579834-234732875-345'),
             ("prod.py", b"secret_api_key='Ahga%$FiQ@Ei8'", "secret_api_key", "Ahga%$FiQ@Ei8"),  #