Merge branch 'main' into babenek-patch-1

Samsung · Jul 16, 2024 · 0a3b8d2 · 0a3b8d2
2 parents 3f8c45d + 16dd8ac
commit 0a3b8d2
Show file tree

Hide file tree

Showing 22 changed files with 585 additions and 136 deletions.
diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml
@@ -114,19 +114,14 @@ jobs:
           ref: ${{ github.event.pull_request.head.sha }}
           path: temp/CredSweeper
 
-      - name: Patch benchmark for PR work
-        run: |
-          sed -i 's|CREDSWEEPER = "https://github.com/Samsung/CredSweeper.git"|CREDSWEEPER = "dummy://github.com/Samsung/CredSweeper.git"|' benchmark/common/constants.py
-          grep --with-filename --line-number 'dummy://github.com/Samsung/CredSweeper.git' benchmark/common/constants.py
-
       - name: Install CredSweeper
         run: |
           python -m pip install temp/CredSweeper
-          credsweeper_head=
+          python -m credsweeper --banner
 
       - name: Run CredSweeper tool
         run: |
-          credsweeper --banner --jobs $(nproc) --path data --save-json report.${{ github.event.pull_request.head.sha }}.json | tee credsweeper.${{ github.event.pull_request.head.sha }}.log
+          credsweeper --banner --log info --jobs $(nproc) --path data --save-json report.${{ github.event.pull_request.head.sha }}.json | tee credsweeper.${{ github.event.pull_request.head.sha }}.log
 
       - name: Run Benchmark
         run: |

diff --git a/.github/workflows/check.yml b/.github/workflows/check.yml
@@ -187,9 +187,9 @@ jobs:
             file_crc32_int=$((16#${file_crc32_hex}))
             crc32_int=$(( ${crc32_int} ^ ${file_crc32_int} ))
             done
-        version_with_crc="$(credsweeper --version | head -1) crc32:$(printf '%x' ${crc32_int})"
+        version_with_crc="$(python -m credsweeper --version | head -1) crc32:$(printf '%x' ${crc32_int})"
         echo "version_with_crc = '${version_with_crc}'"
-        banner=$(credsweeper --banner --path requirements.txt | head -1)
+        banner=$(python -m credsweeper --banner | head -1)
         echo "banner = '${banner}'"
         if ! [ -n "${version_with_crc}" ] && [ -n "${banner}" ] && [ "${version_with_crc}" == "${banner}" ]; then
             echo "'${version_with_crc}' != '${banner}'"

diff --git a/cicd/benchmark.txt b/cicd/benchmark.txt
diff --git a/credsweeper/app.py b/credsweeper/app.py
@@ -1,8 +1,6 @@
-import itertools
 import logging
 import multiprocessing
 import signal
-import sys
 from pathlib import Path
 from typing import Any, List, Optional, Union, Dict, Sequence, Tuple
 
@@ -253,10 +251,7 @@ def scan(self, content_providers: Sequence[Union[DiffContentProvider, TextConten
 
     def __single_job_scan(self, content_providers: Sequence[Union[DiffContentProvider, TextContentProvider]]) -> None:
         """Performs scan in main thread"""
-        all_cred: List[Candidate] = []
-        for i in content_providers:
-            candidates = self.file_scan(i)
-            all_cred.extend(candidates)
+        all_cred = self.files_scan(content_providers)
         if self.config.api_validation:
             api_validation = ApplyValidation()
             for cred in all_cred:
@@ -278,24 +273,39 @@ def __multi_jobs_scan(self, content_providers: Sequence[Union[DiffContentProvide
             if "SILENCE" == self.__log_level:
                 logging.addLevelName(60, "SILENCE")
             log_kwargs["level"] = self.__log_level
+        # providers_map: List[Sequence[Union[DiffContentProvider, TextContentProvider]]] = \
+        #     [content_providers[x::self.pool_count] for x in range(self.pool_count)]
         with multiprocessing.get_context("spawn").Pool(processes=self.pool_count,
                                                        initializer=self.pool_initializer,
                                                        initargs=(log_kwargs, )) as pool:
             try:
-                # Get list credentials for each file
-                scan_results_per_file = pool.map(self.file_scan, content_providers)
-                # Join all sublist into a single list
-                scan_results = list(itertools.chain(*scan_results_per_file))
-                for cred in scan_results:
-                    self.credential_manager.add_credential(cred)
+                for scan_results in pool.imap_unordered(self.files_scan, (content_providers[x::self.pool_count]
+                                                                          for x in range(self.pool_count))):
+                    for cred in scan_results:
+                        self.credential_manager.add_credential(cred)
                 if self.config.api_validation:
                     logger.info("Run API Validation")
                     api_validation = ApplyValidation()
                     api_validation.validate_credentials(pool, self.credential_manager)
             except KeyboardInterrupt:
                 pool.terminate()
                 pool.join()
-                sys.exit()
+                raise
+            pool.close()
+            pool.join()
+
+    # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
+
+    def files_scan(
+            self,  #
+            content_providers: Sequence[Union[DiffContentProvider, TextContentProvider]]) -> List[Candidate]:
+        """Auxiliary method to scan one sequence of content providers and collect all found candidates"""
+        all_cred: List[Candidate] = []
+        for i in content_providers:
+            candidates = self.file_scan(i)
+            all_cred.extend(candidates)
+        logger.info(f"Completed: processed {len(content_providers)} providers with {len(all_cred)} candidates")
+        return all_cred
 
     # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
 

diff --git a/credsweeper/common/constants.py b/credsweeper/common/constants.py
@@ -5,7 +5,7 @@
 
 class KeywordPattern:
     """Pattern set of keyword types"""
-    key_left = r"(?P<variable>(([`'\"]+[^:='\"`}<>\\/&?]*|[^:='\"`}<>\s()\\/&?]*)" \
+    key_left = r"(\\[nrt])?(?P<variable>(([`'\"]+[^:='\"`}<>\\/&?]*|[^:='\"`}<>\s()\\/&?]*)" \
                r"(?P<keyword>"
     # there will be inserted a keyword
     key_right = r")" \

diff --git a/credsweeper/credentials/line_data.py b/credsweeper/credentials/line_data.py
@@ -31,6 +31,7 @@ class LineData:
     quotation_marks = ('"', "'", '`')
     comment_starts = ("//", "* ", "#", "/*", "<!––", "%{", "%", "...", "(*", "--", "--[[", "#=")
     bash_param_split = re.compile("\\s+(\\-|\\||\\>|\\w+?\\>|\\&)")
+    line_endings = re.compile(r"\\{1,8}[nr]")
     url_param_split = re.compile(r"(%|\\u(00){0,2})(26|3f)", flags=re.IGNORECASE)
     # some symbols e.g. double quotes cannot be in URL string https://www.ietf.org/rfc/rfc1738.txt
     # \ - was added for case of url in escaped string \u0026amp; - means escaped & in HTML
@@ -180,6 +181,10 @@ def clean_bash_parameters(self) -> None:
             #  and value can be split by bash special characters
             if len(value_spl) > 1:
                 self.value = value_spl[0]
+        if ' ' not in self.value and ("\\n" in self.value or "\\r" in self.value):
+            value_whsp = self.line_endings.split(self.value)
+            if len(value_whsp) > 1:
+                self.value = value_whsp[0]
 
     def sanitize_variable(self) -> None:
         """Remove trailing spaces, dashes and quotations around the variable. Correct position."""

diff --git a/credsweeper/filters/value_atlassian_token_check.py b/credsweeper/filters/value_atlassian_token_check.py
@@ -32,8 +32,13 @@ def run(self, line_data: LineData, target: AnalysisTarget) -> bool:
             if value.startswith("BBDC-"):
                 # Bitbucket HTTP Access Token
                 return ValueAtlassianTokenCheck.check_atlassian_struct(value[5:])
-            elif value.startswith("ATBB"):
+            elif value.startswith("AT"):
                 # Bitbucket App password
+                while "\\=" in value or "%3d" in value or "%3D" in value:
+                    # '=' may be backslash-escaped or percent-encoded in URL https://www.rfc-editor.org/rfc/rfc3986
+                    value = value.replace('\\', '')
+                    value = value.replace('%3d', '=')
+                    value = value.replace('%3D', '=')
                 return ValueAtlassianTokenCheck.check_crc32_struct(value)
             else:
                 # Jira / Confluence PAT token
@@ -43,9 +48,10 @@ def run(self, line_data: LineData, target: AnalysisTarget) -> bool:
     @staticmethod
     def check_crc32_struct(value: str) -> bool:
         """Returns False if value is valid for bitbucket app password structure 'payload:crc32'"""
-        crc32 = int(value[28:], 16)
-        data = value[:28].encode(ASCII)
-        if crc32 == binascii.crc32(data):
+        crc32 = int(value[-8:], 16)
+        data = value[:-8].encode(ASCII)
+        data_crc32 = binascii.crc32(data)
+        if crc32 == data_crc32:
             return False
         return True
 

diff --git a/credsweeper/filters/value_file_path_check.py b/credsweeper/filters/value_file_path_check.py
@@ -12,6 +12,9 @@ class ValueFilePathCheck(Filter):
     Check if a value contains either '/' or ':\' separators (but not both)
     and do not have any special characters ( !$@`&*()+)
     """
+    base64_possible_set = set(Chars.BASE64_CHARS.value) | set(Chars.BASE64URL_CHARS.value)
+    unusual_windows_symbols_in_path = "\t\n\r !$@`&*()[]{}<>+=;,~"
+    unusual_linux_symbols_in_path = unusual_windows_symbols_in_path + ":\\"
 
     def __init__(self, config: Config = None) -> None:
         pass
@@ -30,25 +33,32 @@ def run(self, line_data: LineData, target: AnalysisTarget) -> bool:
         value = line_data.value
         contains_unix_separator = '/' in value
         if contains_unix_separator:
+            if "://" in value or value.startswith("~/") or value.startswith("./") or "../" in value or "/.." in value:
+                # common case: a URL ("://") or a home/relative path alias ("~/", "./", "../", "/..")
+                return True
             # base64 encoded data might look like linux path
             min_entropy = ValueEntropyBase64Check.get_min_data_entropy(len(value))
             # get minimal entropy to compare with shannon entropy of found value
             # min_entropy == 0 means that the value cannot be checked with the entropy due high variance
-            if 0 == min_entropy or min_entropy > Util.get_shannon_entropy(value, Chars.BASE64STD_CHARS.value):
-                for i in value:
-                    if i not in Chars.BASE64STD_CHARS.value:
-                        # value contains wrong BASE64STD_CHARS symbols
-                        break
-                else:
-                    # all symbols are from base64 alphabet
-                    contains_unix_separator = 1 < value.count('/')
+            for i in value:
+                if i not in self.base64_possible_set:
+                    # value contains a symbol outside both base64 alphabets (standard and URL-safe), e.g. '.'
+                    break
             else:
-                # high entropy means base64 encoded data
-                contains_unix_separator = False
+                # all symbols are from base64 alphabet
+                entropy = Util.get_shannon_entropy(value, Chars.BASE64STD_CHARS.value)
+                if 0 == min_entropy or min_entropy > entropy:
+                    contains_unix_separator = 1 < value.count('/')
+                else:
+                    # high entropy means base64 encoded data
+                    contains_unix_separator = False
+
             # low shannon entropy points that the value maybe not a high randomized value in base64
         contains_windows_separator = ':\\' in value
         if contains_unix_separator or contains_windows_separator:
-            for i in " !$@`&*()[]{}+=;,":
+            unusual_symbols_in_path = self.unusual_linux_symbols_in_path if contains_unix_separator \
+                else self.unusual_windows_symbols_in_path
+            for i in unusual_symbols_in_path:
                 if i in value:
                     # the symbols which not passed in a path usually
                     break

diff --git a/credsweeper/ml_model/ml_validator.py b/credsweeper/ml_model/ml_validator.py
@@ -220,8 +220,9 @@ def validate_groups(self, group_list: List[Tuple[CandidateKey, List[Candidate]]]
             probability[head:tail] = self._batch_call_model(line_input_list, variable_input_list, value_input_list,
                                                             features_list)
         is_cred = probability > self.threshold
-        for i in range(len(is_cred)):
-            logger.debug("ML decision: %s with prediction: %s for value: %s", is_cred[i], round(probability[i], 8),
-                         group_list[i][0])
+        if logger.isEnabledFor(logging.DEBUG):
+            for i in range(len(is_cred)):
+                logger.debug("ML decision: %s with prediction: %s for value: %s", is_cred[i], probability[i],
+                             group_list[i][0])
         # apply cast to float to avoid json export issue
         return is_cred, probability.astype(float)
diff --git a/credsweeper/rules/config.yaml b/credsweeper/rules/config.yaml
@@ -673,7 +673,7 @@
   confidence: moderate
   type: pattern
   values:
-    - (?P<value_leftquote>[\"'])?(?P<variable>[+0-9A-Za-z-]{2,80}://)([^\s\"<>\[\]^~`{|}?]{0,80}:){1,3}(?P<value>[^\s\"<>\[\]^~`{|}@:/]{3,80})@[\w.-]{1,800}\\{0,8}(?P<value_rightquote>[\"'])?
+    - (?P<value_leftquote>[\"'])?(?P<variable>[+0-9A-Za-z-]{2,80}://)([^\s\'"<>\[\]^~`{|}@:/]{0,80}:){1,3}(?P<value>[^\s\'"<>\[\]^~`{|}@:/]{3,80})@[^\s\'"<>\[\]^~`{|}@:/]{1,800}\\{0,8}(?P<value_rightquote>[\"'])?
   filter_type: UrlCredentialsGroup
   use_ml: true
   required_substrings:
@@ -911,9 +911,10 @@
   confidence: strong
   type: pattern
   values:
-    - (?<![0-9A-Za-z_+-])(?P<value>ATCTT3xFfGN0[a-zA-Z0-9_-]{171}=[A-F0-9]{8})(?![=0-9A-Za-z_+-])
-  filter_type: TokenPattern
-  min_line_len: 183
+    - (?<![0-9A-Za-z_+-])(?P<value>ATCTT3xFfGN0[a-zA-Z0-9_-]{80,800}(\\?=|%3[dD])[A-F0-9]{8})(?![=0-9A-Za-z_+-])
+  filter_type:
+    - ValueAtlassianTokenCheck
+  min_line_len: 160
   required_substrings:
     - ATCTT3xFfGN0
   target:
@@ -997,9 +998,10 @@
   confidence: strong
   type: pattern
   values:
-    - (?<![0-9A-Za-z_+-])(?P<value>ATATT3xFfGF0[a-zA-Z0-9_-]{171}=[A-F0-9]{8})(?![=0-9A-Za-z_+-])
-  filter_type: TokenPattern
-  min_line_len: 191
+    - (?<![0-9A-Za-z_+-])(?P<value>ATATT3xFfGF0[a-zA-Z0-9_-]{80,800}(\\?=|%3[dD])[A-F0-9]{8})(?![=0-9A-Za-z_+-])
+  filter_type:
+    - ValueAtlassianTokenCheck
+  min_line_len: 160
   required_substrings:
     - ATATT3xFfGF0
   target:

diff --git a/credsweeper/scanner/scan_type/scan_type.py b/credsweeper/scanner/scan_type/scan_type.py
@@ -114,16 +114,14 @@ def get_line_data_list(
                     bypass_end = offset_end
 
                 if config.use_filters and cls.filtering(config, target, line_data, filters):
-                    if 0 < line_data.variable_end:
+                    if line_data.variable and 0 <= line_data.variable_start < line_data.variable_end:
                         # may be next matched item will be not filtered - let search it after variable
                         bypass_start = line_data.variable_end
                         bypass_end = offset_end
-                        # offsets.add((line_data.variable_end, offset_end))
-                    elif 0 < line_data.value_end:
+                    elif line_data.value and 0 <= line_data.value_start < line_data.value_end:
                         # may be next matched item will be not filtered - let search it after variable
                         bypass_start = line_data.value_end
                         bypass_end = offset_end
-                        # offsets.add((line_data.value_end, offset_end))
                     continue
 
                 if target.offset is not None:

diff --git a/requirements.txt b/requirements.txt
@@ -13,7 +13,7 @@ PyYAML==6.0.1
 python-docx==1.1.0
 requests==2.32.0
 typing_extensions==4.9.0
-whatthepatch==1.0.5
+whatthepatch==1.0.6
 pdfminer.six==20231228
 password-strength==0.0.3.post2
 python-dateutil==2.8.2

diff --git a/tests/__init__.py b/tests/__init__.py
@@ -7,14 +7,14 @@
 NEGLIGIBLE_ML_THRESHOLD = 0.0001
 
 # credentials count after scan
-SAMPLES_CRED_COUNT: int = 425
-SAMPLES_CRED_LINE_COUNT: int = 442
+SAMPLES_CRED_COUNT: int = 429
+SAMPLES_CRED_LINE_COUNT: int = 446
 
 # credentials count after post-processing
-SAMPLES_POST_CRED_COUNT: int = 383
+SAMPLES_POST_CRED_COUNT: int = 387
 
 # with option --doc
-SAMPLES_IN_DOC = 407
+SAMPLES_IN_DOC = 410
 
 # archived credentials that are not found without --depth
 SAMPLES_IN_DEEP_1 = SAMPLES_POST_CRED_COUNT + 25