Skip to content

Commit

Permalink
Merge branch 'main' into babenek-patch-1
Browse files Browse the repository at this point in the history
  • Loading branch information
babenek authored Jul 16, 2024
2 parents 3f8c45d + 16dd8ac commit 0a3b8d2
Show file tree
Hide file tree
Showing 22 changed files with 585 additions and 136 deletions.
9 changes: 2 additions & 7 deletions .github/workflows/benchmark.yml
Original file line number Diff line number Diff line change
Expand Up @@ -114,19 +114,14 @@ jobs:
ref: ${{ github.event.pull_request.head.sha }}
path: temp/CredSweeper

- name: Patch benchmark for PR work
run: |
sed -i 's|CREDSWEEPER = "https://github.com/Samsung/CredSweeper.git"|CREDSWEEPER = "dummy://github.com/Samsung/CredSweeper.git"|' benchmark/common/constants.py
grep --with-filename --line-number 'dummy://github.com/Samsung/CredSweeper.git' benchmark/common/constants.py
- name: Install CredSweeper
run: |
python -m pip install temp/CredSweeper
credsweeper_head=
python -m credsweeper --banner
- name: Run CredSweeper tool
run: |
credsweeper --banner --jobs $(nproc) --path data --save-json report.${{ github.event.pull_request.head.sha }}.json | tee credsweeper.${{ github.event.pull_request.head.sha }}.log
credsweeper --banner --log info --jobs $(nproc) --path data --save-json report.${{ github.event.pull_request.head.sha }}.json | tee credsweeper.${{ github.event.pull_request.head.sha }}.log
- name: Run Benchmark
run: |
Expand Down
4 changes: 2 additions & 2 deletions .github/workflows/check.yml
Original file line number Diff line number Diff line change
Expand Up @@ -187,9 +187,9 @@ jobs:
file_crc32_int=$((16#${file_crc32_hex}))
crc32_int=$(( ${crc32_int} ^ ${file_crc32_int} ))
done
version_with_crc="$(credsweeper --version | head -1) crc32:$(printf '%x' ${crc32_int})"
version_with_crc="$(python -m credsweeper --version | head -1) crc32:$(printf '%x' ${crc32_int})"
echo "version_with_crc = '${version_with_crc}'"
banner=$(credsweeper --banner --path requirements.txt | head -1)
banner=$(python -m credsweeper --banner | head -1)
echo "banner = '${banner}'"
if ! [ -n "${version_with_crc}" ] && [ -n "${banner}" ] && [ "${version_with_crc}" == "${banner}" ]; then
echo "'${version_with_crc}' != '${banner}'"
Expand Down
72 changes: 36 additions & 36 deletions cicd/benchmark.txt

Large diffs are not rendered by default.

36 changes: 23 additions & 13 deletions credsweeper/app.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,6 @@
import itertools
import logging
import multiprocessing
import signal
import sys
from pathlib import Path
from typing import Any, List, Optional, Union, Dict, Sequence, Tuple

Expand Down Expand Up @@ -253,10 +251,7 @@ def scan(self, content_providers: Sequence[Union[DiffContentProvider, TextConten

def __single_job_scan(self, content_providers: Sequence[Union[DiffContentProvider, TextContentProvider]]) -> None:
"""Performs scan in main thread"""
all_cred: List[Candidate] = []
for i in content_providers:
candidates = self.file_scan(i)
all_cred.extend(candidates)
all_cred = self.files_scan(content_providers)
if self.config.api_validation:
api_validation = ApplyValidation()
for cred in all_cred:
Expand All @@ -278,24 +273,39 @@ def __multi_jobs_scan(self, content_providers: Sequence[Union[DiffContentProvide
if "SILENCE" == self.__log_level:
logging.addLevelName(60, "SILENCE")
log_kwargs["level"] = self.__log_level
# providers_map: List[Sequence[Union[DiffContentProvider, TextContentProvider]]] = \
# [content_providers[x::self.pool_count] for x in range(self.pool_count)]
with multiprocessing.get_context("spawn").Pool(processes=self.pool_count,
initializer=self.pool_initializer,
initargs=(log_kwargs, )) as pool:
try:
# Get list credentials for each file
scan_results_per_file = pool.map(self.file_scan, content_providers)
# Join all sublist into a single list
scan_results = list(itertools.chain(*scan_results_per_file))
for cred in scan_results:
self.credential_manager.add_credential(cred)
for scan_results in pool.imap_unordered(self.files_scan, (content_providers[x::self.pool_count]
for x in range(self.pool_count))):
for cred in scan_results:
self.credential_manager.add_credential(cred)
if self.config.api_validation:
logger.info("Run API Validation")
api_validation = ApplyValidation()
api_validation.validate_credentials(pool, self.credential_manager)
except KeyboardInterrupt:
pool.terminate()
pool.join()
sys.exit()
raise
pool.close()
pool.join()

# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #

def files_scan(
        self,  #
        content_providers: Sequence[Union[DiffContentProvider, TextContentProvider]]) -> List[Candidate]:
    """Scan every provider of one sequence and collect the found candidates.

    Each provider is passed to ``file_scan`` in order; the results are joined
    into a single list and a summary line is logged at INFO level.

    Args:
        content_providers: providers to be scanned one after another

    Return:
        candidates found in all providers, in provider order

    """
    found: List[Candidate] = [
        candidate for provider in content_providers for candidate in self.file_scan(provider)
    ]
    logger.info(f"Completed: processed {len(content_providers)} providers with {len(found)} candidates")
    return found

# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #

Expand Down
2 changes: 1 addition & 1 deletion credsweeper/common/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@

class KeywordPattern:
"""Pattern set of keyword types"""
key_left = r"(?P<variable>(([`'\"]+[^:='\"`}<>\\/&?]*|[^:='\"`}<>\s()\\/&?]*)" \
key_left = r"(\\[nrt])?(?P<variable>(([`'\"]+[^:='\"`}<>\\/&?]*|[^:='\"`}<>\s()\\/&?]*)" \
r"(?P<keyword>"
# there will be inserted a keyword
key_right = r")" \
Expand Down
5 changes: 5 additions & 0 deletions credsweeper/credentials/line_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ class LineData:
quotation_marks = ('"', "'", '`')
comment_starts = ("//", "* ", "#", "/*", "<!––", "%{", "%", "...", "(*", "--", "--[[", "#=")
bash_param_split = re.compile("\\s+(\\-|\\||\\>|\\w+?\\>|\\&)")
line_endings = re.compile(r"\\{1,8}[nr]")
url_param_split = re.compile(r"(%|\\u(00){0,2})(26|3f)", flags=re.IGNORECASE)
# some symbols e.g. double quotes cannot be in URL string https://www.ietf.org/rfc/rfc1738.txt
# \ - was added for case of url in escaped string \u0026amp; - means escaped & in HTML
Expand Down Expand Up @@ -180,6 +181,10 @@ def clean_bash_parameters(self) -> None:
# and value can be split by bash special characters
if len(value_spl) > 1:
self.value = value_spl[0]
if ' ' not in self.value and ("\\n" in self.value or "\\r" in self.value):
value_whsp = self.line_endings.split(self.value)
if len(value_whsp) > 1:
self.value = value_whsp[0]

def sanitize_variable(self) -> None:
"""Remove trailing spaces, dashes and quotations around the variable. Correct position."""
Expand Down
14 changes: 10 additions & 4 deletions credsweeper/filters/value_atlassian_token_check.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,8 +32,13 @@ def run(self, line_data: LineData, target: AnalysisTarget) -> bool:
if value.startswith("BBDC-"):
# Bitbucket HTTP Access Token
return ValueAtlassianTokenCheck.check_atlassian_struct(value[5:])
elif value.startswith("ATBB"):
elif value.startswith("AT"):
# Bitbucket App password
while "\\=" in value or "%3d" in value or "%3D" in value:
# = sign may be escaped in URL https://www.rfc-editor.org/rfc/rfc3986
value = value.replace('\\', '')
value = value.replace('%3d', '=')
value = value.replace('%3D', '=')
return ValueAtlassianTokenCheck.check_crc32_struct(value)
else:
# Jira / Confluence PAT token
Expand All @@ -43,9 +48,10 @@ def run(self, line_data: LineData, target: AnalysisTarget) -> bool:
@staticmethod
def check_crc32_struct(value: str) -> bool:
    """Check the 'payload + crc32' structure of a Bitbucket app password.

    The last 8 characters of the value are parsed as a hexadecimal CRC32 and
    compared with the CRC32 of all preceding characters.

    Args:
        value: token text to verify

    Return:
        False if the trailing checksum matches the payload (the structure is valid),
        True otherwise

    Raises:
        ValueError: if the last 8 characters cannot be parsed as a hexadecimal number

    """
    # The checksum is addressed from the END of the value so that the payload length may vary;
    # a fixed payload offset would fail to parse longer tokens.
    crc32 = int(value[-8:], 16)
    data = value[:-8].encode(ASCII)
    data_crc32 = binascii.crc32(data)
    return crc32 != data_crc32

Expand Down
32 changes: 21 additions & 11 deletions credsweeper/filters/value_file_path_check.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,9 @@ class ValueFilePathCheck(Filter):
Check if a value contains either '/' or ':\' separators (but not both)
and does not have any special characters ( !$@`&*()+)
"""
base64_possible_set = set(Chars.BASE64_CHARS.value) | set(Chars.BASE64URL_CHARS.value)
unusual_windows_symbols_in_path = "\t\n\r !$@`&*()[]{}<>+=;,~"
unusual_linux_symbols_in_path = unusual_windows_symbols_in_path + ":\\"

def __init__(self, config: Config = None) -> None:
pass
Expand All @@ -30,25 +33,32 @@ def run(self, line_data: LineData, target: AnalysisTarget) -> bool:
value = line_data.value
contains_unix_separator = '/' in value
if contains_unix_separator:
if "://" in value or value.startswith("~/") or value.startswith("./") or "../" in value or "/.." in value:
# common case for url definition or aliases
return True
# base64 encoded data might look like linux path
min_entropy = ValueEntropyBase64Check.get_min_data_entropy(len(value))
# get minimal entropy to compare with shannon entropy of found value
# min_entropy == 0 means that the value cannot be checked with the entropy due to high variance
if 0 == min_entropy or min_entropy > Util.get_shannon_entropy(value, Chars.BASE64STD_CHARS.value):
for i in value:
if i not in Chars.BASE64STD_CHARS.value:
# value contains wrong BASE64STD_CHARS symbols
break
else:
# all symbols are from base64 alphabet
contains_unix_separator = 1 < value.count('/')
for i in value:
if i not in self.base64_possible_set:
# value contains a symbol outside the base64 and base64url alphabets, e.g. '.'
break
else:
# high entropy means base64 encoded data
contains_unix_separator = False
# all symbols are from base64 alphabet
entropy = Util.get_shannon_entropy(value, Chars.BASE64STD_CHARS.value)
if 0 == min_entropy or min_entropy > entropy:
contains_unix_separator = 1 < value.count('/')
else:
# high entropy means base64 encoded data
contains_unix_separator = False

# low Shannon entropy suggests that the value may not be highly randomized base64 data
contains_windows_separator = ':\\' in value
if contains_unix_separator or contains_windows_separator:
for i in " !$@`&*()[]{}+=;,":
unusual_symbols_in_path = self.unusual_linux_symbols_in_path if contains_unix_separator \
else self.unusual_windows_symbols_in_path
for i in unusual_symbols_in_path:
if i in value:
# symbols that do not usually appear in a path
break
Expand Down
7 changes: 4 additions & 3 deletions credsweeper/ml_model/ml_validator.py
Original file line number Diff line number Diff line change
Expand Up @@ -220,8 +220,9 @@ def validate_groups(self, group_list: List[Tuple[CandidateKey, List[Candidate]]]
probability[head:tail] = self._batch_call_model(line_input_list, variable_input_list, value_input_list,
features_list)
is_cred = probability > self.threshold
for i in range(len(is_cred)):
logger.debug("ML decision: %s with prediction: %s for value: %s", is_cred[i], round(probability[i], 8),
group_list[i][0])
if logger.isEnabledFor(logging.DEBUG):
for i in range(len(is_cred)):
logger.debug("ML decision: %s with prediction: %s for value: %s", is_cred[i], probability[i],
group_list[i][0])
# apply cast to float to avoid json export issue
return is_cred, probability.astype(float)
16 changes: 9 additions & 7 deletions credsweeper/rules/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -673,7 +673,7 @@
confidence: moderate
type: pattern
values:
- (?P<value_leftquote>[\"'])?(?P<variable>[+0-9A-Za-z-]{2,80}://)([^\s\"<>\[\]^~`{|}?]{0,80}:){1,3}(?P<value>[^\s\"<>\[\]^~`{|}@:/]{3,80})@[\w.-]{1,800}\\{0,8}(?P<value_rightquote>[\"'])?
- (?P<value_leftquote>[\"'])?(?P<variable>[+0-9A-Za-z-]{2,80}://)([^\s\'"<>\[\]^~`{|}@:/]{0,80}:){1,3}(?P<value>[^\s\'"<>\[\]^~`{|}@:/]{3,80})@[^\s\'"<>\[\]^~`{|}@:/]{1,800}\\{0,8}(?P<value_rightquote>[\"'])?
filter_type: UrlCredentialsGroup
use_ml: true
required_substrings:
Expand Down Expand Up @@ -911,9 +911,10 @@
confidence: strong
type: pattern
values:
- (?<![0-9A-Za-z_+-])(?P<value>ATCTT3xFfGN0[a-zA-Z0-9_-]{171}=[A-F0-9]{8})(?![=0-9A-Za-z_+-])
filter_type: TokenPattern
min_line_len: 183
- (?<![0-9A-Za-z_+-])(?P<value>ATCTT3xFfGN0[a-zA-Z0-9_-]{80,800}(\\?=|%3[dD])[A-F0-9]{8})(?![=0-9A-Za-z_+-])
filter_type:
- ValueAtlassianTokenCheck
min_line_len: 160
required_substrings:
- ATCTT3xFfGN0
target:
Expand Down Expand Up @@ -997,9 +998,10 @@
confidence: strong
type: pattern
values:
- (?<![0-9A-Za-z_+-])(?P<value>ATATT3xFfGF0[a-zA-Z0-9_-]{171}=[A-F0-9]{8})(?![=0-9A-Za-z_+-])
filter_type: TokenPattern
min_line_len: 191
- (?<![0-9A-Za-z_+-])(?P<value>ATATT3xFfGF0[a-zA-Z0-9_-]{80,800}(\\?=|%3[dD])[A-F0-9]{8})(?![=0-9A-Za-z_+-])
filter_type:
- ValueAtlassianTokenCheck
min_line_len: 160
required_substrings:
- ATATT3xFfGF0
target:
Expand Down
6 changes: 2 additions & 4 deletions credsweeper/scanner/scan_type/scan_type.py
Original file line number Diff line number Diff line change
Expand Up @@ -114,16 +114,14 @@ def get_line_data_list(
bypass_end = offset_end

if config.use_filters and cls.filtering(config, target, line_data, filters):
if 0 < line_data.variable_end:
if line_data.variable and 0 <= line_data.variable_start < line_data.variable_end:
# the next matched item might not be filtered - continue the search after the variable
bypass_start = line_data.variable_end
bypass_end = offset_end
# offsets.add((line_data.variable_end, offset_end))
elif 0 < line_data.value_end:
elif line_data.value and 0 <= line_data.value_start < line_data.value_end:
# the next matched item might not be filtered - continue the search after the value
bypass_start = line_data.value_end
bypass_end = offset_end
# offsets.add((line_data.value_end, offset_end))
continue

if target.offset is not None:
Expand Down
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ PyYAML==6.0.1
python-docx==1.1.0
requests==2.32.0
typing_extensions==4.9.0
whatthepatch==1.0.5
whatthepatch==1.0.6
pdfminer.six==20231228
password-strength==0.0.3.post2
python-dateutil==2.8.2
Expand Down
8 changes: 4 additions & 4 deletions tests/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,14 +7,14 @@
NEGLIGIBLE_ML_THRESHOLD = 0.0001

# credentials count after scan
SAMPLES_CRED_COUNT: int = 425
SAMPLES_CRED_LINE_COUNT: int = 442
SAMPLES_CRED_COUNT: int = 429
SAMPLES_CRED_LINE_COUNT: int = 446

# credentials count after post-processing
SAMPLES_POST_CRED_COUNT: int = 383
SAMPLES_POST_CRED_COUNT: int = 387

# with option --doc
SAMPLES_IN_DOC = 407
SAMPLES_IN_DOC = 410

# archived credentials that are not found without --depth
SAMPLES_IN_DEEP_1 = SAMPLES_POST_CRED_COUNT + 25
Expand Down
Loading

0 comments on commit 0a3b8d2

Please sign in to comment.