Skip to content

Commit

Permalink
improvements
Browse files Browse the repository at this point in the history
  • Loading branch information
babenek committed Jul 16, 2024
1 parent 0f86c96 commit d5c64c4
Show file tree
Hide file tree
Showing 9 changed files with 60 additions and 28 deletions.
12 changes: 8 additions & 4 deletions .github/workflows/benchmark.yml
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,8 @@ jobs:
- name: Checkout CredData
uses: actions/checkout@v4
with:
repository: Samsung/CredData
repository: babenek/CredData
ref: valsanitizer

- name: Markup hashing
run: |
Expand Down Expand Up @@ -72,7 +73,8 @@ jobs:
- name: Checkout CredData
uses: actions/checkout@v4
with:
repository: Samsung/CredData
repository: babenek/CredData
ref: valsanitizer

- name: Markup hashing
run: |
Expand Down Expand Up @@ -169,7 +171,8 @@ jobs:
- name: Checkout CredData
uses: actions/checkout@v4
with:
repository: Samsung/CredData
repository: babenek/CredData
ref: valsanitizer

- name: Markup hashing
run: |
Expand Down Expand Up @@ -350,7 +353,8 @@ jobs:
- name: Checkout CredData
uses: actions/checkout@v4
with:
repository: Samsung/CredData
repository: babenek/CredData
ref: valsanitizer

- name: Markup hashing
run: |
Expand Down
2 changes: 1 addition & 1 deletion credsweeper/common/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@

class KeywordPattern:
"""Pattern set of keyword types"""
key_left = r"(?P<variable>(([`'\"]+[^:='\"`}<>\\/&?]*|[^:='\"`}<>\s()\\/&?]*)" \
key_left = r"(\\[nrt])?(?P<variable>(([`'\"]+[^:='\"`}<>\\/&?]*|[^:='\"`}<>\s()\\/&?]*)" \
r"(?P<keyword>"
# there will be inserted a keyword
key_right = r")" \
Expand Down
5 changes: 5 additions & 0 deletions credsweeper/credentials/line_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ class LineData:
quotation_marks = ('"', "'", '`')
comment_starts = ("//", "* ", "#", "/*", "<!––", "%{", "%", "...", "(*", "--", "--[[", "#=")
bash_param_split = re.compile("\\s+(\\-|\\||\\>|\\w+?\\>|\\&)")
line_endings = re.compile(r"\\{1,8}[nr]")
url_param_split = re.compile(r"(%|\\u(00){0,2})(26|3f)", flags=re.IGNORECASE)
# some symbols e.g. double quotes cannot be in URL string https://www.ietf.org/rfc/rfc1738.txt
# \ - was added for case of url in escaped string \u0026amp; - means escaped & in HTML
Expand Down Expand Up @@ -181,6 +182,10 @@ def clean_bash_parameters(self) -> None:
# and value can be split by bash special characters
if len(value_spl) > 1:
self.value = value_spl[0]
if ' ' not in self.value and ("\\n" in self.value or "\\r" in self.value):
value_whsp = self.line_endings.split(self.value)
if len(value_whsp) > 1:
self.value = value_whsp[0]

def sanitize_variable(self) -> None:
"""Remove trailing spaces, dashes and quotations around the variable. Correct position."""
Expand Down
32 changes: 21 additions & 11 deletions credsweeper/filters/value_file_path_check.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,9 @@ class ValueFilePathCheck(Filter):
Check if a value contains either '/' or ':\' separators (but not both)
and does not have any special characters ( !$@`&*()+)
"""
base64_possible_set = set(Chars.BASE64_CHARS.value) | set(Chars.BASE64URL_CHARS.value)
unusual_windows_symbols_in_path = "\t\n\r !$@`&*()[]{}<>+=;,~"
unusual_linux_symbols_in_path = unusual_windows_symbols_in_path + ":\\"

def __init__(self, config: Config = None) -> None:
pass
Expand All @@ -30,25 +33,32 @@ def run(self, line_data: LineData, target: AnalysisTarget) -> bool:
value = line_data.value
contains_unix_separator = '/' in value
if contains_unix_separator:
if "://" in value or value.startswith("~/") or value.startswith("./") or "../" in value or "/.." in value:
# common case for url definition or aliases
return True
# base64 encoded data might look like linux path
min_entropy = ValueEntropyBase64Check.get_min_data_entropy(len(value))
# get minimal entropy to compare with shannon entropy of found value
# min_entropy == 0 means that the value cannot be checked with the entropy due to high variance
if 0 == min_entropy or min_entropy > Util.get_shannon_entropy(value, Chars.BASE64STD_CHARS.value):
for i in value:
if i not in Chars.BASE64STD_CHARS.value:
# value contains wrong BASE64STD_CHARS symbols
break
else:
# all symbols are from base64 alphabet
contains_unix_separator = 1 < value.count('/')
for i in value:
if i not in self.base64_possible_set:
# value contains wrong BASE64STD_CHARS symbols like .
break
else:
# high entropy means base64 encoded data
contains_unix_separator = False
# all symbols are from base64 alphabet
entropy = Util.get_shannon_entropy(value, Chars.BASE64STD_CHARS.value)
if 0 == min_entropy or min_entropy > entropy:
contains_unix_separator = 1 < value.count('/')
else:
# high entropy means base64 encoded data
contains_unix_separator = False

# low Shannon entropy suggests that the value may not be a highly randomized base64 value
contains_windows_separator = ':\\' in value
if contains_unix_separator or contains_windows_separator:
for i in " !$@`&*()[]{}+=;,":
unusual_symbols_in_path = self.unusual_linux_symbols_in_path if contains_unix_separator \
else self.unusual_windows_symbols_in_path
for i in unusual_symbols_in_path:
if i in value:
# symbols that do not usually appear in a path
break
Expand Down
7 changes: 4 additions & 3 deletions credsweeper/ml_model/ml_validator.py
Original file line number Diff line number Diff line change
Expand Up @@ -220,8 +220,9 @@ def validate_groups(self, group_list: List[Tuple[CandidateKey, List[Candidate]]]
probability[head:tail] = self._batch_call_model(line_input_list, variable_input_list, value_input_list,
features_list)
is_cred = probability > self.threshold
for i in range(len(is_cred)):
logger.debug("ML decision: %s with prediction: %s for value: %s", is_cred[i], round(probability[i], 8),
group_list[i][0])
if logger.isEnabledFor(logging.DEBUG):
for i in range(len(is_cred)):
logger.debug("ML decision: %s with prediction: %s for value: %s", is_cred[i], probability[i],
group_list[i][0])
# apply cast to float to avoid json export issue
return is_cred, probability.astype(float)
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ PyYAML==6.0.1
python-docx==1.1.0
requests==2.32.0
typing_extensions==4.9.0
whatthepatch==1.0.5
whatthepatch==1.0.6
pdfminer.six==20231228
password-strength==0.0.3.post2
python-dateutil==2.8.2
Expand Down
8 changes: 4 additions & 4 deletions tests/data/depth_3.json
Original file line number Diff line number Diff line change
Expand Up @@ -7718,7 +7718,7 @@
{
"api_validation": "NOT_AVAILABLE",
"ml_validation": "VALIDATED_KEY",
"ml_probability": 0.78,
"ml_probability": 0.756,
"rule": "Github Old Token",
"severity": "high",
"confidence": "moderate",
Expand All @@ -7745,7 +7745,7 @@
{
"api_validation": "NOT_AVAILABLE",
"ml_validation": "VALIDATED_KEY",
"ml_probability": 0.78,
"ml_probability": 0.756,
"rule": "Token",
"severity": "medium",
"confidence": "moderate",
Expand All @@ -7758,8 +7758,8 @@
"value": "gireogicracklecrackle1231567190113413981",
"value_start": 15,
"value_end": 55,
"variable": "ngit_token",
"variable_start": 1,
"variable": "git_token",
"variable_start": 2,
"variable_end": 11,
"entropy_validation": {
"iterator": "BASE36_CHARS",
Expand Down
12 changes: 9 additions & 3 deletions tests/filters/test_value_file_path_check.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,18 +7,24 @@

class TestValueFilePathCheck:

def test_value_file_path_check_p(self, file_path: pytest.fixture, success_line: pytest.fixture) -> None:
line_data = get_line_data(file_path, line=success_line, pattern=LINE_VALUE_PATTERN)
@pytest.mark.parametrize("line", [
"5//0KCPafDhZvtCwqrsyiKFeDGT_0ZGHiI-E0ClIWrLC7tZ1WE5vHc4-Y2qi1IhPy3Pz5fmCe9OPIxEZUONUg7SWJF9nwQ_j2lIdXU0",
])
def test_value_file_path_check_p(self, file_path: pytest.fixture, line: str) -> None:
line_data = get_line_data(file_path, line=line, pattern=LINE_VALUE_PATTERN)
assert ValueFilePathCheck().run(line_data, DUMMY_ANALYSIS_TARGET) is False

@pytest.mark.parametrize(
"line",
[
"crackle/filepath.txt",
"/home/user/tmp", # simple path
"../..", # path
"dir/..", # path
"../dir", # path
"file:///Crackle/filepath/", # path from browser url
"~/.custompass", # path with synonym
"crackle/filepath_txt",
"./sshpass.sh", # path with synonym
"crackle/file.path", #
"C:\\Crackle\\filepath", #
])
Expand Down
8 changes: 7 additions & 1 deletion tests/test_main.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
import io
import io
import os
import random
import shutil
Expand Down Expand Up @@ -806,6 +805,13 @@ def test_param_n(self) -> None:
def test_param_p(self) -> None:
# internal parametrized tests for quick debug
items = [ #
("slt.py", b'\\t\\tsalt = "\\x187bhgerjhqw\\n iKa\\tW_R~0/8"', "salt", "\\x187bhgerjhqw\\n iKa\\tW_R~0/8"),
("log.txt",
b'json\\nAuthorization: Basic jfhlksadjiu9813ryiuhdfskadjlkjh34\\n\\u003c/code\\u003e\\u003c/pre\\u003e"',
"Authorization", "jfhlksadjiu9813ryiuhdfskadjlkjh34"),
("pwd.py", b'password = "ji3_8iKgaW_R~0/8"', "password", "ji3_8iKgaW_R~0/8"),
("pwd.py", b'password = "/_tcTz<D8sWXsW<E"', "password", "/_tcTz<D8sWXsW<E"),
("pwd.py", b'password = "I:FbCnXQc/9E02Il"', "password", "I:FbCnXQc/9E02Il"),
("url_part.py", b'39084?token=3487263-2384579834-234732875-345&key=DnBeiGdgy6253fytfdDHGg&hasToBeFound=2',
'token', '3487263-2384579834-234732875-345'),
("prod.py", b"secret_api_key='Ahga%$FiQ@Ei8'", "secret_api_key", "Ahga%$FiQ@Ei8"), #
Expand Down

0 comments on commit d5c64c4

Please sign in to comment.