Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

KeywordPattern regex improvement #606

Merged
merged 4 commits into from
Sep 16, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
107 changes: 54 additions & 53 deletions .ci/benchmark.txt

Large diffs are not rendered by default.

12 changes: 8 additions & 4 deletions .github/workflows/benchmark.yml
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,8 @@ jobs:
- name: Checkout CredData
uses: actions/checkout@v4
with:
repository: Samsung/CredData
repository: babenek/CredData
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This will be rolled back in #600 after the merge conflicts are resolved, and so on...

ref: awsmulti

- name: Markup hashing
run: |
Expand Down Expand Up @@ -72,7 +73,8 @@ jobs:
- name: Checkout CredData
uses: actions/checkout@v4
with:
repository: Samsung/CredData
repository: babenek/CredData
ref: awsmulti

- name: Markup hashing
run: |
Expand Down Expand Up @@ -169,7 +171,8 @@ jobs:
- name: Checkout CredData
uses: actions/checkout@v4
with:
repository: Samsung/CredData
repository: babenek/CredData
ref: awsmulti

- name: Markup hashing
run: |
Expand Down Expand Up @@ -351,7 +354,8 @@ jobs:
- name: Checkout CredData
uses: actions/checkout@v4
with:
repository: Samsung/CredData
repository: babenek/CredData
ref: awsmulti

- name: Markup hashing
run: |
Expand Down
28 changes: 0 additions & 28 deletions credsweeper/common/constants.py
Original file line number Diff line number Diff line change
@@ -1,36 +1,8 @@
import re
import typing
from enum import Enum
from typing import Optional, Union


class KeywordPattern:
    """Building blocks of the ``<variable> <separator> <value>`` regular expression.

    The class attributes are plain regex source fragments; they are glued
    together around a caller-supplied keyword expression by
    :meth:`get_keyword_pattern`.
    """

    # Start of the <variable> group: an optional escaped n/r/t, then an
    # optionally quoted prefix, up to the opening of the <keyword> group.
    key_left = (
        r"(\\[nrt])?(?P<variable>(([`'\"]+[^:='\"`}<>\\/&?]*|[^:='\"`}<>\s()\\/&?]*)"
        r"(?P<keyword>"
    )
    # The keyword expression is placed between key_left and key_right.
    # key_right closes <keyword>, takes the remainder of the variable name and
    # any closing quotes, and closes <variable>.
    key_right = (
        r")"
        r"[^:='\"`<>{?!&]*)[`'\"]*)"
    )
    # <separator> is one of ':=' (optionally with a short word between ':' and
    # '='), ':', '=>', '!=', '===', '==', '='. It is followed by the optional
    # <wrap> group: call/constructor-like text ending with an opening bracket.
    separator = (
        r"\s*\]?\s*"
        r"(?P<separator>:( [a-z]{3,9}[?]? )?="
        r"|:|=>|!=|===|==|=)"
        r"\s*(?P<wrap>((new\s*)?\w|\.|->|\(|\[)*[\[\(\{](\w{1,32}=)?\s*)?"
    )
    # Optional opening quote(s), an optional authentication scheme word
    # (oauth | bot | basic | bearer | apikey | accesskey) and then <value>.
    # The allowed value characters depend on whether an opening quote matched.
    value = (
        r"(?P<value_leftquote>((b|r|br|rb|u|f|rf|fr|\\{0,8})?[`'\"]){1,4})?"
        r"( ?(oauth|bot|basic|bearer|apikey|accesskey) )?"
        r"(?P<value>"
        r"(?(value_leftquote)(?:\\[tnrux0-7][0-9a-f]*|[^`'\"\\])|(?:\\n|\\r|\\?[^\s`'\"\\,;])){1,8000}"
        r"|(?:\{[^}]{3,8000}\})|(?:<[^>]{3,8000}>)"
        r")"
        r"(?(value_leftquote)(?P<value_rightquote>(\\{0,8}[`'\"]){1,4})?|(?(wrap)[\]\)\},;]))"
    )

    @classmethod
    def get_keyword_pattern(cls, keyword: str) -> re.Pattern:
        """Build the case-insensitive pattern for ``keyword``.

        Args:
            keyword: regex source inserted verbatim as the body of the
                ``keyword`` named group.

        Returns:
            The compiled pattern made of key_left, keyword, key_right,
            separator and value, in that order.
        """
        source = cls.key_left + keyword + cls.key_right + cls.separator + cls.value
        return re.compile(source, flags=re.IGNORECASE)


class Severity(Enum):
"""Severity of candidate"""
CRITICAL = "critical"
Expand Down
58 changes: 58 additions & 0 deletions credsweeper/common/keyword_pattern.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
import re


class KeywordPattern:
    """Building blocks of the ``<variable> <separator> <value>`` regular expression.

    Every class attribute is a regex source fragment. Some fragments are not
    valid on their own (``string_prefix`` opens a group that ``left_quote``
    closes); they only make sense in the order used by
    :meth:`get_keyword_pattern`.
    """

    # Start of the <variable> group: an optional escaped n/r/t, then an
    # optionally quoted prefix, up to the opening of the <keyword> group.
    key_left = (
        r"(\\[nrt])?"
        r"(?P<variable>(([`'\"]+[^:='\"`}<>\\/&?]*|[^:='\"`}<>\s()\\/&?]*)"
        r"(?P<keyword>"
    )
    # The keyword expression is placed between key_left and key_right.
    # key_right closes <keyword>, takes the remainder of the variable name and
    # any closing quotes, and closes <variable>.
    key_right = (
        r")"
        r"[^:='\"`<>{?!&]*)[`'\"]*)"
    )
    # Whitespace here may be real or written as escaped t/n/r. <separator> is
    # one of ':=' (optionally with a short word between ':' and '='), ':',
    # '=>' (the '>' may also be '&gt;' or '\u0026gt;'), '!=', '===', '==', '='.
    separator = (
        r"(\s|\\+[tnr])*\]?(\s|\\+[tnr])*"
        r"(?P<separator>:( [a-z]{3,9}[?]? )?=|:|=(>|&gt;|\\u0026gt;)|!=|===|==|=)"
        r"(\s|\\+[tnr])*"
    )
    # Optional, repeatable wrapper in front of the value: an optional 'new',
    # identifier/dot/arrow characters, an opening curly, square or round
    # bracket and an optional 'name=' argument prefix.
    wrap = (
        r"(?P<wrap>("
        r"(new(\s|\\+[tnr])+)?"
        r"([0-9a-z_.]|-(>|(&|\\\\*u0026)gt;))*"
        r"[\[\(\{]"
        r"(\s|\\+[tnr])*"
        r"([0-9a-z_]{1,32}=)?"
        r")+)?"
    )
    # String literal prefix (b, r, br, rb, u, f, rf, fr, l, @), accepted only
    # when a possibly escaped quote follows. Opens a group closed in left_quote.
    string_prefix = r"(((b|r|br|rb|u|f|rf|fr|l|@)(?=(\\*[`'\"])))?"
    # One to four opening quotes, each optionally preceded by backslashes that
    # are captured as <esq>.
    left_quote = r"(?P<value_leftquote>((?P<esq>\\{1,8})?[`'\"]){1,4}))?"
    # Optional authentication scheme word placed before the credential itself.
    auth_keywords = r"( ?(oauth|bot|basic|bearer|apikey|accesskey) )?"
    # <value>: with an opening quote, any characters up to a repetition of that
    # quote; without one, non-space characters and backslash escapes.
    # Alternatively a '{...}' or '<...>' body.
    value = (
        r"(?P<value>"
        r"(?(value_leftquote)"
        r"("
        r"(?!(?P=value_leftquote))"
        r"(?(esq)((?!(?P=esq)['`\"]).)|((?!(?P=value_leftquote)).)))"
        r"|"
        r"(\\+([ tnr]|[^\s`'\"])|[^\s`'\",;\\])"
        r"){3,8000}"
        r"|(\{[^}]{3,8000}\})"
        r"|(<[^>]{3,8000}>)"
        r")"
    )
    # Terminator: for a quoted value, the same quote sequence, a trailing
    # backslash at the end, or the end right after [0-9a-z+_/-]; for a wrapped
    # unquoted value, a closing bracket, comma or semicolon.
    right_quote = (
        r"(?(value_leftquote)"
        r"(?P<value_rightquote>(?<!\\)(?P=value_leftquote)|\\$|(?<=[0-9a-z+_/-])$)"
        r"|"
        r"(?(wrap)[\]\)\},;]))"
    )

    @classmethod
    def get_keyword_pattern(cls, keyword: str) -> re.Pattern:
        """Build the pattern for ``keyword``.

        Args:
            keyword: regex source inserted verbatim as the body of the
                ``keyword`` named group.

        Returns:
            The fragments joined in matching order and compiled with
            ``re.IGNORECASE`` and ``re.DOTALL``.
        """
        fragments = (
            cls.key_left,
            keyword,
            cls.key_right,
            cls.separator,
            cls.wrap,
            cls.string_prefix,
            cls.left_quote,
            cls.auth_keywords,
            cls.value,
            cls.right_quote,
        )
        return re.compile("".join(fragments), flags=re.IGNORECASE | re.DOTALL)
2 changes: 1 addition & 1 deletion credsweeper/rules/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -164,7 +164,7 @@
type: multi
values:
- (?<![0-9A-Za-z_+-])(?P<value>(ABIA|ACCA|AGPA|AIDA|AIPA|AKIA|ANPA|ANVA|AROA|APKA|ASCA|ASIA)[0-9A-Z]{16,17})(?![=0-9A-Za-z_+-])
- (?<![0-9A-Za-z_/+-])(?P<value>[0-9A-Za-z/+]{40,80})(?![=0-9A-Za-z_/+-])
- (?<![0-9A-Za-z_/+-])(?P<value>[0-9A-Za-z/+]{35,80})(?![=0-9A-Za-z_/+-])
filter_type: GeneralPattern
required_substrings:
- A
Expand Down
3 changes: 2 additions & 1 deletion credsweeper/rules/rule.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,8 @@
from typing import Dict, List, Optional, Union, Set

from credsweeper import validations, filters
from credsweeper.common.constants import RuleType, Severity, MAX_LINE_LENGTH, KeywordPattern, Confidence
from credsweeper.common.constants import RuleType, Severity, MAX_LINE_LENGTH, Confidence
from credsweeper.common.keyword_pattern import KeywordPattern
from credsweeper.config import Config
from credsweeper.filters import Filter, group
from credsweeper.filters.group import Group
Expand Down
8 changes: 8 additions & 0 deletions docs/source/credsweeper.common.rst
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,14 @@ credsweeper.common.keyword\_checklist module
:undoc-members:
:show-inheritance:

credsweeper.common.keyword\_pattern module
------------------------------------------

.. automodule:: credsweeper.common.keyword_pattern
:members:
:undoc-members:
:show-inheritance:

Module contents
---------------

Expand Down
10 changes: 5 additions & 5 deletions tests/__init__.py
Original file line number Diff line number Diff line change
@@ -1,20 +1,20 @@
from pathlib import Path

# total number of files in test samples
SAMPLES_FILES_COUNT: int = 132
SAMPLES_FILES_COUNT: int = 133

# the lowest value of ML threshold is used to display possible lowest values
NEGLIGIBLE_ML_THRESHOLD = 0.0001

# credentials count after scan
SAMPLES_CRED_COUNT: int = 378
SAMPLES_CRED_LINE_COUNT: int = 395
SAMPLES_CRED_COUNT: int = 386
SAMPLES_CRED_LINE_COUNT: int = 404

# credentials count after post-processing
SAMPLES_POST_CRED_COUNT: int = 347
SAMPLES_POST_CRED_COUNT: int = 355

# with option --doc
SAMPLES_IN_DOC = 419
SAMPLES_IN_DOC = 423

# archived credentials that are not found without --depth
SAMPLES_IN_DEEP_1 = SAMPLES_POST_CRED_COUNT + 23
Expand Down
48 changes: 0 additions & 48 deletions tests/common/test_constants.py

This file was deleted.

137 changes: 137 additions & 0 deletions tests/common/test_keyword_pattern.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,137 @@
import pytest

from credsweeper.common.keyword_pattern import KeywordPattern
from credsweeper.config import Config
from credsweeper.credentials import LineData
from credsweeper.utils import Util


class TestKeywordPattern:
    """Checks the regex built by KeywordPattern.get_keyword_pattern through LineData.value."""

    @pytest.mark.parametrize("line", ["melon is 'banana'"])
    def test_separator_n(self, config: Config, file_path: pytest.fixture, line: str) -> None:
        """Negative case: the word 'is' between keyword and quoted text is not a separator."""
        pattern = KeywordPattern.get_keyword_pattern("melon")
        line_data = LineData(config,
                             line,
                             0,
                             1,
                             file_path,
                             Util.get_extension(file_path),
                             info="dummy",
                             pattern=pattern)
        assert line_data.value is None

    @pytest.mark.parametrize("line", ["melon = 'banAna'", "melon : 'banAna'", "melon := 'banAna'"])
    def test_separator_p(self, config: Config, file_path: pytest.fixture, line: str) -> None:
        """Positive case: '=', ':' and ':=' all yield the quoted value with its case kept."""
        pattern = KeywordPattern.get_keyword_pattern("melon")
        line_data = LineData(config,
                             line,
                             0,
                             1,
                             file_path,
                             Util.get_extension(file_path),
                             info="dummy",
                             pattern=pattern)
        assert line_data.value == "banAna"

    # Each entry below is [input line, expected value]. The entries that are
    # commented out and marked "todo" are not exercised by this test.
    @pytest.mark.parametrize(
        "line, value",
        [
            # ['''...log=1;User ID=X3;password=Quantum42!\\""''', '''Quantum42!'''], # todo
            # ["""password='\\\\'secret-1\\\\''""", """\\'secret-1\\'"""], # todo
            # ['''password="\\"secret-2\\""''', '''\\"secret-2\\"'''], # todo
            # ["""password=rb'\\'secret=1\\''""", """\\'secret=1\\'"""], # todo
            # ['''password=f"\\"secret=2\\""''', '''\\"secret=2\\"'''], # todo
            # ['''password=r"\\\\"secret=3\\\\""''', '''\\"secret=3\\"'''], # todo
            # ['''"password = 'sec;$2`\\'[\\/*;ret';";''', '''sec;$2`\\'[\\/*;ret'''], # todo
            ['''"$password = "10qoakxncnfh47t_''', '''10qoakxncnfh47t_'''],  #
            [
                '''copes\":[\"user\"],\"note\":\"Note\",\"password\":\"cc6323cb2223f82f01\",\"upd_at\":\"1765....\",''',
                '''cc6323cb2223f82f01'''
            ],  #
            ['''"password = pas:sword # comment''', '''pas:sword'''],
            ['''x.password=pK5C4tlA/w1cO\\=\\=''', '''pK5C4tlA/w1cO\\=\\='''],  #
            ['''final String body = \"{ \\"passwords\\":\\"i0sEcReT\\\\/MwX3X\\","''', '''i0sEcReT\\\\/MwX3X'''],
            [
                '''\\\"password\\\"=\\u0026gt;\t\\n\\t\\\"lfFTfDT1roc4YbG9hy5cnvX\\n oZ+Sc/wb+CvdF4s==\\\",\\n",''',
                '''lfFTfDT1roc4YbG9hy5cnvX\\n oZ+Sc/wb+CvdF4s=='''
            ],
            [
                '''var request = {"password": "{\\"wks\\": \\"8x9s3ga7\\", \\"uzr\": \\"wbm\\"}","Any-Tail":"x\r"};''',
                '''{\\"wks\\": \\"8x9s3ga7\\", \\"uzr": \\"wbm\\"}'''
            ],
            ['''passwords: ["1029384756",''', '''1029384756'''],  #
            ['''passwords:[ "1029384756", "9801726354" ]''', '''1029384756'''],  #
            ['''password="\\"secret-line-wrap\\''', '''secret-line-wrap'''],  #
            ['''password=r"""secret4"""''', '''secret4'''],  #
            ['''password=r\\"\\"\\"secret5\\"\\"\\"''', '''secret5'''],  #
            ['''password="""secret6"""''', '''secret6'''],  #
            ['''password=\\\\"\\\\"\\\\"secret7\\\\"\\\\"\\\\"''', '''secret7'''],  #
            ['''password=\\\\"\\\\"\\\\"secret"7\\\\"\\\\"\\\\"''', '''secret"7'''],  #
            ['''password="""{\\"secret8\\"}"""''', '''{\\"secret8\\"}'''],  #
            ['''password="""secret'9"""''', '''secret'9'''],  #
            ["""password='''secret'6'''""", '''secret'6'''],  #
            ["""password='''secret`8'''""", '''secret`8'''],  #
            ["""password=``secret`7``""", '''secret`7'''],  #
            ["""password=``secret 5``""", '''secret 5'''],  #
            ["""password='secret\\ 5''""", '''secret\\ 5'''],  #
            ["""password=secret\\ 5""", '''secret\\ 5'''],  #
            ["""password=secret 0""", '''secret'''],  #
            ["""password=secret0\\""", '''secret0'''],  #
            ["""password=r'\\"secret\\"'""", '''\\"secret\\"'''],  #
            ['''password=r\\"{\\\\"secret\\\\"}\\"''', '{\\\\"secret\\\\"}'],  #
            ['''password=r"{\\"secret\\"}"''', '{\\"secret\\"}'],  #
            ["""password=b'"secret4"'""", '"secret4"'],  #
            ["""password=rb'\\\\"secret\\\\"'""", '\\\\"secret\\\\"'],  #
            ["""password=r\\'"sec'"'"'"ret"\\'""", '''"sec'"'"'"ret"'''],  #
            ["""\\'\\\\\\\\'password\\\\\\\\': b\\\\\\\\'secret\\\\\\\\'\\'""", "secret"],  #
            ["""'password': b'secret'""", """secret"""],  #
            ["""'password': r'secret'""", """secret"""],  #
            ["""'password': fr'secret'""", """secret"""],  #
            ["""\\'password\\': \\'secret\\'""", """secret"""],  #
            ['''db.setCred("{ \"password\" : \"" + SECRET + "\" }");''', ''' + SECRET + '''],
            ['''\\"password\\": \\"{\\\\"secret\\\\": \\\\"test\\\\"}\\"''', '{\\\\"secret\\\\": \\\\"test\\\\"}'],  #
            ['''"password": "{\\\\"secret\\\\": \\\\"test\\\\"}"''', '{\\\\"secret\\\\": \\\\"test\\\\"}'],  #
            #normal_str = "First line.\nSecond line.\nEnd of message.\n";
            ['''std::string password = R"multiline\\npassword";''', '''multiline\\npassword'''],  #
            ['''const wchar_t* password = L"wchar_t*secret";''', '''wchar_t*secret'''],  #
            ['''const char16_t* password = U"char16_t*secret";''', '''char16_t*secret'''],  #
            [
                '''char password[] = {'S', 'E', 'C', 'R', 'E', 'T', '\\0'};''',
                '''{'S', 'E', 'C', 'R', 'E', 'T', '\\0'}'''
            ],  #
            ['''"password": "{8vi6wL+10fI/eibC7wFwc}"''', '{8vi6wL+10fI/eibC7wFwc}'],  #
            ['''final String password = new String("SECRET") {''', '''SECRET'''],  #
            ['''final OAuth2AccessToken password = new OAuth2AccessToken(\"SEC.RET\");''', '''SEC.RET'''],  #
            ['''password = obfuscate(arg="SECRET") {''', '''SECRET'''],  #
            ['''final String password = new String(Super(Encrypted("SECRET"))) {''', '''SECRET'''],  #
            ['''final String password = new String(Super( Encrypted("SECRET", "dummy"))) {''', '''SECRET'''],  #
            ["""'password': 'ENC(lqjdoxlandicpfpqk)'""", """ENC(lqjdoxlandicpfpqk)"""],  #
            ["""'password': 'ENC[lqjdoxlandicpfpqk]'""", """ENC[lqjdoxlandicpfpqk]"""],  #
            ['''password24=secret42''', 'secret42'],  #
            ['''password24=secret42\\ ''', 'secret42\\ '],  #
            ['''password24=secret42\\''', 'secret42'],  #
            ['''password24=secret42\\n''', 'secret42'],  #
            ['password = 3VNdhWT3oFo5I7faffKO\\\neEagnK7tYBcGxhla\n;', '''3VNdhWT3oFo5I7faffKO'''],
            ['password = "3VNdhWT3oFo5I7faffKO\n gnK7tYBcGxhla\n";', '''3VNdhWT3oFo5I7faffKO\n gnK7tYBcGxhla\n'''],
        ])
    def test_keyword_pattern_p(self, config: Config, file_path: pytest.fixture, line: str, value: str) -> None:
        """Positive cases: each line must yield exactly the expected value for keyword 'password'."""
        pattern = KeywordPattern.get_keyword_pattern("password")
        line_data = LineData(config,
                             line,
                             0,
                             1,
                             file_path,
                             Util.get_extension(file_path),
                             info="dummy",
                             pattern=pattern)
        assert line_data.value == value

    @pytest.mark.parametrize("line", [
        "https://fonts.googleapis.com/css2?family=Montserrat:wght@500;700;900&family=Roboto:wght@300;400;500;700;900"
        "&family=Roboto+Mono:wght@300;400;600;900&display=swap"
    ])
    def test_keyword_pattern_n(self, config: Config, file_path: pytest.fixture, line: str) -> None:
        """Negative case: a fonts URL containing 'api' must not yield a value for keyword 'api'."""
        pattern = KeywordPattern.get_keyword_pattern("api")
        line_data = LineData(config, line, 0, 1, file_path, "file_type", "info", pattern)
        assert line_data.value is None
Loading
Loading