Skip to content

Commit

Permalink
Merge branch 'main' into babenek-patch-1
Browse files Browse the repository at this point in the history
  • Loading branch information
babenek authored Dec 18, 2024
2 parents a846659 + ee0c9b5 commit 6df255e
Show file tree
Hide file tree
Showing 48 changed files with 10,775 additions and 4,790 deletions.
30 changes: 15 additions & 15 deletions .ci/benchmark.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
META MD5 30ecf5f4796a36b60ca12cb702152bab
META MD5 b33b22ce3adc2141bcf91e4cdd6f1cab
DATA MD5 9ac09dae7d8873d53e1fbf18da2d71c4
DATA: 16329853 interested lines. MARKUP: 59549 items
FileType FileNumber ValidLines Positives Negatives Templates
Expand Down Expand Up @@ -82,7 +82,7 @@ FileType FileNumber ValidLines Positives Negatives Templat
.ipynb 1 134 6
.j 1 241 4
.j2 30 5530 6 174 10
.java 613 133184 347 1321 171
.java 613 133184 347 1323 171
.jenkinsfile 1 58 2 6
.jinja2 1 64 2
.js 653 532652 512 2450 331
Expand Down Expand Up @@ -222,23 +222,23 @@ FileType FileNumber ValidLines Positives Negatives Templat
.yml 418 36057 522 910 376
.zsh 6 872 12
.zsh-theme 1 97 1
TOTAL: 10003 16329853 11856 46609 5084
credsweeper result_cnt : 11613, lost_cnt : 0, true_cnt : 11349, false_cnt : 264
TOTAL: 10003 16329853 11856 46611 5084
credsweeper result_cnt : 11623, lost_cnt : 0, true_cnt : 11391, false_cnt : 232
Rules Positives Negatives Templates Reported TP FP TN FN FPR FNR ACC PRC RCL F1
------------------------------ ----------- ----------- ----------- ---------- ----- ---- ----- ---- -------- -------- -------- -------- -------- --------
API 130 3166 188 125 123 2 3352 7 0.000596 0.053846 0.997417 0.984000 0.946154 0.964706
API 130 3166 188 126 125 1 3353 5 0.000298 0.038462 0.998278 0.992063 0.961538 0.976562
AWS Client ID 168 21 0 160 160 0 21 8 0.000000 0.047619 0.957672 1.000000 0.952381 0.975610
AWS Multi 82 10 0 84 82 1 9 0 0.100000 0.000000 0.989130 0.987952 1.000000 0.993939
AWS S3 Bucket 67 23 0 92 67 23 0 0 1.000000 0.000000 0.744444 0.744444 1.000000 0.853503
Atlassian Old PAT token 3 7 0 10 3 7 0 0 1.000000 0.000000 0.300000 0.300000 1.000000 0.461538
Auth 417 2739 82 393 390 3 2818 27 0.001063 0.064748 0.990735 0.992366 0.935252 0.962963
Auth 417 2741 82 392 387 5 2818 30 0.001771 0.071942 0.989198 0.987245 0.928058 0.956737
Azure Access Token 19 0 0 12 12 0 0 7 0.368421 0.631579 1.000000 0.631579 0.774194
BASE64 Private Key 12 4 0 12 12 0 4 0 0.000000 0.000000 1.000000 1.000000 1.000000 1.000000
BASE64 encoded PEM Private Key 7 0 0 5 5 0 0 2 0.285714 0.714286 1.000000 0.714286 0.833333
Bitbucket Client ID 19 53 0 75 19 53 0 0 1.000000 0.000000 0.263889 0.263889 1.000000 0.417582
Bitbucket Client Secret 28 66 1 98 28 67 0 0 1.000000 0.000000 0.294737 0.294737 1.000000 0.455285
CMD ConvertTo-SecureString 13 4 0 13 13 0 4 0 0.000000 0.000000 1.000000 1.000000 1.000000 1.000000
CMD Password 21 128 6 18 18 0 134 3 0.000000 0.142857 0.980645 1.000000 0.857143 0.923077
CMD Password 21 128 6 20 20 0 134 1 0.000000 0.047619 0.993548 1.000000 0.952381 0.975610
CMD Secret 1 1 0 1 1 0 1 0 0.000000 0.000000 1.000000 1.000000 1.000000 1.000000
CMD Token 6 0 0 6 6 0 0 0 0.000000 1.000000 1.000000 1.000000 1.000000
Certificate 24 471 0 20 20 0 471 4 0.000000 0.166667 0.991919 1.000000 0.833333 0.909091
Expand All @@ -257,19 +257,19 @@ Grafana Provisioned API Key 22 1 0
JSON Web Token 170 61 0 131 131 0 61 39 0.000000 0.229412 0.831169 1.000000 0.770588 0.870432
Jira / Confluence PAT token 0 4 0 0 0 4 0 0.000000 1.000000
Jira 2FA 15 6 1 12 12 0 7 3 0.000000 0.200000 0.863636 1.000000 0.800000 0.888889
Key 3911 15715 485 3944 3893 51 16149 18 0.003148 0.004602 0.996569 0.987069 0.995398 0.991216
Nonce 93 49 0 91 90 1 48 3 0.020408 0.032258 0.971831 0.989011 0.967742 0.978261
Key 3911 15715 485 3921 3896 25 16175 15 0.001543 0.003835 0.998011 0.993624 0.996165 0.994893
Nonce 93 49 0 91 91 0 49 2 0.000000 0.021505 0.985915 1.000000 0.978495 0.989130
Other 9 7447 5 0 0 7452 9 0.000000 1.000000 0.998794 0.000000
PEM Private Key 1019 1483 0 1023 1019 4 1479 0 0.002697 0.000000 0.998401 0.996090 1.000000 0.998041
Password 1869 7536 2680 1774 1756 18 10198 113 0.001762 0.060460 0.989160 0.989853 0.939540 0.964041
Salt 47 76 1 44 44 0 77 3 0.000000 0.063830 0.975806 1.000000 0.936170 0.967033
Secret 1297 1576 802 1288 1283 5 2373 14 0.002103 0.010794 0.994830 0.996118 0.989206 0.992650
Password 1869 7536 2680 1795 1782 13 10203 87 0.001273 0.046549 0.991725 0.992758 0.953451 0.972707
Salt 47 76 1 45 45 0 77 2 0.000000 0.042553 0.983871 1.000000 0.957447 0.978261
Secret 1297 1576 802 1292 1288 4 2374 9 0.001682 0.006939 0.996463 0.996904 0.993061 0.994979
Seed 1 6 0 0 0 6 1 0.000000 1.000000 0.857143 0.000000
Slack Token 4 1 0 4 4 0 1 0 0.000000 0.000000 1.000000 1.000000 1.000000 1.000000
Stripe Credentials 2 0 0 2 2 0 0 0 0.000000 1.000000 1.000000 1.000000 1.000000
Tencent WeChat API App ID 6 0 0 6 6 0 0 0 0.000000 1.000000 1.000000 1.000000 1.000000
Token 644 4170 454 617 615 2 4622 29 0.000433 0.045031 0.994115 0.996759 0.954969 0.975416
Token 644 4170 454 618 617 1 4623 27 0.000216 0.041925 0.994685 0.998382 0.958075 0.977813
Twilio Credentials 30 39 0 30 30 0 39 0 0.000000 0.000000 1.000000 1.000000 1.000000 1.000000
URL Credentials 210 157 215 205 205 0 372 5 0.000000 0.023810 0.991409 1.000000 0.976190 0.987952
URL Credentials 210 157 215 209 208 1 371 2 0.002688 0.009524 0.994845 0.995215 0.990476 0.992840
UUID 1075 265 0 1074 1073 1 264 2 0.003774 0.001860 0.997761 0.999069 0.998140 0.998604
11856 46609 5084 11626 11349 264 46345 507 0.005664 0.042763 0.986813 0.977267 0.957237 0.967148
11856 46611 5084 11636 11391 232 46379 465 0.004977 0.039221 0.988079 0.980040 0.960779 0.970314
4 changes: 2 additions & 2 deletions .github/workflows/check.yml
Original file line number Diff line number Diff line change
Expand Up @@ -40,8 +40,8 @@ jobs:
- name: Check ml_model.onnx integrity
if: ${{ always() && steps.code_checkout.conclusion == 'success' }}
run: |
md5sum --binary credsweeper/ml_model/ml_config.json | grep 49c4352ae9ec82ad432d49d7e51c27f1
md5sum --binary credsweeper/ml_model/ml_model.onnx | grep ff66e97c446d0f2bbd8d37b7dfff7361
md5sum --binary credsweeper/ml_model/ml_config.json | grep ec3ac77a923fed769fd95d567ef75137
md5sum --binary credsweeper/ml_model/ml_model.onnx | grep a707745d781517556fd58890cb2812be
# # # line ending

Expand Down
2 changes: 1 addition & 1 deletion .readthedocs.yml
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ version: 2

# Use build.os instead of build.image on your configuration file https://blog.readthedocs.com/use-build-os-config/
build:
os: "ubuntu-24.04"
os: "ubuntu-latest"
tools:
python: "3.10"

Expand Down
54 changes: 27 additions & 27 deletions credsweeper/common/constants.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import string
import typing
from enum import Enum
from typing import Optional, Union
Expand Down Expand Up @@ -59,41 +60,37 @@ def get(confidence: Union[str, "Confidence"]) -> Optional["Confidence"]:
return None


class Base(Enum):
"""Stores types of character sets in lower case"""
digits = "digits"
ascii_uppercase = "ascii_uppercase"
ascii_lowercase = "ascii_lowercase"
base16upper = "base16upper"
base16lower = "base16lower"
base32 = "base32"
base36 = "base36"
base64 = "base64"
base64std = "base64std"
base64url = "base64url"
hex = "hex"


class Chars(Enum):
"""Stores three types characters sets.
"""
"""Stores enumeration of characters sets of encoding dictionaries"""

# set of characters, hexadecimal numeral system (Base16). Upper- and lowercase
HEX_CHARS = "0123456789ABCDEFabcdef"
HEX_CHARS = string.digits + "ABCDEFabcdef"
# UUID charset in uppercase
UUID_UPPER_CHARS = string.digits + "ABCDEF-"
# UUID charset in lowercase
UUID_LOWER_CHARS = string.digits + "abcdef-"
# set of characters, hexadecimal numeral system (Base16). Uppercase
BASE16UPPER = "0123456789ABCDEF"
BASE16UPPER = string.digits + "ABCDEF"
# set of characters, hexadecimal numeral system (Base16). Lowercase
BASE16LOWER = "0123456789abcdef"
BASE16LOWER = string.digits + "abcdef"
# set of 32 characters, used in Base32 encoding
BASE32_CHARS = "ABCDEFGHIJKLMNOPQRSTUVWXYZ234567"
BASE32_CHARS = string.ascii_uppercase + "234567"
# set of 36 characters, used in Base36 encoding
BASE36_CHARS = "abcdefghijklmnopqrstuvwxyz1234567890"
# standard base64 with padding sign
BASE64_CHARS = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/="
BASE36_CHARS = string.digits + string.ascii_lowercase
# base62 set https://en.wikipedia.org/wiki/Base62
BASE62_CHARS = string.digits + string.ascii_letters
# URL- and filename-safe standard
BASE64URL_CHARS = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_"
# standard base64
BASE64STD_CHARS = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"
BASE64URL_CHARS = string.digits + string.ascii_letters + "-_"
# URL- and filename-safe standard plus padding sign
BASE64URLPAD_CHARS = string.digits + string.ascii_letters + "-_="
# standard base64 charset
BASE64STD_CHARS = string.digits + string.ascii_letters + "+/"
# standard base64 plus padding sign
BASE64STDPAD_CHARS = string.digits + string.ascii_letters + "+/="
# except whitespaces
ASCII_VISIBLE = string.digits + string.ascii_letters + string.punctuation
# all printable symbols
ASCII_PRINTABLE = string.printable


ENTROPY_LIMIT_BASE64 = 4.5
Expand Down Expand Up @@ -179,3 +176,6 @@ class DiffRowType(Enum):
# PEM x509 patterns
PEM_BEGIN_PATTERN = "-----BEGIN"
PEM_END_PATTERN = "-----END"

# similar min_line_len in rule_template - no real credential in data less than 8 bytes
MIN_DATA_LEN = 8
2 changes: 1 addition & 1 deletion credsweeper/deep_scanner/pdf_scanner.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ def data_scan(
pdf_content_provider = DataContentProvider(
data=element_text.encode(),
file_path=data_provider.file_path,
file_type=".xml",
file_type=data_provider.file_type,
info=f"{data_provider.info}|PDF:{page.pageid}")
new_limit = recursive_limit_size - len(pdf_content_provider.data)
element_candidates = self.recursive_scan(pdf_content_provider, depth, new_limit)
Expand Down
5 changes: 1 addition & 4 deletions credsweeper/file_handler/data_content_provider.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,17 +8,14 @@
import yaml
from bs4 import BeautifulSoup, Tag, XMLParsedAsHTMLWarning

from credsweeper.common.constants import DEFAULT_ENCODING, ASCII
from credsweeper.common.constants import DEFAULT_ENCODING, ASCII, MIN_DATA_LEN
from credsweeper.file_handler.analysis_target import AnalysisTarget
from credsweeper.file_handler.content_provider import ContentProvider
from credsweeper.utils import Util

warnings.filterwarnings("ignore", category=XMLParsedAsHTMLWarning, module='bs4')
logger = logging.getLogger(__name__)

# similar min_line_len in rule_template - no real credential in data less than 8 bytes
MIN_DATA_LEN = 8

# 8 bytes encodes to 12 symbols 12345678 -> MTIzNDU2NzgK
MIN_ENCODED_DATA_LEN = 12

Expand Down
2 changes: 1 addition & 1 deletion credsweeper/filters/value_base64_part_check.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ class ValueBase64PartCheck(Filter):
"""

base64_pattern = re.compile(r"^(\\{1,8}[0abfnrtv]|[0-9A-Za-z+/=]){1,4000}")
base64_set = set(Chars.BASE64_CHARS.value)
base64_set = set(Chars.BASE64STDPAD_CHARS.value)

def __init__(self, config: Config = None) -> None:
pass
Expand Down
12 changes: 6 additions & 6 deletions credsweeper/filters/value_file_path_check.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,9 +13,9 @@ class ValueFilePathCheck(Filter):
Check if a value contains either '/' or ':\' separators (but not both)
and do not have any special characters ( !$@`&*()+)
"""
base64_possible_set = set(Chars.BASE64_CHARS.value) | set(Chars.BASE64URL_CHARS.value)
unusual_windows_symbols_in_path = "\t\n\r !$@`&*()[]{}<>+=;,~^"
unusual_linux_symbols_in_path = unusual_windows_symbols_in_path + ":\\"
base64stdpad_possible_set = set(Chars.BASE64STDPAD_CHARS.value)
unusual_windows_symbols_in_path = "\t\n\r!$@`&*(){}<>+=;,~^"
unusual_linux_symbols_in_path = "\t\n\r!@`&*<>+=;,~^:\\"

def __init__(self, config: Config = None) -> None:
pass
Expand Down Expand Up @@ -48,12 +48,12 @@ def run(self, line_data: LineData, target: AnalysisTarget) -> bool:
# get minimal entropy to compare with shannon entropy of found value
# min_entropy == 0 means that the value cannot be checked with the entropy due high variance
for i in value:
if i not in self.base64_possible_set:
# value contains wrong BASE64STD_CHARS symbols like .
if i not in self.base64stdpad_possible_set:
# value contains wrong BASE64STDPAD_CHARS symbols like -_
break
else:
# all symbols are from base64 alphabet
entropy = Util.get_shannon_entropy(value, Chars.BASE64STD_CHARS.value)
entropy = Util.get_shannon_entropy(value, Chars.BASE64STDPAD_CHARS.value)
if 0 == min_entropy or min_entropy > entropy:
contains_unix_separator = 1 < value.count('/')
else:
Expand Down
7 changes: 3 additions & 4 deletions credsweeper/ml_model/features/__init__.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,11 @@
from credsweeper.ml_model.features.char_set import CharSet
from credsweeper.ml_model.features.entropy_evaluation import EntropyEvaluation
from credsweeper.ml_model.features.file_extension import FileExtension
from credsweeper.ml_model.features.hartley_entropy import HartleyEntropy
from credsweeper.ml_model.features.has_html_tag import HasHtmlTag
from credsweeper.ml_model.features.is_secret_numeric import IsSecretNumeric
from credsweeper.ml_model.features.reny_entropy import RenyiEntropy
from credsweeper.ml_model.features.length_of_attribute import LengthOfAttribute
from credsweeper.ml_model.features.morpheme_dense import MorphemeDense
from credsweeper.ml_model.features.rule_name import RuleName
from credsweeper.ml_model.features.search_in_attribute import SearchInAttribute
from credsweeper.ml_model.features.shannon_entropy import ShannonEntropy
from credsweeper.ml_model.features.word_in_line import WordInLine
from credsweeper.ml_model.features.word_in_path import WordInPath
from credsweeper.ml_model.features.word_in_value import WordInValue
Expand Down
41 changes: 0 additions & 41 deletions credsweeper/ml_model/features/char_set.py

This file was deleted.

Loading

0 comments on commit 6df255e

Please sign in to comment.