Skip to content

Commit

Permalink
fix \r in xlsx (#631)
Browse files Browse the repository at this point in the history
* fix \r in xlsx

* Update credsweeper/filters/value_discord_bot_check.py
  • Loading branch information
babenek authored Dec 12, 2024
1 parent 7838888 commit e5a4ee8
Show file tree
Hide file tree
Showing 10 changed files with 667 additions and 250 deletions.
7 changes: 3 additions & 4 deletions credsweeper/deep_scanner/xlsx_scanner.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,11 +25,10 @@ def data_scan(
candidates = []
try:
book = pd.read_excel(io.BytesIO(data_provider.data), sheet_name=None, header=None)
sheet_lines = []
for sheet_name, sheet_data in book.items():
text = sheet_data.fillna('').astype(str)
for i in text.values:
sheet_lines.append('\t'.join(i))
# replace Office Open XML escaped carriage returns (_x000D_) only where they directly precede a line feed
df = sheet_data.replace(to_replace="_x000D_\n", value='\n', regex=True).fillna('').astype(str)
sheet_lines = ['\t'.join(x) for x in df.values]
string_data_provider = StringContentProvider(lines=sheet_lines,
file_path=data_provider.file_path,
file_type=data_provider.file_type,
Expand Down
8 changes: 7 additions & 1 deletion credsweeper/filters/value_discord_bot_check.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
import contextlib

from credsweeper.common.constants import Chars
from credsweeper.config import Config
from credsweeper.credentials import LineData
from credsweeper.file_handler.analysis_target import AnalysisTarget
from credsweeper.filters import Filter
from credsweeper.filters.value_entropy_base64_check import ValueEntropyBase64Check
from credsweeper.utils import Util


Expand All @@ -28,6 +30,10 @@ def run(self, line_data: LineData, target: AnalysisTarget) -> bool:
# '.' must be present in the value according to the regex
dot_separator_index = line_data.value.index('.')
id_part = line_data.value[:dot_separator_index]
if int(Util.decode_base64(id_part, padding_safe=True, urlsafe_detect=True)):
discord_id = int(Util.decode_base64(id_part, padding_safe=True, urlsafe_detect=True))
entropy_part = line_data.value[dot_separator_index:]
entropy = Util.get_shannon_entropy(entropy_part, Chars.BASE64STD_CHARS.value)
min_entropy = ValueEntropyBase64Check.get_min_data_entropy(len(line_data.value))
if not 1000 > discord_id and not min_entropy < entropy:
return False
return True
4 changes: 2 additions & 2 deletions tests/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,10 +17,10 @@
SAMPLES_POST_CRED_COUNT = SAMPLES_CRED_COUNT - ML_FILTERED

# with option --doc
SAMPLES_IN_DOC = 453
SAMPLES_IN_DOC = 463

# archived credentials that are not found without --depth
SAMPLES_IN_DEEP_1 = SAMPLES_POST_CRED_COUNT + 29
SAMPLES_IN_DEEP_1 = SAMPLES_POST_CRED_COUNT + 33
SAMPLES_IN_DEEP_2 = SAMPLES_IN_DEEP_1 + 54
SAMPLES_IN_DEEP_3 = SAMPLES_IN_DEEP_2 + 1

Expand Down
432 changes: 270 additions & 162 deletions tests/data/depth_3.json

Large diffs are not rendered by default.

Loading

0 comments on commit e5a4ee8

Please sign in to comment.