Export results transformation (subtext and hashed) (#582)
* square bracket workaround in keyword regex

* path filter

* BM score fix

* ValueStringTypeCheck workaround for heterogeneous source

* wrap added to filter array definitions

* TOML format sanitizer

* YAML case

* BM fix

* BM scores fix

* [skip actions] [subhashtext] 2024-08-12T21:32:30+03:00

* variable is hashed too

* hash & subtext test

* testBM

* updBMscor

* refactoring

* skip f* in BM experiment

* keep 0*-3* meta for experiment

* less repos in test

* refactoring2

* read_text.cache_clear()

* --subtext in benchmark

* [skip actions] [subhashtext] 2024-08-13T12:52:11+03:00

* [skip actions] [subhashtext] 2024-08-13T12:55:14+03:00

* fix

* subtext

* experiment ml rollback

* BM scores with hashes

* some rollbacks
babenek authored Aug 14, 2024
1 parent 45e0643 commit 5e2bf59
Showing 20 changed files with 1,243 additions and 1,135 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/benchmark.yml
@@ -121,7 +121,7 @@ jobs:
- name: Run CredSweeper tool
run: |
-credsweeper --banner --log info --jobs $(nproc) --path data --save-json report.${{ github.event.pull_request.head.sha }}.json | tee credsweeper.${{ github.event.pull_request.head.sha }}.log
+credsweeper --banner --log info --jobs $(nproc) --subtext --path data --save-json report.${{ github.event.pull_request.head.sha }}.json | tee credsweeper.${{ github.event.pull_request.head.sha }}.log
- name: Run Benchmark
run: |
62 changes: 32 additions & 30 deletions cicd/benchmark.txt

Large diffs are not rendered by default.

12 changes: 11 additions & 1 deletion credsweeper/__main__.py
@@ -8,7 +8,7 @@

from credsweeper import __version__
from credsweeper.app import APP_PATH, CredSweeper
-from credsweeper.common.constants import ThresholdPreset, Severity, RuleType, DiffRowType
+from credsweeper.common.constants import ThresholdPreset, Severity, RuleType, DiffRowType, ML_HUNK
from credsweeper.file_handler.abstract_provider import AbstractProvider
from credsweeper.file_handler.files_provider import FilesProvider
from credsweeper.file_handler.patches_provider import PatchesProvider
@@ -215,6 +215,14 @@ def get_arguments() -> Namespace:
const="output.xlsx",
dest="xlsx_filename",
metavar="PATH")
parser.add_argument("--hashed",
help="line, variable, value will be hashed in output",
action="store_const",
const=True)
parser.add_argument("--subtext",
help=f"line text will be stripped in {2 * ML_HUNK} symbols but value and variable are kept",
action="store_const",
const=True)
parser.add_argument("--sort", help="enable output sorting", dest="sort_output", action="store_true")
parser.add_argument("--log",
"-l",
@@ -282,6 +290,8 @@ def scan(args: Namespace, content_provider: AbstractProvider, json_filename: Opt
api_validation=args.api_validation,
json_filename=json_filename,
xlsx_filename=xlsx_filename,
hashed=args.hashed,
subtext=args.subtext,
sort_output=args.sort_output,
use_filters=args.no_filters,
pool_count=args.jobs,
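A note on the argparse pattern used above: with action="store_const" and const=True, each flag is None when omitted (falsy) and True when passed, so downstream code can treat the attributes as plain booleans. A minimal sketch of that behavior:

    from argparse import ArgumentParser

    parser = ArgumentParser()
    parser.add_argument("--hashed", action="store_const", const=True)
    parser.add_argument("--subtext", action="store_const", const=True)

    assert parser.parse_args([]).hashed is None  # omitted flag stays None
    assert parser.parse_args(["--hashed", "--subtext"]).subtext is True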
13 changes: 10 additions & 3 deletions credsweeper/app.py
@@ -42,6 +42,8 @@ def __init__(self,
api_validation: bool = False,
json_filename: Union[None, str, Path] = None,
xlsx_filename: Union[None, str, Path] = None,
hashed: bool = False,
subtext: bool = False,
sort_output: bool = False,
use_filters: bool = True,
pool_count: int = 1,
@@ -70,6 +72,8 @@ def __init__(self,
to json
xlsx_filename: optional string variable, path to save result
to xlsx
hashed: use hash of line, value and variable instead of plain text
subtext: use a subtext of the line near the variable-value pair, as is done for ML input
use_filters: boolean variable, specifying the need of rule filters
pool_count: int value, number of parallel processes to use
ml_batch_size: int value, size of the batch for model inference
@@ -104,6 +108,8 @@ def __init__(self,
self.credential_manager = CredentialManager()
self.json_filename: Union[None, str, Path] = json_filename
self.xlsx_filename: Union[None, str, Path] = xlsx_filename
self.hashed = hashed
self.subtext = subtext
self.sort_output = sort_output
self.ml_batch_size = ml_batch_size if ml_batch_size and 0 < ml_batch_size else 16
self.ml_threshold = ml_threshold
@@ -400,16 +406,17 @@ def export_results(self) -> None:

if self.json_filename:
is_exported = True
-Util.json_dump([credential.to_json() for credential in credentials], file_path=self.json_filename)
+Util.json_dump([credential.to_json(hashed=self.hashed, subtext=self.subtext) for credential in credentials],
+               file_path=self.json_filename)

if self.xlsx_filename:
is_exported = True
data_list = []
for credential in credentials:
-data_list.extend(credential.to_dict_list())
+data_list.extend(credential.to_dict_list(hashed=self.hashed, subtext=self.subtext))
df = pd.DataFrame(data=data_list)
df.to_excel(self.xlsx_filename, index=False)

if is_exported is False:
for credential in credentials:
-print(credential)
+print(credential.to_str(hashed=self.hashed, subtext=self.subtext))
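With export_results() now threading the options through to_json(), to_dict_list() and to_str(), the same two flags govern all three output channels: JSON, XLSX and stdout. A minimal sketch of driving this through the Python API, assuming the provider/run() wiring that the CLI uses; the scanned path is hypothetical:

    from credsweeper.app import CredSweeper
    from credsweeper.file_handler.files_provider import FilesProvider

    provider = FilesProvider(["tests/samples"])  # hypothetical input path
    sweeper = CredSweeper(json_filename="report.json", hashed=True, subtext=True)
    sweeper.run(provider)  # export_results() then writes hashed, stripped findings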
3 changes: 3 additions & 0 deletions credsweeper/common/constants.py
@@ -1,4 +1,5 @@
import re
import typing
from enum import Enum
from typing import Optional, Union

@@ -167,6 +168,8 @@ class DiffRowType(Enum):
DELETED = "deleted"


StartEnd = typing.NamedTuple("StartEnd", [("start", int), ("end", int)])

MIN_VARIABLE_LENGTH = 1
MIN_SEPARATOR_LENGTH = 1
MIN_VALUE_LENGTH = 4
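StartEnd is a functional-style NamedTuple, so a pair of positions travels as a single typed value with named fields instead of a bare tuple. A tiny illustration with made-up positions:

    import typing

    StartEnd = typing.NamedTuple("StartEnd", [("start", int), ("end", int)])

    pos = StartEnd(start=4, end=20)
    assert pos.start == 4 and pos.end - pos.start == 16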
18 changes: 11 additions & 7 deletions credsweeper/credentials/candidate.py
@@ -88,18 +88,22 @@ def is_api_validation_available(self) -> bool:
"""
return len(self.validations) > 0

-def __str__(self) -> str:
+def to_str(self, subtext: bool = False, hashed: bool = False) -> str:
"""Represent candidate with subtext or|and hashed values"""
return f"rule: {self.rule_name}" \
f" | severity: {self.severity.value}" \
f" | confidence: {self.confidence.value}" \
f" | line_data_list: {self.line_data_list}" \
f" | line_data_list: [{', '.join([x.to_str(subtext, hashed) for x in self.line_data_list])}]" \
f" | api_validation: {self.api_validation.name}" \
f" | ml_validation: {self.ml_validation.name}"

def __str__(self):
return self.to_str()

def __repr__(self):
-return str(self)
+return self.to_str(subtext=True)

-def to_json(self) -> Dict:
+def to_json(self, hashed: bool, subtext: bool) -> Dict:
"""Convert credential candidate object to dictionary.
Return:
@@ -116,23 +120,23 @@ def to_json(self) -> Dict:
"confidence": self.confidence.value,
"use_ml": self.use_ml,
# put the array to end to make json more readable
"line_data_list": [line_data.to_json() for line_data in self.line_data_list],
"line_data_list": [line_data.to_json(hashed, subtext) for line_data in self.line_data_list],
}
if self.config is not None:
reported_output = {k: v for k, v in full_output.items() if k in self.config.candidate_output}
else:
reported_output = full_output
return reported_output

-def to_dict_list(self) -> List[dict]:
+def to_dict_list(self, hashed: bool, subtext: bool) -> List[dict]:
"""Convert credential candidate object to List[dict].
Return:
List[dict] object generated from current credential candidate
"""
reported_output = []
-json_output = self.to_json()
+json_output = self.to_json(hashed, subtext)
refined_data = copy.deepcopy(json_output)
del refined_data["line_data_list"]
for line_data in json_output["line_data_list"]:
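The candidate itself transforms nothing: it only forwards hashed and subtext to each LineData, which renders its own fields. A simplified stand-in (not the real classes) showing that delegation:

    import hashlib
    from typing import Dict, List

    class DemoLineData:
        def __init__(self, value: str) -> None:
            self.value = value

        def to_json(self, hashed: bool, subtext: bool) -> Dict:
            # hash only on request; subtext handling is omitted for brevity
            value = hashlib.sha256(self.value.encode()).hexdigest() if hashed else self.value
            return {"value": value}

    class DemoCandidate:
        def __init__(self, line_data_list: List[DemoLineData]) -> None:
            self.line_data_list = line_data_list

        def to_json(self, hashed: bool, subtext: bool) -> Dict:
            return {"line_data_list": [x.to_json(hashed, subtext) for x in self.line_data_list]}

    print(DemoCandidate([DemoLineData("secret")]).to_json(hashed=True, subtext=False))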
66 changes: 57 additions & 9 deletions credsweeper/credentials/line_data.py
@@ -1,10 +1,11 @@
import contextlib
import hashlib
import re
import string
from functools import cached_property
from typing import Any, Dict, Optional, Tuple

-from credsweeper.common.constants import MAX_LINE_LENGTH
+from credsweeper.common.constants import MAX_LINE_LENGTH, UTF_8, StartEnd, ML_HUNK
from credsweeper.config import Config
from credsweeper.utils import Util
from credsweeper.utils.entropy_validator import EntropyValidator
@@ -300,34 +301,81 @@ def is_source_file_with_quotes(self) -> bool:
return True
return False

@staticmethod
def get_hash_or_subtext(
text: Optional[str], #
hashed: bool, #
cut_pos: Optional[StartEnd] = None, #
) -> Optional[str]:
"""Represent not empty text with hash or a "beauty" subtext if required
Args:
text: str - input string
hashed: bool - whether the text will be hashed and returned
cut_pos: Optional[StartEnd] - start, end positions which text must be kept in output
Return:
sha256 hash in hex representation of input text with UTF-8 encodings
or
subtext from start to end, or original text as is
"""
if text:
if hashed:
text = hashlib.sha256(text.encode(UTF_8, errors="strict")).hexdigest()
elif cut_pos is not None:
if 2 * ML_HUNK < cut_pos.end - cut_pos.start:
# subtext positions exceed the limit
text = text[cut_pos.start:cut_pos.end]
else:
strip_text = text.strip()
if 2 * ML_HUNK >= len(strip_text):
# stripped text length meets the limit
text = strip_text
else:
offset = len(text) - len(text.lstrip())
center = (cut_pos.end + cut_pos.start - offset) >> 1
text = Util.subtext(strip_text, center, ML_HUNK)
return text

def to_str(self, subtext: bool = False, hashed: bool = False) -> str:
"""Represent line_data with subtext or|and hashed values"""
cut_pos = StartEnd(self.variable_start, self.value_end) if subtext else None
return f"line: '{self.get_hash_or_subtext(self.line, hashed, cut_pos)}'" \
f" | line_num: {self.line_num} | path: {self.path}" \
f" | value: '{self.get_hash_or_subtext(self.value, hashed)}'" \
f" | entropy_validation: {EntropyValidator(self.value)}"

def __str__(self):
return f"line: '{self.line}' | line_num: {self.line_num} | path: {self.path}" \
f" | value: '{self.value}' | entropy_validation: {EntropyValidator(self.value)}"
return self.to_str()

def __repr__(self):
-return str(self)
+return self.to_str(subtext=True)

-def to_json(self) -> Dict:
+def to_json(self, hashed: bool, subtext: bool) -> Dict:
"""Convert line data object to dictionary.
Return:
Dictionary object generated from current line data
"""
cut_pos = StartEnd(self.variable_start if 0 <= self.variable_start else self.value_start,
self.value_end) if subtext else None
full_output = {
"key": self.key,
"line": self.line,
"line": self.get_hash_or_subtext(self.line, hashed, cut_pos),
"line_num": self.line_num,
"path": self.path,
"info": self.info,
# info may contain variable name - so let it be hashed if requested
"info": self.get_hash_or_subtext(self.info, hashed),
"pattern": self.pattern.pattern,
"separator": self.separator,
"separator_start": self.separator_start,
"separator_end": self.separator_end,
"value": self.value,
"value": self.get_hash_or_subtext(self.value, hashed),
"value_start": self.value_start,
"value_end": self.value_end,
"variable": self.variable,
"variable": self.get_hash_or_subtext(self.variable, hashed),
"variable_start": self.variable_start,
"variable_end": self.variable_end,
"value_leftquote": self.value_leftquote,
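A worked example of the two branches of get_hash_or_subtext(), assuming ML_HUNK = 80 (the "160 symbols" in the --subtext help equals 2 * ML_HUNK); the sample line and positions are hypothetical:

    import hashlib

    ML_HUNK = 80

    line = "password = '" + "x" * 200 + "'"  # 213 characters
    cut_start, cut_end = 0, 213  # variable_start..value_end span

    # hashed branch: the text becomes a stable 64-character sha256 hex digest
    digest = hashlib.sha256(line.encode("utf-8")).hexdigest()
    assert len(digest) == 64

    # subtext branch: a span longer than 2 * ML_HUNK is kept exactly as given
    if 2 * ML_HUNK < cut_end - cut_start:
        line = line[cut_start:cut_end]
    assert len(line) == 213

    # a shorter line would instead be stripped and, if still over the limit,
    # cut to a 2 * ML_HUNK window centered between variable start and value end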
4 changes: 2 additions & 2 deletions credsweeper/utils/util.py
@@ -226,8 +226,8 @@ def decode_bytes(content: bytes, encodings: Optional[List[str]] = None) -> List[
text = content.decode(encoding, errors="strict")
if content != text.encode(encoding, errors="strict"):
raise UnicodeError
-# windows style workaround
-lines = text.replace('\r\n', '\n').replace('\r', '\n').split("\n")
+# windows & macos styles workaround
+lines = text.replace('\r\n', '\n').replace('\r', '\n').split('\n')
break
except UnicodeError:
binary_suggest = True
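The change is cosmetic (comment wording and quote style), but it documents what the code already did: both Windows "\r\n" and classic macOS "\r" line endings normalize to "\n" before splitting. A quick check:

    text = "first\r\nsecond\rthird\n"
    lines = text.replace('\r\n', '\n').replace('\r', '\n').split('\n')
    assert lines == ["first", "second", "third", ""]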
4 changes: 3 additions & 1 deletion docs/source/guide.rst
@@ -15,7 +15,7 @@ Get all argument list:
usage: python -m credsweeper [-h] (--path PATH [PATH ...] | --diff_path PATH [PATH ...] | --export_config [PATH] | --export_log_config [PATH]) [--rules [PATH]] [--severity SEVERITY] [--config [PATH]]
[--log_config [PATH]] [--denylist PATH] [--find-by-ext] [--depth POSITIVE_INT] [--no-filters] [--doc] [--ml_threshold FLOAT_OR_STR] [--ml_batch_size POSITIVE_INT]
-[--azure | --cuda] [--api_validation] [--jobs POSITIVE_INT] [--skip_ignored] [--save-json [PATH]] [--save-xlsx [PATH]] [--sort] [--log LOG_LEVEL] [--size_limit SIZE_LIMIT]
+[--azure | --cuda] [--api_validation] [--jobs POSITIVE_INT] [--skip_ignored] [--save-json [PATH]] [--save-xlsx [PATH]] [--hashed] [--subtext] [--sort] [--log LOG_LEVEL] [--size_limit SIZE_LIMIT]
[--banner] [--version]
options:
-h, --help show this help message and exit
@@ -49,6 +49,8 @@ Get all argument list:
--skip_ignored parse .gitignore files and skip credentials from ignored objects
--save-json [PATH] save result to json file (default: output.json)
--save-xlsx [PATH] save result to xlsx file (default: output.xlsx)
--hashed line, variable, value will be hashed in output
--subtext line text will be stripped to 160 symbols, but value and variable are kept
--sort enable output sorting
--log LOG_LEVEL, -l LOG_LEVEL
provide logging level of ['DEBUG', 'INFO', 'WARN', 'WARNING', 'ERROR', 'FATAL', 'CRITICAL', 'SILENCE'](default: 'warning', case insensitive)
4 changes: 2 additions & 2 deletions experiment/main.py
@@ -20,7 +20,7 @@
from experiment.src.features import prepare_data
from experiment.src.lstm_model import get_model
from experiment.src.model_config_preprocess import model_config_preprocess
-from experiment.src.prepare_data import prepare_train_data, meta_checksum
+from experiment.src.prepare_data import prepare_train_data, data_checksum


def evaluate_model(thresholds: dict, keras_model: Model, x_data: List[np.ndarray], y_label: np.ndarray):
@@ -59,7 +59,7 @@ def main(cred_data_location: str, jobs: int) -> str:
prepare_train_data(_cred_data_location, jobs)

# detected data means which data is passed to ML validator of credsweeper after filters with RuleName
-detected_data = read_detected_data(f"results/detected_data.{meta_checksum(cred_data_location)}.json")
+detected_data = read_detected_data(f"results/detected_data.{data_checksum(cred_data_location)}.json")
print(f"CredSweeper detected {len(detected_data)} credentials without ML")
# all markup data
meta_data = read_metadata(f"{cred_data_location}/meta")
4 changes: 2 additions & 2 deletions experiment/main.sh
@@ -18,7 +18,7 @@ if [ 0 -ne ${error_code} ]; then exit ${error_code}; fi

cd ${CREDSWEEPER_DIR}
report_file=${RESULT_DIR}/${now}.json
-${CREDSWEEPER_DIR}/.venv/bin/python -m credsweeper --sort --path ~/q/DataCred/auxiliary/data/ --log info --job $(nproc) --save-json ${report_file}
+${CREDSWEEPER_DIR}/.venv/bin/python -m credsweeper --sort --path ~/q/DataCred/abspos/data/ --log info --job $(nproc) --subtext --save-json ${report_file}

-cd ~/q/DataCred/auxiliary/
+cd ~/q/DataCred/abspos/
.venv/bin/python -m benchmark --scanner credsweeper --load ${report_file} | tee ${report_file}.log
32 changes: 16 additions & 16 deletions experiment/src/data_loader.py
@@ -3,6 +3,7 @@
import os
import pathlib
from copy import deepcopy
from functools import cache
from typing import Tuple, Dict, Set, Any

import numpy as np
@@ -38,14 +39,7 @@ def read_detected_data(file_path: str) -> Dict[identifier, Dict]:
line_data = deepcopy(cred["line_data_list"][0])
line_data.pop("entropy_validation")
line_data.pop("info")
line = line_data["line"].lstrip()
offset = len(line_data["line"]) - len(line)
line_data["line"] = line.rstrip()
line_data["value_start"] -= offset
line_data["value_end"] -= offset
line_data["variable_start"] -= offset
line_data["variable_end"] -= offset
assert line_data["value"] == line_data["line"][line_data["value_start"]:line_data["value_end"]], line_data
line_data["line"] = None # will be read during join_label with data for ML input only
meta_path = transform_to_meta_path(line_data["path"])
line_data["path"] = meta_path
line_data["RuleName"] = [rule_name]
@@ -143,11 +137,20 @@ def get_colored_line(line_data: Dict[str, Any]) -> str:

def join_label(detected_data: Dict[identifier, Dict], meta_data: Dict[identifier, Dict],
cred_data_location: str) -> pd.DataFrame:

@cache
def read_text(path) -> list[str]:
with open(path, "r", encoding="utf8") as f:
return f.read().replace("\r\n", '\n').replace('\r', '\n').split('\n')

values = []
detected_rules: Set[str] = set()
for index, line_data in detected_data.items():
for i in line_data["RuleName"]:
detected_rules.add(i)
text = read_text(f'{cred_data_location}/{line_data["path"]}')
line = text[line_data["line_num"] - 1]
line_data["line"] = line
if not line_data["value"]:
print(f"WARNING: empty value\n{line_data}")
continue
@@ -184,11 +187,9 @@ def join_label(detected_data: Dict[identifier, Dict], meta_data: Dict[identifier
f"\nvariable:'{line_data['variable']}' value:'{line_data['value']}'"
f"\nsub_line:'{get_colored_line(line_data)}'")
continue
line = line_data["line"]
# the line in detected data must be striped
assert line == line.strip(), line_data
# check the value in detected data
assert line[line_data["value_start"]:line_data["value_end"]] == line_data["value"]
assert line[line_data["value_start"]:line_data["value_end"]] == line_data["value"], (
line_data, line[line_data["value_start"]:line_data["value_end"]], line_data["value"])
# todo: variable input has to be markup in meta too, or/and new feature "VariableExists" created ???
line_data["GroundTruth"] = label
line_data["ext"] = Util.get_extension(line_data["path"])
@@ -206,18 +207,17 @@ def join_label(detected_data: Dict[identifier, Dict], meta_data: Dict[identifier
print(','.join(markup.keys()))
all_meta_found = False
print(','.join(str(x) for x in markup.values()))
-text = Util.read_file(f'{cred_data_location}/{markup["FilePath"]}')
-line = text[markup["LineStart"] - 1].strip()
+text = read_text(f'{cred_data_location}/{markup["FilePath"]}')
+line = text[markup["LineStart"] - 1]
if 0 <= markup["ValueStart"] and 0 <= markup["ValueEnd"]:
line = line[:markup["ValueStart"]] \
+ Fore.LIGHTGREEN_EX \
+ line[markup["ValueStart"]:markup["ValueEnd"]] \
+ Style.RESET_ALL \
+ line[markup["ValueEnd"]:]
print(line)
-# print(Util.subtext(line, markup['ValueStart'], ML_HUNK))
break

read_text.cache_clear()
df = pd.DataFrame(values)
return df

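The functools.cache decorator memoizes read_text() per path, so a file referenced by many markup rows is read once, and the explicit read_text.cache_clear() releases the cached contents once the join is finished. A simplified standalone version of the pattern:

    from functools import cache
    from typing import List

    @cache
    def read_text(path: str) -> List[str]:
        with open(path, "r", encoding="utf8") as f:
            return f.read().replace("\r\n", '\n').replace('\r', '\n').split('\n')

    def count_lines(paths: List[str]) -> int:
        total = sum(len(read_text(p)) for p in paths)  # repeated paths hit the cache
        read_text.cache_clear()  # free the cached file contents afterwards
        return total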
The diffs for the remaining changed files are not rendered.
