diff --git a/credsweeper/file_handler/data_content_provider.py b/credsweeper/file_handler/data_content_provider.py
index 581352d47..d7279b566 100644
--- a/credsweeper/file_handler/data_content_provider.py
+++ b/credsweeper/file_handler/data_content_provider.py
@@ -183,8 +183,10 @@ def _check_multiline_cell(self, cell: Tag) -> Optional[Tuple[int, str]]:
def _simple_html_representation(self, html: BeautifulSoup):
# simple parse as it is displayed to user
# dbg = html.find_all(text=True)
- for p in html.find_all("p"):
+ for p in html.find_all(["p", "br", "tr", "li", "ol", "h1", "h2", "h3", "h4", "h5", "h6", "blockquote", "pre"]):
p.append('\n')
+ for p in html.find_all(["th", "td"]):
+ p.append('\t')
lines = html.get_text().splitlines()
for line_number, doc_line in enumerate(lines):
line = doc_line.strip()
@@ -219,8 +221,10 @@ def _table_representation(
logger.warning("Recursive depth limit was reached during HTML table combinations")
return
table_header: Optional[List[Optional[str]]] = None
- for tr in table.find_all('tr'):
+ rowspan_columns = []
+ for tr in table.find_all("tr"):
if recursive_limit_size < self.__html_lines_size:
+ # unusual tables may lead to excessive memory usage
break
record_numbers = []
record_lines = []
@@ -228,16 +232,21 @@ def _table_representation(
if table_header is None:
table_header = []
# first row in table may be a header with <th> and a style, but search <td> too
- for cell in tr.find_all(['th', 'td']):
+ for cell in tr.find_all(["th", "td"]):
if recursive_limit_size < self.__html_lines_size:
+ # keep the duplicates for early breaks!
break
+ colspan_header = int(cell.get("colspan", 1))
if td_numbered_line := self._check_multiline_cell(cell):
td_text = td_numbered_line[1]
td_text_has_keywords = keywords_required_substrings_check(td_text.lower())
- if td_text_has_keywords:
- table_header.append(td_text)
- else:
- table_header.append(None)
+ for _ in range(colspan_header):
+ rowspan_header = int(cell.get("rowspan", 1))
+ rowspan_columns.append(rowspan_header)
+ if td_text_has_keywords:
+ table_header.append(td_text)
+ else:
+ table_header.append(None)
if record_leading is None:
if td_text_has_keywords:
record_leading = td_text
@@ -245,40 +254,53 @@ def _table_representation(
record_leading = ""
else:
record_numbers.append(td_numbered_line[0])
- record_lines.append(f"{record_leading} = {td_text}")
+ record_lines.append(f"{record_leading} : {td_text}")
# add single text to lines for analysis
self.line_numbers.append(td_numbered_line[0])
self.lines.append(td_text)
self.__html_lines_size += len(td_text)
else:
# empty cell or multiline cell
- table_header.append(None)
- continue
+ for _ in range(colspan_header):
+ # number of columns is defined with header only
+ rowspan_header = int(cell.get("rowspan", 1))
+ rowspan_columns.append(rowspan_header)
+ table_header.append(None)
else:
+ header_pos = 0
# not a first line in table - may be combined with a header
- for header_pos, cell in enumerate(tr.find_all('td')):
+ for cell in tr.find_all("td"):
if recursive_limit_size < self.__html_lines_size:
+ # keep the duplicates for early breaks!
break
+ while header_pos < len(rowspan_columns) and 1 < rowspan_columns[header_pos]:
+ rowspan_columns[header_pos] -= 1
+ header_pos += 1
+ colspan_cell = int(cell.get("colspan", 1))
+ rowspan_cell = int(cell.get("rowspan", 1))
+ for i in range(header_pos, header_pos + colspan_cell):
+ if i < len(rowspan_columns):
+ rowspan_columns[i] += rowspan_cell - 1
if td_numbered_line := self._check_multiline_cell(cell):
td_text = td_numbered_line[1]
- td_text_has_keywords = keywords_required_substrings_check(td_text.lower())
if record_leading is None:
+ td_text_has_keywords = keywords_required_substrings_check(td_text.lower())
if td_text_has_keywords:
record_leading = td_text
else:
record_leading = ""
elif record_leading:
record_numbers.append(td_numbered_line[0])
- record_lines.append(f"{record_leading} = {td_text}")
+ record_lines.append(f"{record_leading} : {td_text}")
if header_pos < len(table_header):
if header_text := table_header[header_pos]:
self.line_numbers.append(td_numbered_line[0])
- self.lines.append(f"{header_text} = {td_text}")
+ self.lines.append(f"{header_text} : {td_text}")
self.__html_lines_size += len(td_text)
else:
# empty cell or multiline cell
table_header.append(None)
- continue
+ header_pos += colspan_cell
if record_lines:
# add combinations with left column
self.line_numbers.extend(record_numbers)
@@ -295,7 +317,7 @@ def _html_tables_representation(
depth -= 1
if 0 > depth:
return
- for table in html.find_all('table'):
+ for table in html.find_all("table"):
if recursive_limit_size < self.__html_lines_size:
logger.warning("Recursive size limit was reached during HTML table combinations")
break
diff --git a/tests/__init__.py b/tests/__init__.py
index c752fc4a6..88a906408 100644
--- a/tests/__init__.py
+++ b/tests/__init__.py
@@ -1,7 +1,7 @@
from pathlib import Path
# total number of files in test samples
-SAMPLES_FILES_COUNT: int = 133
+SAMPLES_FILES_COUNT: int = 134
# the lowest value of ML threshold is used to display possible lowest values
NEGLIGIBLE_ML_THRESHOLD = 0.0001
@@ -18,7 +18,7 @@
# archived credentials that are not found without --depth
SAMPLES_IN_DEEP_1 = SAMPLES_POST_CRED_COUNT + 23
-SAMPLES_IN_DEEP_2 = SAMPLES_IN_DEEP_1 + 21
+SAMPLES_IN_DEEP_2 = SAMPLES_IN_DEEP_1 + 53
SAMPLES_IN_DEEP_3 = SAMPLES_IN_DEEP_2 + 1
# well known string with all latin letters
diff --git a/tests/data/depth_3.json b/tests/data/depth_3.json
index 1c8d6e667..528e5a31b 100644
--- a/tests/data/depth_3.json
+++ b/tests/data/depth_3.json
@@ -11233,6 +11233,870 @@
}
]
},
+ {
+ "api_validation": "NOT_AVAILABLE",
+ "ml_validation": "VALIDATED_KEY",
+ "ml_probability": 0.999,
+ "rule": "Token",
+ "severity": "medium",
+ "confidence": "moderate",
+ "line_data_list": [
+ {
+ "line": "token : bace4d19-fa7e-beer-care-9129474bcd81",
+ "line_num": 19,
+ "path": "./tests/samples/table.html",
+ "info": "./tests/samples/table.html|HTML",
+ "value": "bace4d19-fa7e-beer-care-9129474bcd81",
+ "value_start": 8,
+ "value_end": 44,
+ "variable": "token",
+ "variable_start": 0,
+ "variable_end": 5,
+ "entropy_validation": {
+ "iterator": "BASE36_CHARS",
+ "entropy": 3.313850959964899,
+ "valid": true
+ }
+ }
+ ]
+ },
+ {
+ "api_validation": "NOT_AVAILABLE",
+ "ml_validation": "VALIDATED_KEY",
+ "ml_probability": 0.792,
+ "rule": "Secret",
+ "severity": "medium",
+ "confidence": "moderate",
+ "line_data_list": [
+ {
+ "line": "secret : 5EcRe7_r0",
+ "line_num": 20,
+ "path": "./tests/samples/table.html",
+ "info": "./tests/samples/table.html|HTML",
+ "value": "5EcRe7_r0",
+ "value_start": 9,
+ "value_end": 18,
+ "variable": "secret",
+ "variable_start": 0,
+ "variable_end": 6,
+ "entropy_validation": {
+ "iterator": "BASE64_CHARS",
+ "entropy": 2.8177111123931664,
+ "valid": false
+ }
+ }
+ ]
+ },
+ {
+ "api_validation": "NOT_AVAILABLE",
+ "ml_validation": "VALIDATED_KEY",
+ "ml_probability": 0.969,
+ "rule": "Key",
+ "severity": "medium",
+ "confidence": "moderate",
+ "line_data_list": [
+ {
+ "line": "key : AK1AGIREOGIAWSKEY555",
+ "line_num": 21,
+ "path": "./tests/samples/table.html",
+ "info": "./tests/samples/table.html|HTML",
+ "value": "AK1AGIREOGIAWSKEY555",
+ "value_start": 6,
+ "value_end": 26,
+ "variable": "key",
+ "variable_start": 0,
+ "variable_end": 3,
+ "entropy_validation": {
+ "iterator": "BASE64_CHARS",
+ "entropy": 3.446439344671016,
+ "valid": false
+ }
+ }
+ ]
+ },
+ {
+ "api_validation": "NOT_AVAILABLE",
+ "ml_validation": "VALIDATED_KEY",
+ "ml_probability": 0.863,
+ "rule": "Password",
+ "severity": "medium",
+ "confidence": "moderate",
+ "line_data_list": [
+ {
+ "line": "password : Pas1wrD0",
+ "line_num": 22,
+ "path": "./tests/samples/table.html",
+ "info": "./tests/samples/table.html|HTML",
+ "value": "Pas1wrD0",
+ "value_start": 11,
+ "value_end": 19,
+ "variable": "password",
+ "variable_start": 0,
+ "variable_end": 8,
+ "entropy_validation": {
+ "iterator": "BASE64_CHARS",
+ "entropy": 3.0,
+ "valid": false
+ }
+ }
+ ]
+ },
+ {
+ "api_validation": "NOT_AVAILABLE",
+ "ml_validation": "VALIDATED_KEY",
+ "ml_probability": 1.0,
+ "rule": "Token",
+ "severity": "medium",
+ "confidence": "moderate",
+ "line_data_list": [
+ {
+ "line": "token : bace4d19-fa7e-beer-care-9129474bcd82",
+ "line_num": 25,
+ "path": "./tests/samples/table.html",
+ "info": "./tests/samples/table.html|HTML",
+ "value": "bace4d19-fa7e-beer-care-9129474bcd82",
+ "value_start": 8,
+ "value_end": 44,
+ "variable": "token",
+ "variable_start": 0,
+ "variable_end": 5,
+ "entropy_validation": {
+ "iterator": "BASE36_CHARS",
+ "entropy": 3.3348200572472178,
+ "valid": true
+ }
+ }
+ ]
+ },
+ {
+ "api_validation": "NOT_AVAILABLE",
+ "ml_validation": "VALIDATED_KEY",
+ "ml_probability": 0.713,
+ "rule": "Token",
+ "severity": "medium",
+ "confidence": "moderate",
+ "line_data_list": [
+ {
+ "line": "token : AKDR_C1r17-K3Y0-SeCrt-2",
+ "line_num": 26,
+ "path": "./tests/samples/table.html",
+ "info": "./tests/samples/table.html|HTML",
+ "value": "AKDR_C1r17-K3Y0-SeCrt-2",
+ "value_start": 8,
+ "value_end": 31,
+ "variable": "token",
+ "variable_start": 0,
+ "variable_end": 5,
+ "entropy_validation": {
+ "iterator": "BASE64_CHARS",
+ "entropy": 3.389029441960142,
+ "valid": false
+ }
+ }
+ ]
+ },
+ {
+ "api_validation": "NOT_AVAILABLE",
+ "ml_validation": "VALIDATED_KEY",
+ "ml_probability": 0.975,
+ "rule": "Key",
+ "severity": "medium",
+ "confidence": "moderate",
+ "line_data_list": [
+ {
+ "line": "key : SDFHBH2398SG5VF62VZVQFG2TYGVF9WYSGR",
+ "line_num": 27,
+ "path": "./tests/samples/table.html",
+ "info": "./tests/samples/table.html|HTML",
+ "value": "SDFHBH2398SG5VF62VZVQFG2TYGVF9WYSGR",
+ "value_start": 6,
+ "value_end": 41,
+ "variable": "key",
+ "variable_start": 0,
+ "variable_end": 3,
+ "entropy_validation": {
+ "iterator": "BASE64_CHARS",
+ "entropy": 4.000432302535625,
+ "valid": false
+ }
+ }
+ ]
+ },
+ {
+ "api_validation": "NOT_AVAILABLE",
+ "ml_validation": "VALIDATED_KEY",
+ "ml_probability": 0.995,
+ "rule": "Key",
+ "severity": "medium",
+ "confidence": "moderate",
+ "line_data_list": [
+ {
+ "line": "key : 3TJDSLKGDFG4MTB34UHWYYSDFHKSDKJH34S",
+ "line_num": 29,
+ "path": "./tests/samples/table.html",
+ "info": "./tests/samples/table.html|HTML",
+ "value": "3TJDSLKGDFG4MTB34UHWYYSDFHKSDKJH34S",
+ "value_start": 6,
+ "value_end": 41,
+ "variable": "key",
+ "variable_start": 0,
+ "variable_end": 3,
+ "entropy_validation": {
+ "iterator": "BASE64_CHARS",
+ "entropy": 3.843010159554856,
+ "valid": false
+ }
+ }
+ ]
+ },
+ {
+ "api_validation": "NOT_AVAILABLE",
+ "ml_validation": "VALIDATED_KEY",
+ "ml_probability": 0.66,
+ "rule": "Password",
+ "severity": "medium",
+ "confidence": "moderate",
+ "line_data_list": [
+ {
+ "line": "password : Pas1wrD2",
+ "line_num": 30,
+ "path": "./tests/samples/table.html",
+ "info": "./tests/samples/table.html|HTML",
+ "value": "Pas1wrD2",
+ "value_start": 11,
+ "value_end": 19,
+ "variable": "password",
+ "variable_start": 0,
+ "variable_end": 8,
+ "entropy_validation": {
+ "iterator": "BASE64_CHARS",
+ "entropy": 3.0,
+ "valid": false
+ }
+ }
+ ]
+ },
+ {
+ "api_validation": "NOT_AVAILABLE",
+ "ml_validation": "VALIDATED_KEY",
+ "ml_probability": 1.0,
+ "rule": "Token",
+ "severity": "medium",
+ "confidence": "moderate",
+ "line_data_list": [
+ {
+ "line": "token : bace4d19-fa7e-beer-care-9129474bcd83",
+ "line_num": 34,
+ "path": "./tests/samples/table.html",
+ "info": "./tests/samples/table.html|HTML",
+ "value": "bace4d19-fa7e-beer-care-9129474bcd83",
+ "value_start": 8,
+ "value_end": 44,
+ "variable": "token",
+ "variable_start": 0,
+ "variable_end": 5,
+ "entropy_validation": {
+ "iterator": "BASE36_CHARS",
+ "entropy": 3.390375612802773,
+ "valid": true
+ }
+ }
+ ]
+ },
+ {
+ "api_validation": "NOT_AVAILABLE",
+ "ml_validation": "VALIDATED_KEY",
+ "ml_probability": 0.913,
+ "rule": "Password",
+ "severity": "medium",
+ "confidence": "moderate",
+ "line_data_list": [
+ {
+ "line": "password : Pas1wrD3",
+ "line_num": 38,
+ "path": "./tests/samples/table.html",
+ "info": "./tests/samples/table.html|HTML",
+ "value": "Pas1wrD3",
+ "value_start": 11,
+ "value_end": 19,
+ "variable": "password",
+ "variable_start": 0,
+ "variable_end": 8,
+ "entropy_validation": {
+ "iterator": "BASE64_CHARS",
+ "entropy": 3.0,
+ "valid": false
+ }
+ }
+ ]
+ },
+ {
+ "api_validation": "NOT_AVAILABLE",
+ "ml_validation": "VALIDATED_KEY",
+ "ml_probability": 1.0,
+ "rule": "Token",
+ "severity": "medium",
+ "confidence": "moderate",
+ "line_data_list": [
+ {
+ "line": "token : bace4d19-fa7e-beer-care-9129474bcd85",
+ "line_num": 42,
+ "path": "./tests/samples/table.html",
+ "info": "./tests/samples/table.html|HTML",
+ "value": "bace4d19-fa7e-beer-care-9129474bcd85",
+ "value_start": 8,
+ "value_end": 44,
+ "variable": "token",
+ "variable_start": 0,
+ "variable_end": 5,
+ "entropy_validation": {
+ "iterator": "BASE36_CHARS",
+ "entropy": 3.390375612802773,
+ "valid": true
+ }
+ }
+ ]
+ },
+ {
+ "api_validation": "NOT_AVAILABLE",
+ "ml_validation": "VALIDATED_KEY",
+ "ml_probability": 1.0,
+ "rule": "Token",
+ "severity": "medium",
+ "confidence": "moderate",
+ "line_data_list": [
+ {
+ "line": "token : bace4d19-fa7e-beer-care-9129474bcd86",
+ "line_num": 43,
+ "path": "./tests/samples/table.html",
+ "info": "./tests/samples/table.html|HTML",
+ "value": "bace4d19-fa7e-beer-care-9129474bcd86",
+ "value_start": 8,
+ "value_end": 44,
+ "variable": "token",
+ "variable_start": 0,
+ "variable_end": 5,
+ "entropy_validation": {
+ "iterator": "BASE36_CHARS",
+ "entropy": 3.390375612802773,
+ "valid": true
+ }
+ }
+ ]
+ },
+ {
+ "api_validation": "NOT_AVAILABLE",
+ "ml_validation": "VALIDATED_KEY",
+ "ml_probability": 0.881,
+ "rule": "Secret",
+ "severity": "medium",
+ "confidence": "moderate",
+ "line_data_list": [
+ {
+ "line": "secret : 5EcRe7_r4",
+ "line_num": 44,
+ "path": "./tests/samples/table.html",
+ "info": "./tests/samples/table.html|HTML",
+ "value": "5EcRe7_r4",
+ "value_start": 9,
+ "value_end": 18,
+ "variable": "secret",
+ "variable_start": 0,
+ "variable_end": 6,
+ "entropy_validation": {
+ "iterator": "BASE64_CHARS",
+ "entropy": 2.8177111123931664,
+ "valid": false
+ }
+ }
+ ]
+ },
+ {
+ "api_validation": "NOT_AVAILABLE",
+ "ml_validation": "VALIDATED_KEY",
+ "ml_probability": 0.95,
+ "rule": "Key",
+ "severity": "medium",
+ "confidence": "moderate",
+ "line_data_list": [
+ {
+ "line": "key : 741852-321654-963852-654913",
+ "line_num": 45,
+ "path": "./tests/samples/table.html",
+ "info": "./tests/samples/table.html|HTML",
+ "value": "741852-321654-963852-654913",
+ "value_start": 6,
+ "value_end": 33,
+ "variable": "key",
+ "variable_start": 0,
+ "variable_end": 3,
+ "entropy_validation": {
+ "iterator": "BASE64_CHARS",
+ "entropy": 2.901587501522441,
+ "valid": false
+ }
+ }
+ ]
+ },
+ {
+ "api_validation": "NOT_AVAILABLE",
+ "ml_validation": "VALIDATED_KEY",
+ "ml_probability": 0.944,
+ "rule": "Password",
+ "severity": "medium",
+ "confidence": "moderate",
+ "line_data_list": [
+ {
+ "line": "password : Pas1wrD4",
+ "line_num": 47,
+ "path": "./tests/samples/table.html",
+ "info": "./tests/samples/table.html|HTML",
+ "value": "Pas1wrD4",
+ "value_start": 11,
+ "value_end": 19,
+ "variable": "password",
+ "variable_start": 0,
+ "variable_end": 8,
+ "entropy_validation": {
+ "iterator": "BASE64_CHARS",
+ "entropy": 3.0,
+ "valid": false
+ }
+ }
+ ]
+ },
+ {
+ "api_validation": "NOT_AVAILABLE",
+ "ml_validation": "VALIDATED_KEY",
+ "ml_probability": 0.999,
+ "rule": "Token",
+ "severity": "medium",
+ "confidence": "moderate",
+ "line_data_list": [
+ {
+ "line": "token : bace4d19-fa7e-beer-care-9129474bcd87",
+ "line_num": 51,
+ "path": "./tests/samples/table.html",
+ "info": "./tests/samples/table.html|HTML",
+ "value": "bace4d19-fa7e-beer-care-9129474bcd87",
+ "value_start": 8,
+ "value_end": 44,
+ "variable": "token",
+ "variable_start": 0,
+ "variable_end": 5,
+ "entropy_validation": {
+ "iterator": "BASE36_CHARS",
+ "entropy": 3.313850959964899,
+ "valid": true
+ }
+ }
+ ]
+ },
+ {
+ "api_validation": "NOT_AVAILABLE",
+ "ml_validation": "VALIDATED_KEY",
+ "ml_probability": 0.999,
+ "rule": "Token",
+ "severity": "medium",
+ "confidence": "moderate",
+ "line_data_list": [
+ {
+ "line": "token : bace4d19-fa7e-beer-care-9129474bcd88",
+ "line_num": 52,
+ "path": "./tests/samples/table.html",
+ "info": "./tests/samples/table.html|HTML",
+ "value": "bace4d19-fa7e-beer-care-9129474bcd88",
+ "value_start": 8,
+ "value_end": 44,
+ "variable": "token",
+ "variable_start": 0,
+ "variable_end": 5,
+ "entropy_validation": {
+ "iterator": "BASE36_CHARS",
+ "entropy": 3.3348200572472178,
+ "valid": true
+ }
+ }
+ ]
+ },
+ {
+ "api_validation": "NOT_AVAILABLE",
+ "ml_validation": "VALIDATED_KEY",
+ "ml_probability": 0.858,
+ "rule": "Secret",
+ "severity": "medium",
+ "confidence": "moderate",
+ "line_data_list": [
+ {
+ "line": "secret : 5EcRe7_r5",
+ "line_num": 53,
+ "path": "./tests/samples/table.html",
+ "info": "./tests/samples/table.html|HTML",
+ "value": "5EcRe7_r5",
+ "value_start": 9,
+ "value_end": 18,
+ "variable": "secret",
+ "variable_start": 0,
+ "variable_end": 6,
+ "entropy_validation": {
+ "iterator": "BASE64_CHARS",
+ "entropy": 2.595488890170944,
+ "valid": false
+ }
+ }
+ ]
+ },
+ {
+ "api_validation": "NOT_AVAILABLE",
+ "ml_validation": "VALIDATED_KEY",
+ "ml_probability": 0.934,
+ "rule": "Key",
+ "severity": "medium",
+ "confidence": "moderate",
+ "line_data_list": [
+ {
+ "line": "key : 321654-741852-963852-654980",
+ "line_num": 54,
+ "path": "./tests/samples/table.html",
+ "info": "./tests/samples/table.html|HTML",
+ "value": "321654-741852-963852-654980",
+ "value_start": 6,
+ "value_end": 33,
+ "variable": "key",
+ "variable_start": 0,
+ "variable_end": 3,
+ "entropy_validation": {
+ "iterator": "BASE36_CHARS",
+ "entropy": 3.0036203719729397,
+ "valid": true
+ }
+ }
+ ]
+ },
+ {
+ "api_validation": "NOT_AVAILABLE",
+ "ml_validation": "VALIDATED_KEY",
+ "ml_probability": 0.955,
+ "rule": "Key",
+ "severity": "medium",
+ "confidence": "moderate",
+ "line_data_list": [
+ {
+ "line": "key : 321654-741852-963852-654981",
+ "line_num": 55,
+ "path": "./tests/samples/table.html",
+ "info": "./tests/samples/table.html|HTML",
+ "value": "321654-741852-963852-654981",
+ "value_start": 6,
+ "value_end": 33,
+ "variable": "key",
+ "variable_start": 0,
+ "variable_end": 3,
+ "entropy_validation": {
+ "iterator": "BASE64_CHARS",
+ "entropy": 2.9015875015224406,
+ "valid": false
+ }
+ }
+ ]
+ },
+ {
+ "api_validation": "NOT_AVAILABLE",
+ "ml_validation": "VALIDATED_KEY",
+ "ml_probability": 0.964,
+ "rule": "Password",
+ "severity": "medium",
+ "confidence": "moderate",
+ "line_data_list": [
+ {
+ "line": "password : Pas1wrD5",
+ "line_num": 56,
+ "path": "./tests/samples/table.html",
+ "info": "./tests/samples/table.html|HTML",
+ "value": "Pas1wrD5",
+ "value_start": 11,
+ "value_end": 19,
+ "variable": "password",
+ "variable_start": 0,
+ "variable_end": 8,
+ "entropy_validation": {
+ "iterator": "BASE64_CHARS",
+ "entropy": 3.0,
+ "valid": false
+ }
+ }
+ ]
+ },
+ {
+ "api_validation": "NOT_AVAILABLE",
+ "ml_validation": "VALIDATED_KEY",
+ "ml_probability": 0.999,
+ "rule": "Token",
+ "severity": "medium",
+ "confidence": "moderate",
+ "line_data_list": [
+ {
+ "line": "token : bace4d19-fa7e-beer-care-9129474bcd89",
+ "line_num": 60,
+ "path": "./tests/samples/table.html",
+ "info": "./tests/samples/table.html|HTML",
+ "value": "bace4d19-fa7e-beer-care-9129474bcd89",
+ "value_start": 8,
+ "value_end": 44,
+ "variable": "token",
+ "variable_start": 0,
+ "variable_end": 5,
+ "entropy_validation": {
+ "iterator": "BASE36_CHARS",
+ "entropy": 3.3002335989739806,
+ "valid": true
+ }
+ }
+ ]
+ },
+ {
+ "api_validation": "NOT_AVAILABLE",
+ "ml_validation": "VALIDATED_KEY",
+ "ml_probability": 0.999,
+ "rule": "Token",
+ "severity": "medium",
+ "confidence": "moderate",
+ "line_data_list": [
+ {
+ "line": "token : bace4d19-fa7e-beer-care-9129474bcd80",
+ "line_num": 61,
+ "path": "./tests/samples/table.html",
+ "info": "./tests/samples/table.html|HTML",
+ "value": "bace4d19-fa7e-beer-care-9129474bcd80",
+ "value_start": 8,
+ "value_end": 44,
+ "variable": "token",
+ "variable_start": 0,
+ "variable_end": 5,
+ "entropy_validation": {
+ "iterator": "BASE36_CHARS",
+ "entropy": 3.390375612802773,
+ "valid": true
+ }
+ }
+ ]
+ },
+ {
+ "api_validation": "NOT_AVAILABLE",
+ "ml_validation": "VALIDATED_KEY",
+ "ml_probability": 0.955,
+ "rule": "Key",
+ "severity": "medium",
+ "confidence": "moderate",
+ "line_data_list": [
+ {
+ "line": "key : 321654-963852-654987-741851",
+ "line_num": 62,
+ "path": "./tests/samples/table.html",
+ "info": "./tests/samples/table.html|HTML",
+ "value": "321654-963852-654987-741851",
+ "value_start": 6,
+ "value_end": 33,
+ "variable": "key",
+ "variable_start": 0,
+ "variable_end": 3,
+ "entropy_validation": {
+ "iterator": "BASE64_CHARS",
+ "entropy": 2.9295462978988653,
+ "valid": false
+ }
+ }
+ ]
+ },
+ {
+ "api_validation": "NOT_AVAILABLE",
+ "ml_validation": "VALIDATED_KEY",
+ "ml_probability": 0.944,
+ "rule": "Key",
+ "severity": "medium",
+ "confidence": "moderate",
+ "line_data_list": [
+ {
+ "line": "key : 321654-963852-654987-741853",
+ "line_num": 63,
+ "path": "./tests/samples/table.html",
+ "info": "./tests/samples/table.html|HTML",
+ "value": "321654-963852-654987-741853",
+ "value_start": 6,
+ "value_end": 33,
+ "variable": "key",
+ "variable_start": 0,
+ "variable_end": 3,
+ "entropy_validation": {
+ "iterator": "BASE64_CHARS",
+ "entropy": 2.9295462978988653,
+ "valid": false
+ }
+ }
+ ]
+ },
+ {
+ "api_validation": "NOT_AVAILABLE",
+ "ml_validation": "VALIDATED_KEY",
+ "ml_probability": 0.996,
+ "rule": "Password",
+ "severity": "medium",
+ "confidence": "moderate",
+ "line_data_list": [
+ {
+ "line": "password : Pas1wrD67",
+ "line_num": 64,
+ "path": "./tests/samples/table.html",
+ "info": "./tests/samples/table.html|HTML",
+ "value": "Pas1wrD67",
+ "value_start": 11,
+ "value_end": 20,
+ "variable": "password",
+ "variable_start": 0,
+ "variable_end": 8,
+ "entropy_validation": {
+ "iterator": "BASE64_CHARS",
+ "entropy": 3.169925001442312,
+ "valid": false
+ }
+ }
+ ]
+ },
+ {
+ "api_validation": "NOT_AVAILABLE",
+ "ml_validation": "VALIDATED_KEY",
+ "ml_probability": 0.999,
+ "rule": "Token",
+ "severity": "medium",
+ "confidence": "moderate",
+ "line_data_list": [
+ {
+ "line": "token : bace4d19-fa7e-beer-care-9129474bcd11",
+ "line_num": 67,
+ "path": "./tests/samples/table.html",
+ "info": "./tests/samples/table.html|HTML",
+ "value": "bace4d19-fa7e-beer-care-9129474bcd11",
+ "value_start": 8,
+ "value_end": 44,
+ "variable": "token",
+ "variable_start": 0,
+ "variable_end": 5,
+ "entropy_validation": {
+ "iterator": "BASE36_CHARS",
+ "entropy": 3.2237089461361066,
+ "valid": true
+ }
+ }
+ ]
+ },
+ {
+ "api_validation": "NOT_AVAILABLE",
+ "ml_validation": "VALIDATED_KEY",
+ "ml_probability": 0.999,
+ "rule": "Token",
+ "severity": "medium",
+ "confidence": "moderate",
+ "line_data_list": [
+ {
+ "line": "token : bace4d19-fa7e-beer-care-9129474bcd22",
+ "line_num": 68,
+ "path": "./tests/samples/table.html",
+ "info": "./tests/samples/table.html|HTML",
+ "value": "bace4d19-fa7e-beer-care-9129474bcd22",
+ "value_start": 8,
+ "value_end": 44,
+ "variable": "token",
+ "variable_start": 0,
+ "variable_end": 5,
+ "entropy_validation": {
+ "iterator": "BASE36_CHARS",
+ "entropy": 3.2582954044093437,
+ "valid": true
+ }
+ }
+ ]
+ },
+ {
+ "api_validation": "NOT_AVAILABLE",
+ "ml_validation": "VALIDATED_KEY",
+ "ml_probability": 0.992,
+ "rule": "Key",
+ "severity": "medium",
+ "confidence": "moderate",
+ "line_data_list": [
+ {
+ "line": "key : 654987-321654-963852-741851",
+ "line_num": 69,
+ "path": "./tests/samples/table.html",
+ "info": "./tests/samples/table.html|HTML",
+ "value": "654987-321654-963852-741851",
+ "value_start": 6,
+ "value_end": 33,
+ "variable": "key",
+ "variable_start": 0,
+ "variable_end": 3,
+ "entropy_validation": {
+ "iterator": "BASE64_CHARS",
+ "entropy": 2.9295462978988653,
+ "valid": false
+ }
+ }
+ ]
+ },
+ {
+ "api_validation": "NOT_AVAILABLE",
+ "ml_validation": "VALIDATED_KEY",
+ "ml_probability": 0.991,
+ "rule": "Key",
+ "severity": "medium",
+ "confidence": "moderate",
+ "line_data_list": [
+ {
+ "line": "key : 654987-321654-963852-741852",
+ "line_num": 70,
+ "path": "./tests/samples/table.html",
+ "info": "./tests/samples/table.html|HTML",
+ "value": "654987-321654-963852-741852",
+ "value_start": 6,
+ "value_end": 33,
+ "variable": "key",
+ "variable_start": 0,
+ "variable_end": 3,
+ "entropy_validation": {
+ "iterator": "BASE64_CHARS",
+ "entropy": 2.9295462978988653,
+ "valid": false
+ }
+ }
+ ]
+ },
+ {
+ "api_validation": "NOT_AVAILABLE",
+ "ml_validation": "VALIDATED_KEY",
+ "ml_probability": 0.99,
+ "rule": "Key",
+ "severity": "medium",
+ "confidence": "moderate",
+ "line_data_list": [
+ {
+ "line": "key : 654987-321654-963852-741853",
+ "line_num": 71,
+ "path": "./tests/samples/table.html",
+ "info": "./tests/samples/table.html|HTML",
+ "value": "654987-321654-963852-741853",
+ "value_start": 6,
+ "value_end": 33,
+ "variable": "key",
+ "variable_start": 0,
+ "variable_end": 3,
+ "entropy_validation": {
+ "iterator": "BASE64_CHARS",
+ "entropy": 2.9295462978988653,
+ "valid": false
+ }
+ }
+ ]
+ },
{
"api_validation": "NOT_AVAILABLE",
"ml_validation": "NOT_AVAILABLE",
@@ -11290,13 +12154,13 @@
{
"api_validation": "NOT_AVAILABLE",
"ml_validation": "VALIDATED_KEY",
- "ml_probability": 0.937,
+ "ml_probability": 0.903,
"rule": "Password",
"severity": "medium",
"confidence": "moderate",
"line_data_list": [
{
- "line": "password = 0dm1nk0",
+ "line": "password : 0dm1nk0",
"line_num": 29,
"path": "./tests/samples/test.html",
"info": "./tests/samples/test.html|HTML",
@@ -11323,7 +12187,7 @@
"confidence": "moderate",
"line_data_list": [
{
- "line": "secret = BNbNbws73bdhss329ssakKhds120384",
+ "line": "secret : BNbNbws73bdhss329ssakKhds120384",
"line_num": 32,
"path": "./tests/samples/test.html",
"info": "./tests/samples/test.html|HTML",
@@ -11344,13 +12208,13 @@
{
"api_validation": "NOT_AVAILABLE",
"ml_validation": "VALIDATED_KEY",
- "ml_probability": 0.993,
+ "ml_probability": 0.988,
"rule": "Password",
"severity": "medium",
"confidence": "moderate",
"line_data_list": [
{
- "line": "password = Cr3DeHTbIal",
+ "line": "password : Cr3DeHTbIal",
"line_num": 42,
"path": "./tests/samples/test.html",
"info": "./tests/samples/test.html|HTML",
@@ -11377,7 +12241,7 @@
"confidence": "moderate",
"line_data_list": [
{
- "line": "secret = Ndjbwu88s22ygavsdhgt5454v3h1x",
+ "line": "secret : Ndjbwu88s22ygavsdhgt5454v3h1x",
"line_num": 45,
"path": "./tests/samples/test.html",
"info": "./tests/samples/test.html|HTML",
@@ -11398,13 +12262,13 @@
{
"api_validation": "NOT_AVAILABLE",
"ml_validation": "VALIDATED_KEY",
- "ml_probability": 0.997,
+ "ml_probability": 0.996,
"rule": "Token",
"severity": "medium",
"confidence": "moderate",
"line_data_list": [
{
- "line": "token = H72gsdv2dswPneHduwhfd",
+ "line": "token : H72gsdv2dswPneHduwhfd",
"line_num": 65,
"path": "./tests/samples/test.html",
"info": "./tests/samples/test.html|HTML",
@@ -11431,7 +12295,7 @@
"confidence": "moderate",
"line_data_list": [
{
- "line": "password = p@$$w0Rd42",
+ "line": "password : p@$$w0Rd42",
"line_num": 71,
"path": "./tests/samples/test.html",
"info": "./tests/samples/test.html|HTML",
@@ -11459,7 +12323,7 @@
"line_data_list": [
{
"line": "508627689:AAEuLPKs-EhrjrYGnz60bnYNZqakf6HJxc0",
- "line_num": 76,
+ "line_num": 83,
"path": "./tests/samples/test.html",
"info": "./tests/samples/test.html|HTML",
"value": "508627689:AAEuLPKs-EhrjrYGnz60bnYNZqakf6HJxc0",
@@ -11486,7 +12350,7 @@
"line_data_list": [
{
"line": "# 94 ya29.dshMb48ehfXwydAj34D32J",
- "line_num": 89,
+ "line_num": 92,
"path": "./tests/samples/test.html",
"info": "./tests/samples/test.html|HTML",
"value": "ya29.dshMb48ehfXwydAj34D32J",
@@ -11513,7 +12377,7 @@
"line_data_list": [
{
"line": "# 95 dop_v1_425522a565f532bc6532d453422e50334a42f5242a3090fbe553b543b124259b",
- "line_num": 91,
+ "line_num": 94,
"path": "./tests/samples/test.html",
"info": "./tests/samples/test.html|HTML",
"value": "dop_v1_425522a565f532bc6532d453422e50334a42f5242a3090fbe553b543b124259b",
@@ -11540,7 +12404,7 @@
"line_data_list": [
{
"line": "# 94 ya29.dshMb48ehfXwydAj34D32J",
- "line_num": 92,
+ "line_num": 96,
"path": "./tests/samples/test.html",
"info": "./tests/samples/test.html|HTML",
"value": "ya29.dshMb48ehfXwydAj34D32J",
@@ -11567,7 +12431,7 @@
"line_data_list": [
{
"line": "# 95 dop_v1_425522a565f532bc6532d453422e50334a42f5242a3090fbe553b543b124259b",
- "line_num": 94,
+ "line_num": 98,
"path": "./tests/samples/test.html",
"info": "./tests/samples/test.html|HTML",
"value": "dop_v1_425522a565f532bc6532d453422e50334a42f5242a3090fbe553b543b124259b",
@@ -11594,7 +12458,7 @@
"line_data_list": [
{
"line": "the line will be found twice # 100 EAACEb00Kse0BAlGy7KeQ5YnaCEd09Eose0cBAlGy7KeQ5Yna9CoDsup39tiYdoQ4jH9Coup39tiYdWoQ4jHFZD",
- "line_num": 97,
+ "line_num": 100,
"path": "./tests/samples/test.html",
"info": "./tests/samples/test.html|HTML",
"value": "EAACEb00Kse0BAlGy7KeQ5YnaCEd09Eose0cBAlGy7KeQ5Yna9CoDsup39tiYdoQ4jH9Coup39tiYdWoQ4jHFZD",
@@ -11621,7 +12485,7 @@
"line_data_list": [
{
"line": "the line will be found twice # 100 EAACEb00Kse0BAlGy7KeQ5YnaCEd09Eose0cBAlGy7KeQ5Yna9CoDsup39tiYdoQ4jH9Coup39tiYdWoQ4jHFZD",
- "line_num": 100,
+ "line_num": 104,
"path": "./tests/samples/test.html",
"info": "./tests/samples/test.html|HTML",
"value": "EAACEb00Kse0BAlGy7KeQ5YnaCEd09Eose0cBAlGy7KeQ5Yna9CoDsup39tiYdoQ4jH9Coup39tiYdWoQ4jHFZD",
diff --git a/tests/data/doc.json b/tests/data/doc.json
index 1a3fce99e..d5da09f84 100644
--- a/tests/data/doc.json
+++ b/tests/data/doc.json
@@ -13530,7 +13530,7 @@
"line_data_list": [
{
"line": "508627689:AAEuLPKs-EhrjrYGnz60bnYNZqakf6HJxc0",
- "line_num": 76,
+ "line_num": 83,
"path": "./tests/samples/test.html",
"info": "./tests/samples/test.html|HTML",
"value": "508627689:AAEuLPKs-EhrjrYGnz60bnYNZqakf6HJxc0",
@@ -13557,7 +13557,7 @@
"line_data_list": [
{
"line": "# 94 ya29.dshMb48ehfXwydAj34D32J",
- "line_num": 89,
+ "line_num": 96,
"path": "./tests/samples/test.html",
"info": "./tests/samples/test.html|HTML",
"value": "ya29.dshMb48ehfXwydAj34D32J",
@@ -13584,7 +13584,7 @@
"line_data_list": [
{
"line": "# 95 dop_v1_425522a565f532bc6532d453422e50334a42f5242a3090fbe553b543b124259b",
- "line_num": 91,
+ "line_num": 98,
"path": "./tests/samples/test.html",
"info": "./tests/samples/test.html|HTML",
"value": "dop_v1_425522a565f532bc6532d453422e50334a42f5242a3090fbe553b543b124259b",
@@ -13611,7 +13611,7 @@
"line_data_list": [
{
"line": "the line will be found twice # 100 EAACEb00Kse0BAlGy7KeQ5YnaCEd09Eose0cBAlGy7KeQ5Yna9CoDsup39tiYdoQ4jH9Coup39tiYdWoQ4jHFZD",
- "line_num": 97,
+ "line_num": 104,
"path": "./tests/samples/test.html",
"info": "./tests/samples/test.html|HTML",
"value": "EAACEb00Kse0BAlGy7KeQ5YnaCEd09Eose0cBAlGy7KeQ5Yna9CoDsup39tiYdoQ4jH9Coup39tiYdWoQ4jHFZD",
diff --git a/tests/samples/table.html b/tests/samples/table.html
new file mode 100644
index 000000000..c30256549
--- /dev/null
+++ b/tests/samples/table.html
@@ -0,0 +1,77 @@
+
+
+
+
+ Title
+
+
+
+
+
+
+ 3 |
+ token |
+ secret |
+ key |
+ password |
+
+
+ bace4d19-fa7e-beer-care-9129474bcd81 |
+ 5EcRe7_r0 |
+ AK1AGIREOGIAWSKEY555 |
+ Pas1wrD0 |
+
+
+ bace4d19-fa7e-beer-care-9129474bcd82 |
+ AKDR_C1r17-K3Y0-SeCrt-2 |
+ SDFHBH2398SG5VF62VZVQFG2TYGVF9WYSGR |
+ DEAD-BEEF |
+ 3TJDSLKGDFG4MTB34UHWYYSDFHKSDKJH34S |
+ Pas1wrD2 |
+
+
+ 3 |
+ bace4d19-fa7e-beer-care-9129474bcd83 |
+ c1R345_4s12fey1 |
+ 0284-8946-3216-4010 |
+ 3216-4010-0284-8946 |
+ Pas1wrD3 |
+
+
+ 4 |
+ bace4d19-fa7e-beer-care-9129474bcd85 |
+ bace4d19-fa7e-beer-care-9129474bcd86 |
+ 5EcRe7_r4 |
+ 741852-321654-963852-654913 |
+ 184824-202847-623730-837462 |
+ Pas1wrD4 |
+
+
+ 5 |
+ bace4d19-fa7e-beer-care-9129474bcd87 |
+ bace4d19-fa7e-beer-care-9129474bcd88 |
+ 5EcRe7_r5 |
+ 321654-741852-963852-654980 |
+ 321654-741852-963852-654981 |
+ Pas1wrD5 |
+
+
+ 6 |
+ bace4d19-fa7e-beer-care-9129474bcd89 |
+ bace4d19-fa7e-beer-care-9129474bcd80 |
+ 321654-963852-654987-741851 |
+ 321654-963852-654987-741853 |
+ Pas1wrD67 |
+
+
+ bace4d19-fa7e-beer-care-9129474bcd11 |
+ bace4d19-fa7e-beer-care-9129474bcd22 |
+ 654987-321654-963852-741851 |
+ 654987-321654-963852-741852 |
+ 654987-321654-963852-741853 |
+
+
+
+
+
+
\ No newline at end of file
diff --git a/tests/test_main.py b/tests/test_main.py
index e38a662be..ead40c8f1 100644
--- a/tests/test_main.py
+++ b/tests/test_main.py
@@ -630,12 +630,12 @@ def test_html_p(self) -> None:
found_credentials = cred_sweeper.credential_manager.get_credentials()
expected_credential_lines = [
"508627689:AAEuLPKs-EhrjrYGnz60bnYNZqakf6HJxc0",
- "secret = Ndjbwu88s22ygavsdhgt5454v3h1x",
- "password = Cr3DeHTbIal",
- "password = 0dm1nk0",
- "password = p@$$w0Rd42",
- "secret = BNbNbws73bdhss329ssakKhds120384",
- "token = H72gsdv2dswPneHduwhfd",
+ "secret : Ndjbwu88s22ygavsdhgt5454v3h1x",
+ "password : Cr3DeHTbIal",
+ "password : 0dm1nk0",
+ "password : p@$$w0Rd42",
+ "secret : BNbNbws73bdhss329ssakKhds120384",
+ "token : H72gsdv2dswPneHduwhfd",
"td : Password: MU$T6Ef09#D!",
"# 94 ya29.dshMb48ehfXwydAj34D32J",
"# 95 dop_v1_425522a565f532bc6532d453422e50334a42f5242a3090fbe553b543b124259b",
|