diff --git a/credsweeper/file_handler/data_content_provider.py b/credsweeper/file_handler/data_content_provider.py index 581352d47..d7279b566 100644 --- a/credsweeper/file_handler/data_content_provider.py +++ b/credsweeper/file_handler/data_content_provider.py @@ -183,8 +183,10 @@ def _check_multiline_cell(self, cell: Tag) -> Optional[Tuple[int, str]]: def _simple_html_representation(self, html: BeautifulSoup): # simple parse as it is displayed to user # dbg = html.find_all(text=True) - for p in html.find_all("p"): + for p in html.find_all(["p", "br", "tr", "li", "ol", "h1", "h2", "h3", "h4", "h5", "h6", "blockquote", "pre"]): p.append('\n') + for p in html.find_all(["th", "td"]): + p.append('\t') lines = html.get_text().splitlines() for line_number, doc_line in enumerate(lines): line = doc_line.strip() @@ -219,8 +221,10 @@ def _table_representation( logger.warning("Recursive depth limit was reached during HTML table combinations") return table_header: Optional[List[Optional[str]]] = None - for tr in table.find_all('tr'): + rowspan_columns = [] + for tr in table.find_all("tr"): if recursive_limit_size < self.__html_lines_size: + # weird tables may lead to oversize memory break record_numbers = [] record_lines = [] @@ -228,16 +232,21 @@ def _table_representation( if table_header is None: table_header = [] # first row in table may be a header with <th> and a style, but search <td> too - for cell in tr.find_all(['th', 'td']): + for cell in tr.find_all(["th", "td"]): if recursive_limit_size < self.__html_lines_size: + # keep the duplicates for early breaks! 
break + colspan_header = int(cell.get("colspan", 1)) if td_numbered_line := self._check_multiline_cell(cell): td_text = td_numbered_line[1] td_text_has_keywords = keywords_required_substrings_check(td_text.lower()) - if td_text_has_keywords: - table_header.append(td_text) - else: - table_header.append(None) + for _ in range(colspan_header): + rowspan_header = int(cell.get("rowspan", 1)) + rowspan_columns.append(rowspan_header) + if td_text_has_keywords: + table_header.append(td_text) + else: + table_header.append(None) if record_leading is None: if td_text_has_keywords: record_leading = td_text @@ -245,40 +254,53 @@ def _table_representation( record_leading = "" else: record_numbers.append(td_numbered_line[0]) - record_lines.append(f"{record_leading} = {td_text}") + record_lines.append(f"{record_leading} : {td_text}") # add single text to lines for analysis self.line_numbers.append(td_numbered_line[0]) self.lines.append(td_text) self.__html_lines_size += len(td_text) else: # empty cell or multiline cell - table_header.append(None) - continue + for _ in range(colspan_header): + # number of columns is defined with header only + rowspan_header = int(cell.get("rowspan", 1)) + rowspan_columns.append(rowspan_header) + table_header.append(None) else: + header_pos = 0 # not a first line in table - may be combined with a header - for header_pos, cell in enumerate(tr.find_all('td')): + for cell in tr.find_all("td"): if recursive_limit_size < self.__html_lines_size: + # keep the duplicates for early breaks! 
break + while header_pos < len(rowspan_columns) and 1 < rowspan_columns[header_pos]: + rowspan_columns[header_pos] -= 1 + header_pos += 1 + colspan_cell = int(cell.get("colspan", 1)) + rowspan_cell = int(cell.get("rowspan", 1)) + for i in range(header_pos, header_pos + colspan_cell): + if i < len(rowspan_columns): + rowspan_columns[i] += rowspan_cell - 1 if td_numbered_line := self._check_multiline_cell(cell): td_text = td_numbered_line[1] - td_text_has_keywords = keywords_required_substrings_check(td_text.lower()) if record_leading is None: + td_text_has_keywords = keywords_required_substrings_check(td_text.lower()) if td_text_has_keywords: record_leading = td_text else: record_leading = "" elif record_leading: record_numbers.append(td_numbered_line[0]) - record_lines.append(f"{record_leading} = {td_text}") + record_lines.append(f"{record_leading} : {td_text}") if header_pos < len(table_header): if header_text := table_header[header_pos]: self.line_numbers.append(td_numbered_line[0]) - self.lines.append(f"{header_text} = {td_text}") + self.lines.append(f"{header_text} : {td_text}") self.__html_lines_size += len(td_text) else: # empty cell or multiline cell table_header.append(None) - continue + header_pos += colspan_cell if record_lines: # add combinations with left column self.line_numbers.extend(record_numbers) @@ -295,7 +317,7 @@ def _html_tables_representation( depth -= 1 if 0 > depth: return - for table in html.find_all('table'): + for table in html.find_all("table"): if recursive_limit_size < self.__html_lines_size: logger.warning("Recursive size limit was reached during HTML table combinations") break diff --git a/tests/__init__.py b/tests/__init__.py index c752fc4a6..88a906408 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -1,7 +1,7 @@ from pathlib import Path # total number of files in test samples -SAMPLES_FILES_COUNT: int = 133 +SAMPLES_FILES_COUNT: int = 134 # the lowest value of ML threshold is used to display possible lowest values 
NEGLIGIBLE_ML_THRESHOLD = 0.0001 @@ -18,7 +18,7 @@ # archived credentials that are not found without --depth SAMPLES_IN_DEEP_1 = SAMPLES_POST_CRED_COUNT + 23 -SAMPLES_IN_DEEP_2 = SAMPLES_IN_DEEP_1 + 21 +SAMPLES_IN_DEEP_2 = SAMPLES_IN_DEEP_1 + 53 SAMPLES_IN_DEEP_3 = SAMPLES_IN_DEEP_2 + 1 # well known string with all latin letters diff --git a/tests/data/depth_3.json b/tests/data/depth_3.json index 1c8d6e667..528e5a31b 100644 --- a/tests/data/depth_3.json +++ b/tests/data/depth_3.json @@ -11233,6 +11233,870 @@ } ] }, + { + "api_validation": "NOT_AVAILABLE", + "ml_validation": "VALIDATED_KEY", + "ml_probability": 0.999, + "rule": "Token", + "severity": "medium", + "confidence": "moderate", + "line_data_list": [ + { + "line": "token : bace4d19-fa7e-beer-care-9129474bcd81", + "line_num": 19, + "path": "./tests/samples/table.html", + "info": "./tests/samples/table.html|HTML", + "value": "bace4d19-fa7e-beer-care-9129474bcd81", + "value_start": 8, + "value_end": 44, + "variable": "token", + "variable_start": 0, + "variable_end": 5, + "entropy_validation": { + "iterator": "BASE36_CHARS", + "entropy": 3.313850959964899, + "valid": true + } + } + ] + }, + { + "api_validation": "NOT_AVAILABLE", + "ml_validation": "VALIDATED_KEY", + "ml_probability": 0.792, + "rule": "Secret", + "severity": "medium", + "confidence": "moderate", + "line_data_list": [ + { + "line": "secret : 5EcRe7_r0", + "line_num": 20, + "path": "./tests/samples/table.html", + "info": "./tests/samples/table.html|HTML", + "value": "5EcRe7_r0", + "value_start": 9, + "value_end": 18, + "variable": "secret", + "variable_start": 0, + "variable_end": 6, + "entropy_validation": { + "iterator": "BASE64_CHARS", + "entropy": 2.8177111123931664, + "valid": false + } + } + ] + }, + { + "api_validation": "NOT_AVAILABLE", + "ml_validation": "VALIDATED_KEY", + "ml_probability": 0.969, + "rule": "Key", + "severity": "medium", + "confidence": "moderate", + "line_data_list": [ + { + "line": "key : AK1AGIREOGIAWSKEY555", + 
"line_num": 21, + "path": "./tests/samples/table.html", + "info": "./tests/samples/table.html|HTML", + "value": "AK1AGIREOGIAWSKEY555", + "value_start": 6, + "value_end": 26, + "variable": "key", + "variable_start": 0, + "variable_end": 3, + "entropy_validation": { + "iterator": "BASE64_CHARS", + "entropy": 3.446439344671016, + "valid": false + } + } + ] + }, + { + "api_validation": "NOT_AVAILABLE", + "ml_validation": "VALIDATED_KEY", + "ml_probability": 0.863, + "rule": "Password", + "severity": "medium", + "confidence": "moderate", + "line_data_list": [ + { + "line": "password : Pas1wrD0", + "line_num": 22, + "path": "./tests/samples/table.html", + "info": "./tests/samples/table.html|HTML", + "value": "Pas1wrD0", + "value_start": 11, + "value_end": 19, + "variable": "password", + "variable_start": 0, + "variable_end": 8, + "entropy_validation": { + "iterator": "BASE64_CHARS", + "entropy": 3.0, + "valid": false + } + } + ] + }, + { + "api_validation": "NOT_AVAILABLE", + "ml_validation": "VALIDATED_KEY", + "ml_probability": 1.0, + "rule": "Token", + "severity": "medium", + "confidence": "moderate", + "line_data_list": [ + { + "line": "token : bace4d19-fa7e-beer-care-9129474bcd82", + "line_num": 25, + "path": "./tests/samples/table.html", + "info": "./tests/samples/table.html|HTML", + "value": "bace4d19-fa7e-beer-care-9129474bcd82", + "value_start": 8, + "value_end": 44, + "variable": "token", + "variable_start": 0, + "variable_end": 5, + "entropy_validation": { + "iterator": "BASE36_CHARS", + "entropy": 3.3348200572472178, + "valid": true + } + } + ] + }, + { + "api_validation": "NOT_AVAILABLE", + "ml_validation": "VALIDATED_KEY", + "ml_probability": 0.713, + "rule": "Token", + "severity": "medium", + "confidence": "moderate", + "line_data_list": [ + { + "line": "token : AKDR_C1r17-K3Y0-SeCrt-2", + "line_num": 26, + "path": "./tests/samples/table.html", + "info": "./tests/samples/table.html|HTML", + "value": "AKDR_C1r17-K3Y0-SeCrt-2", + "value_start": 8, + 
"value_end": 31, + "variable": "token", + "variable_start": 0, + "variable_end": 5, + "entropy_validation": { + "iterator": "BASE64_CHARS", + "entropy": 3.389029441960142, + "valid": false + } + } + ] + }, + { + "api_validation": "NOT_AVAILABLE", + "ml_validation": "VALIDATED_KEY", + "ml_probability": 0.975, + "rule": "Key", + "severity": "medium", + "confidence": "moderate", + "line_data_list": [ + { + "line": "key : SDFHBH2398SG5VF62VZVQFG2TYGVF9WYSGR", + "line_num": 27, + "path": "./tests/samples/table.html", + "info": "./tests/samples/table.html|HTML", + "value": "SDFHBH2398SG5VF62VZVQFG2TYGVF9WYSGR", + "value_start": 6, + "value_end": 41, + "variable": "key", + "variable_start": 0, + "variable_end": 3, + "entropy_validation": { + "iterator": "BASE64_CHARS", + "entropy": 4.000432302535625, + "valid": false + } + } + ] + }, + { + "api_validation": "NOT_AVAILABLE", + "ml_validation": "VALIDATED_KEY", + "ml_probability": 0.995, + "rule": "Key", + "severity": "medium", + "confidence": "moderate", + "line_data_list": [ + { + "line": "key : 3TJDSLKGDFG4MTB34UHWYYSDFHKSDKJH34S", + "line_num": 29, + "path": "./tests/samples/table.html", + "info": "./tests/samples/table.html|HTML", + "value": "3TJDSLKGDFG4MTB34UHWYYSDFHKSDKJH34S", + "value_start": 6, + "value_end": 41, + "variable": "key", + "variable_start": 0, + "variable_end": 3, + "entropy_validation": { + "iterator": "BASE64_CHARS", + "entropy": 3.843010159554856, + "valid": false + } + } + ] + }, + { + "api_validation": "NOT_AVAILABLE", + "ml_validation": "VALIDATED_KEY", + "ml_probability": 0.66, + "rule": "Password", + "severity": "medium", + "confidence": "moderate", + "line_data_list": [ + { + "line": "password : Pas1wrD2", + "line_num": 30, + "path": "./tests/samples/table.html", + "info": "./tests/samples/table.html|HTML", + "value": "Pas1wrD2", + "value_start": 11, + "value_end": 19, + "variable": "password", + "variable_start": 0, + "variable_end": 8, + "entropy_validation": { + "iterator": "BASE64_CHARS", 
+ "entropy": 3.0, + "valid": false + } + } + ] + }, + { + "api_validation": "NOT_AVAILABLE", + "ml_validation": "VALIDATED_KEY", + "ml_probability": 1.0, + "rule": "Token", + "severity": "medium", + "confidence": "moderate", + "line_data_list": [ + { + "line": "token : bace4d19-fa7e-beer-care-9129474bcd83", + "line_num": 34, + "path": "./tests/samples/table.html", + "info": "./tests/samples/table.html|HTML", + "value": "bace4d19-fa7e-beer-care-9129474bcd83", + "value_start": 8, + "value_end": 44, + "variable": "token", + "variable_start": 0, + "variable_end": 5, + "entropy_validation": { + "iterator": "BASE36_CHARS", + "entropy": 3.390375612802773, + "valid": true + } + } + ] + }, + { + "api_validation": "NOT_AVAILABLE", + "ml_validation": "VALIDATED_KEY", + "ml_probability": 0.913, + "rule": "Password", + "severity": "medium", + "confidence": "moderate", + "line_data_list": [ + { + "line": "password : Pas1wrD3", + "line_num": 38, + "path": "./tests/samples/table.html", + "info": "./tests/samples/table.html|HTML", + "value": "Pas1wrD3", + "value_start": 11, + "value_end": 19, + "variable": "password", + "variable_start": 0, + "variable_end": 8, + "entropy_validation": { + "iterator": "BASE64_CHARS", + "entropy": 3.0, + "valid": false + } + } + ] + }, + { + "api_validation": "NOT_AVAILABLE", + "ml_validation": "VALIDATED_KEY", + "ml_probability": 1.0, + "rule": "Token", + "severity": "medium", + "confidence": "moderate", + "line_data_list": [ + { + "line": "token : bace4d19-fa7e-beer-care-9129474bcd85", + "line_num": 42, + "path": "./tests/samples/table.html", + "info": "./tests/samples/table.html|HTML", + "value": "bace4d19-fa7e-beer-care-9129474bcd85", + "value_start": 8, + "value_end": 44, + "variable": "token", + "variable_start": 0, + "variable_end": 5, + "entropy_validation": { + "iterator": "BASE36_CHARS", + "entropy": 3.390375612802773, + "valid": true + } + } + ] + }, + { + "api_validation": "NOT_AVAILABLE", + "ml_validation": "VALIDATED_KEY", + 
"ml_probability": 1.0, + "rule": "Token", + "severity": "medium", + "confidence": "moderate", + "line_data_list": [ + { + "line": "token : bace4d19-fa7e-beer-care-9129474bcd86", + "line_num": 43, + "path": "./tests/samples/table.html", + "info": "./tests/samples/table.html|HTML", + "value": "bace4d19-fa7e-beer-care-9129474bcd86", + "value_start": 8, + "value_end": 44, + "variable": "token", + "variable_start": 0, + "variable_end": 5, + "entropy_validation": { + "iterator": "BASE36_CHARS", + "entropy": 3.390375612802773, + "valid": true + } + } + ] + }, + { + "api_validation": "NOT_AVAILABLE", + "ml_validation": "VALIDATED_KEY", + "ml_probability": 0.881, + "rule": "Secret", + "severity": "medium", + "confidence": "moderate", + "line_data_list": [ + { + "line": "secret : 5EcRe7_r4", + "line_num": 44, + "path": "./tests/samples/table.html", + "info": "./tests/samples/table.html|HTML", + "value": "5EcRe7_r4", + "value_start": 9, + "value_end": 18, + "variable": "secret", + "variable_start": 0, + "variable_end": 6, + "entropy_validation": { + "iterator": "BASE64_CHARS", + "entropy": 2.8177111123931664, + "valid": false + } + } + ] + }, + { + "api_validation": "NOT_AVAILABLE", + "ml_validation": "VALIDATED_KEY", + "ml_probability": 0.95, + "rule": "Key", + "severity": "medium", + "confidence": "moderate", + "line_data_list": [ + { + "line": "key : 741852-321654-963852-654913", + "line_num": 45, + "path": "./tests/samples/table.html", + "info": "./tests/samples/table.html|HTML", + "value": "741852-321654-963852-654913", + "value_start": 6, + "value_end": 33, + "variable": "key", + "variable_start": 0, + "variable_end": 3, + "entropy_validation": { + "iterator": "BASE64_CHARS", + "entropy": 2.901587501522441, + "valid": false + } + } + ] + }, + { + "api_validation": "NOT_AVAILABLE", + "ml_validation": "VALIDATED_KEY", + "ml_probability": 0.944, + "rule": "Password", + "severity": "medium", + "confidence": "moderate", + "line_data_list": [ + { + "line": "password : 
Pas1wrD4", + "line_num": 47, + "path": "./tests/samples/table.html", + "info": "./tests/samples/table.html|HTML", + "value": "Pas1wrD4", + "value_start": 11, + "value_end": 19, + "variable": "password", + "variable_start": 0, + "variable_end": 8, + "entropy_validation": { + "iterator": "BASE64_CHARS", + "entropy": 3.0, + "valid": false + } + } + ] + }, + { + "api_validation": "NOT_AVAILABLE", + "ml_validation": "VALIDATED_KEY", + "ml_probability": 0.999, + "rule": "Token", + "severity": "medium", + "confidence": "moderate", + "line_data_list": [ + { + "line": "token : bace4d19-fa7e-beer-care-9129474bcd87", + "line_num": 51, + "path": "./tests/samples/table.html", + "info": "./tests/samples/table.html|HTML", + "value": "bace4d19-fa7e-beer-care-9129474bcd87", + "value_start": 8, + "value_end": 44, + "variable": "token", + "variable_start": 0, + "variable_end": 5, + "entropy_validation": { + "iterator": "BASE36_CHARS", + "entropy": 3.313850959964899, + "valid": true + } + } + ] + }, + { + "api_validation": "NOT_AVAILABLE", + "ml_validation": "VALIDATED_KEY", + "ml_probability": 0.999, + "rule": "Token", + "severity": "medium", + "confidence": "moderate", + "line_data_list": [ + { + "line": "token : bace4d19-fa7e-beer-care-9129474bcd88", + "line_num": 52, + "path": "./tests/samples/table.html", + "info": "./tests/samples/table.html|HTML", + "value": "bace4d19-fa7e-beer-care-9129474bcd88", + "value_start": 8, + "value_end": 44, + "variable": "token", + "variable_start": 0, + "variable_end": 5, + "entropy_validation": { + "iterator": "BASE36_CHARS", + "entropy": 3.3348200572472178, + "valid": true + } + } + ] + }, + { + "api_validation": "NOT_AVAILABLE", + "ml_validation": "VALIDATED_KEY", + "ml_probability": 0.858, + "rule": "Secret", + "severity": "medium", + "confidence": "moderate", + "line_data_list": [ + { + "line": "secret : 5EcRe7_r5", + "line_num": 53, + "path": "./tests/samples/table.html", + "info": "./tests/samples/table.html|HTML", + "value": "5EcRe7_r5", + 
"value_start": 9, + "value_end": 18, + "variable": "secret", + "variable_start": 0, + "variable_end": 6, + "entropy_validation": { + "iterator": "BASE64_CHARS", + "entropy": 2.595488890170944, + "valid": false + } + } + ] + }, + { + "api_validation": "NOT_AVAILABLE", + "ml_validation": "VALIDATED_KEY", + "ml_probability": 0.934, + "rule": "Key", + "severity": "medium", + "confidence": "moderate", + "line_data_list": [ + { + "line": "key : 321654-741852-963852-654980", + "line_num": 54, + "path": "./tests/samples/table.html", + "info": "./tests/samples/table.html|HTML", + "value": "321654-741852-963852-654980", + "value_start": 6, + "value_end": 33, + "variable": "key", + "variable_start": 0, + "variable_end": 3, + "entropy_validation": { + "iterator": "BASE36_CHARS", + "entropy": 3.0036203719729397, + "valid": true + } + } + ] + }, + { + "api_validation": "NOT_AVAILABLE", + "ml_validation": "VALIDATED_KEY", + "ml_probability": 0.955, + "rule": "Key", + "severity": "medium", + "confidence": "moderate", + "line_data_list": [ + { + "line": "key : 321654-741852-963852-654981", + "line_num": 55, + "path": "./tests/samples/table.html", + "info": "./tests/samples/table.html|HTML", + "value": "321654-741852-963852-654981", + "value_start": 6, + "value_end": 33, + "variable": "key", + "variable_start": 0, + "variable_end": 3, + "entropy_validation": { + "iterator": "BASE64_CHARS", + "entropy": 2.9015875015224406, + "valid": false + } + } + ] + }, + { + "api_validation": "NOT_AVAILABLE", + "ml_validation": "VALIDATED_KEY", + "ml_probability": 0.964, + "rule": "Password", + "severity": "medium", + "confidence": "moderate", + "line_data_list": [ + { + "line": "password : Pas1wrD5", + "line_num": 56, + "path": "./tests/samples/table.html", + "info": "./tests/samples/table.html|HTML", + "value": "Pas1wrD5", + "value_start": 11, + "value_end": 19, + "variable": "password", + "variable_start": 0, + "variable_end": 8, + "entropy_validation": { + "iterator": "BASE64_CHARS", + 
"entropy": 3.0, + "valid": false + } + } + ] + }, + { + "api_validation": "NOT_AVAILABLE", + "ml_validation": "VALIDATED_KEY", + "ml_probability": 0.999, + "rule": "Token", + "severity": "medium", + "confidence": "moderate", + "line_data_list": [ + { + "line": "token : bace4d19-fa7e-beer-care-9129474bcd89", + "line_num": 60, + "path": "./tests/samples/table.html", + "info": "./tests/samples/table.html|HTML", + "value": "bace4d19-fa7e-beer-care-9129474bcd89", + "value_start": 8, + "value_end": 44, + "variable": "token", + "variable_start": 0, + "variable_end": 5, + "entropy_validation": { + "iterator": "BASE36_CHARS", + "entropy": 3.3002335989739806, + "valid": true + } + } + ] + }, + { + "api_validation": "NOT_AVAILABLE", + "ml_validation": "VALIDATED_KEY", + "ml_probability": 0.999, + "rule": "Token", + "severity": "medium", + "confidence": "moderate", + "line_data_list": [ + { + "line": "token : bace4d19-fa7e-beer-care-9129474bcd80", + "line_num": 61, + "path": "./tests/samples/table.html", + "info": "./tests/samples/table.html|HTML", + "value": "bace4d19-fa7e-beer-care-9129474bcd80", + "value_start": 8, + "value_end": 44, + "variable": "token", + "variable_start": 0, + "variable_end": 5, + "entropy_validation": { + "iterator": "BASE36_CHARS", + "entropy": 3.390375612802773, + "valid": true + } + } + ] + }, + { + "api_validation": "NOT_AVAILABLE", + "ml_validation": "VALIDATED_KEY", + "ml_probability": 0.955, + "rule": "Key", + "severity": "medium", + "confidence": "moderate", + "line_data_list": [ + { + "line": "key : 321654-963852-654987-741851", + "line_num": 62, + "path": "./tests/samples/table.html", + "info": "./tests/samples/table.html|HTML", + "value": "321654-963852-654987-741851", + "value_start": 6, + "value_end": 33, + "variable": "key", + "variable_start": 0, + "variable_end": 3, + "entropy_validation": { + "iterator": "BASE64_CHARS", + "entropy": 2.9295462978988653, + "valid": false + } + } + ] + }, + { + "api_validation": "NOT_AVAILABLE", + 
"ml_validation": "VALIDATED_KEY", + "ml_probability": 0.944, + "rule": "Key", + "severity": "medium", + "confidence": "moderate", + "line_data_list": [ + { + "line": "key : 321654-963852-654987-741853", + "line_num": 63, + "path": "./tests/samples/table.html", + "info": "./tests/samples/table.html|HTML", + "value": "321654-963852-654987-741853", + "value_start": 6, + "value_end": 33, + "variable": "key", + "variable_start": 0, + "variable_end": 3, + "entropy_validation": { + "iterator": "BASE64_CHARS", + "entropy": 2.9295462978988653, + "valid": false + } + } + ] + }, + { + "api_validation": "NOT_AVAILABLE", + "ml_validation": "VALIDATED_KEY", + "ml_probability": 0.996, + "rule": "Password", + "severity": "medium", + "confidence": "moderate", + "line_data_list": [ + { + "line": "password : Pas1wrD67", + "line_num": 64, + "path": "./tests/samples/table.html", + "info": "./tests/samples/table.html|HTML", + "value": "Pas1wrD67", + "value_start": 11, + "value_end": 20, + "variable": "password", + "variable_start": 0, + "variable_end": 8, + "entropy_validation": { + "iterator": "BASE64_CHARS", + "entropy": 3.169925001442312, + "valid": false + } + } + ] + }, + { + "api_validation": "NOT_AVAILABLE", + "ml_validation": "VALIDATED_KEY", + "ml_probability": 0.999, + "rule": "Token", + "severity": "medium", + "confidence": "moderate", + "line_data_list": [ + { + "line": "token : bace4d19-fa7e-beer-care-9129474bcd11", + "line_num": 67, + "path": "./tests/samples/table.html", + "info": "./tests/samples/table.html|HTML", + "value": "bace4d19-fa7e-beer-care-9129474bcd11", + "value_start": 8, + "value_end": 44, + "variable": "token", + "variable_start": 0, + "variable_end": 5, + "entropy_validation": { + "iterator": "BASE36_CHARS", + "entropy": 3.2237089461361066, + "valid": true + } + } + ] + }, + { + "api_validation": "NOT_AVAILABLE", + "ml_validation": "VALIDATED_KEY", + "ml_probability": 0.999, + "rule": "Token", + "severity": "medium", + "confidence": "moderate", + 
"line_data_list": [ + { + "line": "token : bace4d19-fa7e-beer-care-9129474bcd22", + "line_num": 68, + "path": "./tests/samples/table.html", + "info": "./tests/samples/table.html|HTML", + "value": "bace4d19-fa7e-beer-care-9129474bcd22", + "value_start": 8, + "value_end": 44, + "variable": "token", + "variable_start": 0, + "variable_end": 5, + "entropy_validation": { + "iterator": "BASE36_CHARS", + "entropy": 3.2582954044093437, + "valid": true + } + } + ] + }, + { + "api_validation": "NOT_AVAILABLE", + "ml_validation": "VALIDATED_KEY", + "ml_probability": 0.992, + "rule": "Key", + "severity": "medium", + "confidence": "moderate", + "line_data_list": [ + { + "line": "key : 654987-321654-963852-741851", + "line_num": 69, + "path": "./tests/samples/table.html", + "info": "./tests/samples/table.html|HTML", + "value": "654987-321654-963852-741851", + "value_start": 6, + "value_end": 33, + "variable": "key", + "variable_start": 0, + "variable_end": 3, + "entropy_validation": { + "iterator": "BASE64_CHARS", + "entropy": 2.9295462978988653, + "valid": false + } + } + ] + }, + { + "api_validation": "NOT_AVAILABLE", + "ml_validation": "VALIDATED_KEY", + "ml_probability": 0.991, + "rule": "Key", + "severity": "medium", + "confidence": "moderate", + "line_data_list": [ + { + "line": "key : 654987-321654-963852-741852", + "line_num": 70, + "path": "./tests/samples/table.html", + "info": "./tests/samples/table.html|HTML", + "value": "654987-321654-963852-741852", + "value_start": 6, + "value_end": 33, + "variable": "key", + "variable_start": 0, + "variable_end": 3, + "entropy_validation": { + "iterator": "BASE64_CHARS", + "entropy": 2.9295462978988653, + "valid": false + } + } + ] + }, + { + "api_validation": "NOT_AVAILABLE", + "ml_validation": "VALIDATED_KEY", + "ml_probability": 0.99, + "rule": "Key", + "severity": "medium", + "confidence": "moderate", + "line_data_list": [ + { + "line": "key : 654987-321654-963852-741853", + "line_num": 71, + "path": 
"./tests/samples/table.html", + "info": "./tests/samples/table.html|HTML", + "value": "654987-321654-963852-741853", + "value_start": 6, + "value_end": 33, + "variable": "key", + "variable_start": 0, + "variable_end": 3, + "entropy_validation": { + "iterator": "BASE64_CHARS", + "entropy": 2.9295462978988653, + "valid": false + } + } + ] + }, { "api_validation": "NOT_AVAILABLE", "ml_validation": "NOT_AVAILABLE", @@ -11290,13 +12154,13 @@ { "api_validation": "NOT_AVAILABLE", "ml_validation": "VALIDATED_KEY", - "ml_probability": 0.937, + "ml_probability": 0.903, "rule": "Password", "severity": "medium", "confidence": "moderate", "line_data_list": [ { - "line": "password = 0dm1nk0", + "line": "password : 0dm1nk0", "line_num": 29, "path": "./tests/samples/test.html", "info": "./tests/samples/test.html|HTML", @@ -11323,7 +12187,7 @@ "confidence": "moderate", "line_data_list": [ { - "line": "secret = BNbNbws73bdhss329ssakKhds120384", + "line": "secret : BNbNbws73bdhss329ssakKhds120384", "line_num": 32, "path": "./tests/samples/test.html", "info": "./tests/samples/test.html|HTML", @@ -11344,13 +12208,13 @@ { "api_validation": "NOT_AVAILABLE", "ml_validation": "VALIDATED_KEY", - "ml_probability": 0.993, + "ml_probability": 0.988, "rule": "Password", "severity": "medium", "confidence": "moderate", "line_data_list": [ { - "line": "password = Cr3DeHTbIal", + "line": "password : Cr3DeHTbIal", "line_num": 42, "path": "./tests/samples/test.html", "info": "./tests/samples/test.html|HTML", @@ -11377,7 +12241,7 @@ "confidence": "moderate", "line_data_list": [ { - "line": "secret = Ndjbwu88s22ygavsdhgt5454v3h1x", + "line": "secret : Ndjbwu88s22ygavsdhgt5454v3h1x", "line_num": 45, "path": "./tests/samples/test.html", "info": "./tests/samples/test.html|HTML", @@ -11398,13 +12262,13 @@ { "api_validation": "NOT_AVAILABLE", "ml_validation": "VALIDATED_KEY", - "ml_probability": 0.997, + "ml_probability": 0.996, "rule": "Token", "severity": "medium", "confidence": "moderate", 
"line_data_list": [ { - "line": "token = H72gsdv2dswPneHduwhfd", + "line": "token : H72gsdv2dswPneHduwhfd", "line_num": 65, "path": "./tests/samples/test.html", "info": "./tests/samples/test.html|HTML", @@ -11431,7 +12295,7 @@ "confidence": "moderate", "line_data_list": [ { - "line": "password = p@$$w0Rd42", + "line": "password : p@$$w0Rd42", "line_num": 71, "path": "./tests/samples/test.html", "info": "./tests/samples/test.html|HTML", @@ -11459,7 +12323,7 @@ "line_data_list": [ { "line": "508627689:AAEuLPKs-EhrjrYGnz60bnYNZqakf6HJxc0", - "line_num": 76, + "line_num": 83, "path": "./tests/samples/test.html", "info": "./tests/samples/test.html|HTML", "value": "508627689:AAEuLPKs-EhrjrYGnz60bnYNZqakf6HJxc0", @@ -11486,7 +12350,7 @@ "line_data_list": [ { "line": "# 94 ya29.dshMb48ehfXwydAj34D32J", - "line_num": 89, + "line_num": 92, "path": "./tests/samples/test.html", "info": "./tests/samples/test.html|HTML", "value": "ya29.dshMb48ehfXwydAj34D32J", @@ -11513,7 +12377,7 @@ "line_data_list": [ { "line": "# 95 dop_v1_425522a565f532bc6532d453422e50334a42f5242a3090fbe553b543b124259b", - "line_num": 91, + "line_num": 94, "path": "./tests/samples/test.html", "info": "./tests/samples/test.html|HTML", "value": "dop_v1_425522a565f532bc6532d453422e50334a42f5242a3090fbe553b543b124259b", @@ -11540,7 +12404,7 @@ "line_data_list": [ { "line": "# 94 ya29.dshMb48ehfXwydAj34D32J", - "line_num": 92, + "line_num": 96, "path": "./tests/samples/test.html", "info": "./tests/samples/test.html|HTML", "value": "ya29.dshMb48ehfXwydAj34D32J", @@ -11567,7 +12431,7 @@ "line_data_list": [ { "line": "# 95 dop_v1_425522a565f532bc6532d453422e50334a42f5242a3090fbe553b543b124259b", - "line_num": 94, + "line_num": 98, "path": "./tests/samples/test.html", "info": "./tests/samples/test.html|HTML", "value": "dop_v1_425522a565f532bc6532d453422e50334a42f5242a3090fbe553b543b124259b", @@ -11594,7 +12458,7 @@ "line_data_list": [ { "line": "the line will be found twice # 100 
EAACEb00Kse0BAlGy7KeQ5YnaCEd09Eose0cBAlGy7KeQ5Yna9CoDsup39tiYdoQ4jH9Coup39tiYdWoQ4jHFZD", - "line_num": 97, + "line_num": 100, "path": "./tests/samples/test.html", "info": "./tests/samples/test.html|HTML", "value": "EAACEb00Kse0BAlGy7KeQ5YnaCEd09Eose0cBAlGy7KeQ5Yna9CoDsup39tiYdoQ4jH9Coup39tiYdWoQ4jHFZD", @@ -11621,7 +12485,7 @@ "line_data_list": [ { "line": "the line will be found twice # 100 EAACEb00Kse0BAlGy7KeQ5YnaCEd09Eose0cBAlGy7KeQ5Yna9CoDsup39tiYdoQ4jH9Coup39tiYdWoQ4jHFZD", - "line_num": 100, + "line_num": 104, "path": "./tests/samples/test.html", "info": "./tests/samples/test.html|HTML", "value": "EAACEb00Kse0BAlGy7KeQ5YnaCEd09Eose0cBAlGy7KeQ5Yna9CoDsup39tiYdoQ4jH9Coup39tiYdWoQ4jHFZD", diff --git a/tests/data/doc.json b/tests/data/doc.json index 1a3fce99e..d5da09f84 100644 --- a/tests/data/doc.json +++ b/tests/data/doc.json @@ -13530,7 +13530,7 @@ "line_data_list": [ { "line": "508627689:AAEuLPKs-EhrjrYGnz60bnYNZqakf6HJxc0", - "line_num": 76, + "line_num": 83, "path": "./tests/samples/test.html", "info": "./tests/samples/test.html|HTML", "value": "508627689:AAEuLPKs-EhrjrYGnz60bnYNZqakf6HJxc0", @@ -13557,7 +13557,7 @@ "line_data_list": [ { "line": "# 94 ya29.dshMb48ehfXwydAj34D32J", - "line_num": 89, + "line_num": 96, "path": "./tests/samples/test.html", "info": "./tests/samples/test.html|HTML", "value": "ya29.dshMb48ehfXwydAj34D32J", @@ -13584,7 +13584,7 @@ "line_data_list": [ { "line": "# 95 dop_v1_425522a565f532bc6532d453422e50334a42f5242a3090fbe553b543b124259b", - "line_num": 91, + "line_num": 98, "path": "./tests/samples/test.html", "info": "./tests/samples/test.html|HTML", "value": "dop_v1_425522a565f532bc6532d453422e50334a42f5242a3090fbe553b543b124259b", @@ -13611,7 +13611,7 @@ "line_data_list": [ { "line": "the line will be found twice # 100 EAACEb00Kse0BAlGy7KeQ5YnaCEd09Eose0cBAlGy7KeQ5Yna9CoDsup39tiYdoQ4jH9Coup39tiYdWoQ4jHFZD", - "line_num": 97, + "line_num": 104, "path": "./tests/samples/test.html", "info": "./tests/samples/test.html|HTML", 
"value": "EAACEb00Kse0BAlGy7KeQ5YnaCEd09Eose0cBAlGy7KeQ5Yna9CoDsup39tiYdoQ4jH9Coup39tiYdWoQ4jHFZD", diff --git a/tests/samples/table.html b/tests/samples/table.html new file mode 100644 index 000000000..c30256549 --- /dev/null +++ b/tests/samples/table.html @@ -0,0 +1,77 @@ + + + + + Title + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
3tokensecretkeypassword
bace4d19-fa7e-beer-care-9129474bcd815EcRe7_r0AK1AGIREOGIAWSKEY555Pas1wrD0
bace4d19-fa7e-beer-care-9129474bcd82AKDR_C1r17-K3Y0-SeCrt-2SDFHBH2398SG5VF62VZVQFG2TYGVF9WYSGRDEAD-BEEF3TJDSLKGDFG4MTB34UHWYYSDFHKSDKJH34SPas1wrD2
3bace4d19-fa7e-beer-care-9129474bcd83c1R345_4s12fey10284-8946-3216-40103216-4010-0284-8946Pas1wrD3
4bace4d19-fa7e-beer-care-9129474bcd85bace4d19-fa7e-beer-care-9129474bcd865EcRe7_r4741852-321654-963852-654913184824-202847-623730-837462Pas1wrD4
5bace4d19-fa7e-beer-care-9129474bcd87bace4d19-fa7e-beer-care-9129474bcd885EcRe7_r5321654-741852-963852-654980321654-741852-963852-654981Pas1wrD5
6bace4d19-fa7e-beer-care-9129474bcd89bace4d19-fa7e-beer-care-9129474bcd80321654-963852-654987-741851321654-963852-654987-741853Pas1wrD67
bace4d19-fa7e-beer-care-9129474bcd11bace4d19-fa7e-beer-care-9129474bcd22654987-321654-963852-741851654987-321654-963852-741852654987-321654-963852-741853
+ + + \ No newline at end of file diff --git a/tests/test_main.py b/tests/test_main.py index e38a662be..ead40c8f1 100644 --- a/tests/test_main.py +++ b/tests/test_main.py @@ -630,12 +630,12 @@ def test_html_p(self) -> None: found_credentials = cred_sweeper.credential_manager.get_credentials() expected_credential_lines = [ "508627689:AAEuLPKs-EhrjrYGnz60bnYNZqakf6HJxc0", - "secret = Ndjbwu88s22ygavsdhgt5454v3h1x", - "password = Cr3DeHTbIal", - "password = 0dm1nk0", - "password = p@$$w0Rd42", - "secret = BNbNbws73bdhss329ssakKhds120384", - "token = H72gsdv2dswPneHduwhfd", + "secret : Ndjbwu88s22ygavsdhgt5454v3h1x", + "password : Cr3DeHTbIal", + "password : 0dm1nk0", + "password : p@$$w0Rd42", + "secret : BNbNbws73bdhss329ssakKhds120384", + "token : H72gsdv2dswPneHduwhfd", "td : Password: MU$T6Ef09#D!", "# 94 ya29.dshMb48ehfXwydAj34D32J", "# 95 dop_v1_425522a565f532bc6532d453422e50334a42f5242a3090fbe553b543b124259b",