From 85691678324ff334a60b48ff8333d32514668006 Mon Sep 17 00:00:00 2001
From: Roman Babenko
Date: Mon, 16 Oct 2023 08:35:29 +0300
Subject: [PATCH] from docx

---
Review notes (this area is free text, not part of the commit):

What the patch does
* credsweeper/app.py: _use_ml_validation() now returns True only when at
  least one candidate has use_ml set; otherwise it logs and returns False.
* credsweeper/app.py: new read-only property is_ml_validator_inited reports
  whether the private __ml_validator attribute is truthy, without creating
  the validator. The ml_validator getter now uses it for its lazy-creation
  check.
* tests/test_main.py: test_multiple_invocation_p scans three sample files
  (small.pdf, nonce.hs, password.gradle) and asserts that the validator is
  not created for the first one, is created after post_processing() of the
  second, and that the same validator object is kept for the third.

Changes made while restoring this patch (hunk line counts are unchanged)
* The unified-diff layout was rebuilt from a whitespace-collapsed copy.
  Indentation is reconstructed - verify the context and removed lines
  against the target tree, or apply with whitespace tolerance.
* Log message reworded to match the neighbouring "Skipping ML validation
  due to ..." message.
* Loop variable "i" renamed to "candidate"; docstring of
  is_ml_validator_inited reworded.
* The expected "line" for nonce.hs has its four leading spaces restored;
  value_start=14 and value_end=57 only match with that prefix.
* Expected "path" values are built with str(SAMPLES_PATH / name) instead of
  an f-string with a hard-coded "/". This assumes the reported path is
  str() of the path given to the provider - confirm against the provider.

 credsweeper/app.py |  15 +++++-
 tests/test_main.py | 117 ++++++++++++++++++++++++++++++++++++---------
 2 files changed, 107 insertions(+), 25 deletions(-)

diff --git a/credsweeper/app.py b/credsweeper/app.py
index 1a96ce6bb..8050b1fd4 100644
--- a/credsweeper/app.py
+++ b/credsweeper/app.py
@@ -160,7 +160,11 @@ def _use_ml_validation(self) -> bool:
         if not self.credential_manager.candidates:
             logger.info("Skipping ML validation due to no candidates found")
             return False
-        return True
+        for candidate in self.credential_manager.candidates:
+            if candidate.use_ml:
+                return True
+        logger.info("Skipping ML validation due to no candidates supporting it")
+        return False
 
     # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
 
@@ -170,11 +174,18 @@ def _use_ml_validation(self) -> bool:
 
     # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
 
+    @property
+    def is_ml_validator_inited(self) -> bool:
+        """Whether ml_validator was already created; reading this does not create it"""
+        return bool(self.__ml_validator)
+
+    # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
+
     @property
     def ml_validator(self) -> MlValidator:
         """ml_validator getter"""
         from credsweeper.ml_model import MlValidator
-        if not self.__ml_validator:
+        if not self.is_ml_validator_inited:
             self.__ml_validator: MlValidator = MlValidator(threshold=self.ml_threshold)
         assert self.__ml_validator, "self.__ml_validator was not initialized"
         return self.__ml_validator
diff --git a/tests/test_main.py b/tests/test_main.py
index 2081369b8..d94b76a02 100644
--- a/tests/test_main.py
+++ b/tests/test_main.py
@@ -362,30 +362,101 @@ def test_find_by_ext_and_not_ignore_p(self) -> None:
 
     def test_multiple_invocation_p(self) -> None:
         # test whether ml_validator is created once
-        files_counter = 0
-        candidates_number = 0
-        post_credentials_number = 0
+        self.maxDiff = None
         cred_sweeper = CredSweeper()
-        validator_id = None
-        for dir_path, _, filenames in os.walk(SAMPLES_PATH):
-            for filename in filenames:
-                files_counter += 1
-                provider = TextContentProvider(os.path.join(dir_path, filename))
-                candidates = cred_sweeper.file_scan(provider)
-                candidates_number += len(candidates)
-                cred_sweeper.credential_manager.set_credentials(candidates)
-                cred_sweeper.post_processing()
-                post_credentials = cred_sweeper.credential_manager.get_credentials()
-                post_credentials_number += len(post_credentials)
-                # verify that validator is the same
-                cred_sweeper_validator = cred_sweeper.ml_validator
-                self.assertIsNotNone(cred_sweeper_validator)
-                if validator_id is None:
-                    validator_id = id(cred_sweeper.ml_validator)
-                self.assertEqual(validator_id, id(cred_sweeper.ml_validator))
-        self.assertEqual(SAMPLES_FILES_COUNT, files_counter)
-        self.assertEqual(SAMPLES_CRED_COUNT, candidates_number)
-        self.assertEqual(SAMPLES_POST_CRED_COUNT, post_credentials_number)
+        self.assertFalse(cred_sweeper.is_ml_validator_inited)
+        # found candidate is not ML validated
+        provider = TextContentProvider(SAMPLES_PATH / "small.pdf")
+        candidates = cred_sweeper.file_scan(provider)
+        self.assertEqual(1, len(candidates))
+        self.assertDictEqual(
+
+            {"api_validation": "NOT_AVAILABLE",
+             "line_data_list": [{
+                 "entropy_validation": {
+                     "entropy": 4.620007704961091,
+                     "iterator": "BASE64_CHARS",
+                     "valid": True},
+                 "info": "",
+                 "line": "BT /F1 24 Tf 175 720 Td (qpF8Q~PCM5MhMoyTFc5TYEomnzRUKim9UJhe8a2P)Tj ET",
+                 "line_num": 15,
+                 "path": str(SAMPLES_PATH / "small.pdf"),
+                 "value": "qpF8Q~PCM5MhMoyTFc5TYEomnzRUKim9UJhe8a2P",
+                 "value_end": 65,
+                 "value_start": 25,
+                 "variable": None}],
+             "ml_probability": None,
+             "ml_validation": "NOT_AVAILABLE",
+             "rule": "Azure Secret Value",
+             "severity": "high"}
+            , candidates[0].to_json())
+        self.assertFalse(cred_sweeper.is_ml_validator_inited)
+        cred_sweeper.credential_manager.set_credentials(candidates)
+        cred_sweeper.post_processing()
+        self.assertFalse(cred_sweeper.is_ml_validator_inited)
+
+        # found candidate is ML validated
+        provider = TextContentProvider(SAMPLES_PATH / "nonce.hs")
+        candidates = cred_sweeper.file_scan(provider)
+        self.assertEqual(1, len(candidates))
+        self.assertDictEqual({
+            "api_validation": "NOT_AVAILABLE",
+            "line_data_list": [{
+                "entropy_validation": {
+                    "entropy": 4.9260374290200755,
+                    "iterator": "BASE64_CHARS",
+                    "valid": True},
+                "info": "",
+                "line": "    \"nonce\": \"qPRjfoZWaBPH0KbXMCicm5v1VdG5Hj0DUFMHdSxPOiA\"",
+                "line_num": 2,
+                "path": str(SAMPLES_PATH / "nonce.hs"),
+                "value": "qPRjfoZWaBPH0KbXMCicm5v1VdG5Hj0DUFMHdSxPOiA",
+                "value_end": 57,
+                "value_start": 14,
+                "variable": "nonce"}],
+            "ml_probability": None,
+            "ml_validation": "NOT_AVAILABLE",
+            "rule": "Nonce",
+            "severity": "medium"}
+            , candidates[0].to_json())
+        self.assertFalse(cred_sweeper.is_ml_validator_inited)
+        cred_sweeper.credential_manager.set_credentials(candidates)
+        cred_sweeper.post_processing()
+        self.assertTrue(cred_sweeper.is_ml_validator_inited)
+        # remember id of the validator
+        validator_id = id(cred_sweeper.ml_validator)
+
+        # found candidate is ML validated also
+        provider = TextContentProvider(SAMPLES_PATH / "password.gradle")
+        candidates = cred_sweeper.file_scan(provider)
+        self.assertEqual(1, len(candidates))
+        self.assertDictEqual({
+            "api_validation": "NOT_AVAILABLE",
+            "line_data_list": [{
+                "entropy_validation": {
+                    "entropy": 2.120589933192232,
+                    "iterator": "BASE64_CHARS",
+                    "valid": False},
+                "info": "",
+                "line": "password = \"cackle!\"",
+                "line_num": 1,
+                "path": str(SAMPLES_PATH / "password.gradle"),
+                "value": "cackle!",
+                "value_end": 19,
+                "value_start": 12,
+                "variable": "password"}],
+            "ml_probability": None,
+            "ml_validation": "NOT_AVAILABLE",
+            "rule": "Password",
+            "severity": "medium"}
+            , candidates[0].to_json())
+        # the ml_validator is still initialized
+        self.assertTrue(cred_sweeper.is_ml_validator_inited)
+        cred_sweeper.credential_manager.set_credentials(candidates)
+        cred_sweeper.post_processing()
+        self.assertTrue(cred_sweeper.is_ml_validator_inited)
+        # the same id of the validator
+        self.assertEqual(validator_id, id(cred_sweeper.ml_validator))
 
     # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
 