diff --git a/.mypy.ini b/.mypy.ini index 38c842392..cf4fd1082 100644 --- a/.mypy.ini +++ b/.mypy.ini @@ -45,3 +45,6 @@ ignore_missing_imports = True [mypy-password_strength.*] ignore_missing_imports = True + +[mypy-docx.*] +ignore_missing_imports = True diff --git a/credsweeper/config/config.py b/credsweeper/config/config.py index 275b49107..f53d9b794 100644 --- a/credsweeper/config/config.py +++ b/credsweeper/config/config.py @@ -19,6 +19,7 @@ def __init__(self, config: Dict[str, Any]) -> None: self.exclude_patterns: List[re.Pattern] = [re.compile(pattern) for pattern in config["exclude"]["pattern"]] self.exclude_paths: List[str] = config["exclude"]["path"] self.exclude_containers: List[str] = config["exclude"]["containers"] + self.exclude_documents: List[str] = config["exclude"]["documents"] self.exclude_extensions: List[str] = config["exclude"]["extension"] self.exclude_lines: Set[str] = set(config["exclude"].get("lines", [])) self.exclude_values: Set[str] = set(config["exclude"].get("values", [])) diff --git a/credsweeper/deep_scanner/deep_scanner.py b/credsweeper/deep_scanner/deep_scanner.py index e5db0bd76..16838cccb 100644 --- a/credsweeper/deep_scanner/deep_scanner.py +++ b/credsweeper/deep_scanner/deep_scanner.py @@ -16,6 +16,7 @@ from credsweeper.utils import Util from .byte_scanner import ByteScanner from .bzip2_scanner import Bzip2Scanner +from .docx_scanner import DocxScanner from .encoder_scanner import EncoderScanner from .gzip_scanner import GzipScanner from .html_scanner import HtmlScanner @@ -34,6 +35,7 @@ class DeepScanner( ByteScanner, # Bzip2Scanner, # + DocxScanner, # EncoderScanner, # GzipScanner, # HtmlScanner, # @@ -71,6 +73,7 @@ def get_deep_scanners(data: bytes) -> List[Any]: deep_scanners.append(ZipScanner) # probably, there might be a docx, xlxs and so on. # It might be scanned with text representation in third-party libraries. 
+ deep_scanners.append(DocxScanner) elif Util.is_bzip2(data): deep_scanners.append(Bzip2Scanner) elif Util.is_tar(data): diff --git a/credsweeper/deep_scanner/docx_scanner.py b/credsweeper/deep_scanner/docx_scanner.py new file mode 100644 index 000000000..c40df37b7 --- /dev/null +++ b/credsweeper/deep_scanner/docx_scanner.py @@ -0,0 +1,43 @@ +import io +import logging +from abc import ABC +from typing import List + +import docx + +from credsweeper.credentials import Candidate +from credsweeper.deep_scanner.abstract_scanner import AbstractScanner +from credsweeper.file_handler.data_content_provider import DataContentProvider +from credsweeper.file_handler.string_content_provider import StringContentProvider + +logger = logging.getLogger(__name__) + + +class DocxScanner(AbstractScanner, ABC): + """Implements docx scanning""" + + def data_scan( + self, # + data_provider: DataContentProvider, # + depth: int, # + recursive_limit_size: int) -> List[Candidate]: + """Tries to scan DOCX text with splitting by lines""" + candidates: List[Candidate] = [] + + try: + docx_lines: List[str] = [] + + doc = docx.Document(io.BytesIO(data_provider.data)) + for paragraph in doc.paragraphs: + for line in paragraph.text.splitlines(): + if line: + docx_lines.append(line) + + string_data_provider = StringContentProvider(lines=docx_lines, + file_path=data_provider.file_path, + file_type=data_provider.file_type, + info=f"{data_provider.info}|DOCX") + candidates = self.scanner.scan(string_data_provider) + except Exception as docx_exc: + logger.debug(f"{data_provider.file_path}:{docx_exc}") + return candidates diff --git a/credsweeper/file_handler/file_path_extractor.py b/credsweeper/file_handler/file_path_extractor.py index 84fa1213c..ba8dc6f5b 100644 --- a/credsweeper/file_handler/file_path_extractor.py +++ b/credsweeper/file_handler/file_path_extractor.py @@ -143,6 +143,9 @@ def check_exclude_file(config: Config, path: str) -> bool: return True if not config.depth and file_extension in 
config.exclude_containers: return True + # --depth or --doc enables scanning of all document extensions + if not (config.depth or config.doc) and file_extension in config.exclude_documents: + return True return False @staticmethod diff --git a/credsweeper/secret/config.json b/credsweeper/secret/config.json index 6914ac849..fa50bb5f5 100644 --- a/credsweeper/secret/config.json +++ b/credsweeper/secret/config.json @@ -4,13 +4,15 @@ "containers": [ ".apk", ".bz2", - ".docx", ".gz", - ".pdf", ".tar", ".xlsx", ".zip" ], + "documents": [ + ".docx", + ".pdf" + ], "extension": [ ".7z", ".aac", @@ -71,6 +73,7 @@ "/__pycache__/", "/node_modules/", "/target/", + "/.venv/", "/venv/" ], "lines": [], diff --git a/docs/source/overall_architecture.rst b/docs/source/overall_architecture.rst index 3344454a1..47412c496 100644 --- a/docs/source/overall_architecture.rst +++ b/docs/source/overall_architecture.rst @@ -15,6 +15,7 @@ When paths to scan are entered, get the files in that paths and the files are ex - exclude - pattern: Regex patterns to exclude scan. - containers: Extensions in lower case of container files which might be scan with --depth option + - documents: Extensions in lower case of document files which might be scanned with the --doc and/or --depth option - extension: Extensions in lower case to exclude scan. - path: Paths to exclude scan. - source_ext: List of extensions for scanning categorized as source files. @@ -36,6 +37,11 @@ When paths to scan are entered, get the files in that paths and the files are ex ".zip", ... ], + "documents": [ + ".docx", + ".pdf", + ... 
+ ], "extension": [ ".7z", ".jpg", diff --git a/requirements.txt b/requirements.txt index 0e903ccea..bee75005f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -9,6 +9,7 @@ openpyxl==3.1.2 pandas==2.0.3 # ^ the version supports by python 3.8 PyYAML==6.0.1 +python-docx==1.0.1 requests==2.31.0 schwifty==2023.9.0 typing_extensions==4.8.0 @@ -48,3 +49,4 @@ types-python-dateutil types-regex types-humanfriendly yapf + diff --git a/setup.py b/setup.py index 0353ce5e6..a5f591e8c 100644 --- a/setup.py +++ b/setup.py @@ -15,6 +15,7 @@ "password-strength", # "pdfminer.six", # "PyYAML", # + "python-docx", # "requests", # "scipy", # "schwifty", # diff --git a/tests/__init__.py b/tests/__init__.py index b62f5b743..7bdebc357 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -1,7 +1,7 @@ from pathlib import Path # total number of files in test samples -SAMPLES_FILES_COUNT: int = 120 +SAMPLES_FILES_COUNT: int = 123 # credentials count after scan SAMPLES_CRED_COUNT: int = 383 @@ -11,10 +11,10 @@ SAMPLES_POST_CRED_COUNT: int = 296 # with option --doc -SAMPLES_IN_DOC = 422 +SAMPLES_IN_DOC = 427 # archived credentials that are not found without --depth -SAMPLES_IN_DEEP_1 = SAMPLES_POST_CRED_COUNT + 16 +SAMPLES_IN_DEEP_1 = SAMPLES_POST_CRED_COUNT + 21 SAMPLES_IN_DEEP_2 = SAMPLES_IN_DEEP_1 + 16 SAMPLES_IN_DEEP_3 = SAMPLES_IN_DEEP_2 + 3 diff --git a/tests/conftest.py b/tests/conftest.py index 49f685146..4f8da811b 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -38,6 +38,9 @@ def config() -> Config: config_dict["validation"]["api_validation"] = False config_dict["use_filters"] = True config_dict["find_by_ext"] = False + config_dict["exclude"]["containers"] = [".gz", ".zip"] + config_dict["exclude"]["documents"] = [".docx", ".pdf"] + config_dict["exclude"]["extension"] = [".jpg", ".bmp"] config_dict["depth"] = 0 config_dict["doc"] = False config_dict["find_by_ext_list"] = [".txt", ".inf"] diff --git a/tests/data/depth_3.json b/tests/data/depth_3.json index 
a28445ab9..880f7a216 100644 --- a/tests/data/depth_3.json +++ b/tests/data/depth_3.json @@ -6338,30 +6338,6 @@ } ] }, - { - "api_validation": "NOT_AVAILABLE", - "ml_validation": "VALIDATED_KEY", - "ml_probability": 0.97709, - "rule": "Password", - "severity": "medium", - "line_data_list": [ - { - "line": "password = Xdj@jcN834b.", - "line_num": 2, - "path": "tests/samples/password.docx", - "info": "tests/samples/password.docx|ZIP|word/document.xml|HTML", - "value": "Xdj@jcN834b.", - "value_start": 11, - "value_end": 23, - "variable": "password", - "entropy_validation": { - "iterator": "BASE64_CHARS", - "entropy": 2.8208020839342964, - "valid": false - } - } - ] - }, { "api_validation": "NOT_AVAILABLE", "ml_validation": "VALIDATED_KEY", @@ -8216,6 +8192,102 @@ } ] }, + { + "api_validation": "NOT_AVAILABLE", + "ml_validation": "VALIDATED_KEY", + "ml_probability": 0.94412, + "rule": "Password", + "severity": "medium", + "line_data_list": [ + { + "line": "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}t : Password = WeR15tr0n6", + "line_num": 1, + "path": "tests/samples/sample.docx", + "info": "tests/samples/sample.docx|ZIP|word/document.xml|XML", + "value": "WeR15tr0n6", + "value_start": 77, + "value_end": 87, + "variable": "Password", + "entropy_validation": { + "iterator": "BASE64_CHARS", + "entropy": 3.321928094887362, + "valid": false + } + } + ] + }, + { + "api_validation": "NOT_AVAILABLE", + "ml_validation": "NOT_AVAILABLE", + "ml_probability": null, + "rule": "Github Token", + "severity": "high", + "line_data_list": [ + { + "line": "ghr_Ku7ikDwqD1Ge2u3Wf1UM3z2SLN230RP1I8Wf", + "line_num": 2, + "path": "tests/samples/sample.docx", + "info": "tests/samples/sample.docx|DOCX", + "value": "ghr_Ku7ikDwqD1Ge2u3Wf1UM3z2SLN230RP1I8Wf", + "value_start": 0, + "value_end": 40, + "variable": null, + "entropy_validation": { + "iterator": "BASE64_CHARS", + "entropy": 4.632263329852917, + "valid": true + } + } + ] + }, + { + "api_validation": "NOT_AVAILABLE", + 
"ml_validation": "VALIDATED_KEY", + "ml_probability": 0.94412, + "rule": "Password", + "severity": "medium", + "line_data_list": [ + { + "line": "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}t : Password = WeR15tr0n6", + "line_num": 1, + "path": "tests/samples/sample.docx.gz", + "info": "tests/samples/sample.docx.gz|GZIP|tests/samples/sample.docx|ZIP|word/document.xml|XML", + "value": "WeR15tr0n6", + "value_start": 77, + "value_end": 87, + "variable": "Password", + "entropy_validation": { + "iterator": "BASE64_CHARS", + "entropy": 3.321928094887362, + "valid": false + } + } + ] + }, + { + "api_validation": "NOT_AVAILABLE", + "ml_validation": "NOT_AVAILABLE", + "ml_probability": null, + "rule": "Github Token", + "severity": "high", + "line_data_list": [ + { + "line": "ghr_Ku7ikDwqD1Ge2u3Wf1UM3z2SLN230RP1I8Wf", + "line_num": 2, + "path": "tests/samples/sample.docx.gz", + "info": "tests/samples/sample.docx.gz|GZIP|tests/samples/sample.docx|DOCX", + "value": "ghr_Ku7ikDwqD1Ge2u3Wf1UM3z2SLN230RP1I8Wf", + "value_start": 0, + "value_end": 40, + "variable": null, + "entropy_validation": { + "iterator": "BASE64_CHARS", + "entropy": 4.632263329852917, + "valid": true + } + } + ] + }, { "api_validation": "NOT_AVAILABLE", "ml_validation": "NOT_AVAILABLE", @@ -8264,6 +8336,30 @@ } ] }, + { + "api_validation": "NOT_AVAILABLE", + "ml_validation": "NOT_AVAILABLE", + "ml_probability": null, + "rule": "Github Token", + "severity": "high", + "line_data_list": [ + { + "line": "ghr_Ku7ikDwqD1Ge2u3Wf1UM3z2CLN230RP1I8Vd", + "line_num": 1, + "path": "tests/samples/sample.pdf", + "info": "tests/samples/sample.pdf|PDF:1|RAW", + "value": "ghr_Ku7ikDwqD1Ge2u3Wf1UM3z2CLN230RP1I8Vd", + "value_start": 0, + "value_end": 40, + "variable": null, + "entropy_validation": { + "iterator": "BASE64_CHARS", + "entropy": 4.732263329852917, + "valid": true + } + } + ] + }, { "api_validation": "NOT_AVAILABLE", "ml_validation": "VALIDATED_KEY", @@ -8408,6 +8504,30 @@ } ] }, + { + 
"api_validation": "NOT_AVAILABLE", + "ml_validation": "NOT_AVAILABLE", + "ml_probability": null, + "rule": "Azure Secret Value", + "severity": "high", + "line_data_list": [ + { + "line": "qpF8Q~PCM5MhMoyTFc5TYEomnzRUKim9UJhe8a2P", + "line_num": 1, + "path": "tests/samples/small.pdf", + "info": "tests/samples/small.pdf|PDF:1|RAW", + "value": "qpF8Q~PCM5MhMoyTFc5TYEomnzRUKim9UJhe8a2P", + "value_start": 0, + "value_end": 40, + "variable": null, + "entropy_validation": { + "iterator": "BASE64_CHARS", + "entropy": 4.620007704961091, + "valid": true + } + } + ] + }, { "api_validation": "NOT_AVAILABLE", "ml_validation": "NOT_AVAILABLE", diff --git a/tests/data/doc.json b/tests/data/doc.json index f505ae0ad..e88d39e40 100644 --- a/tests/data/doc.json +++ b/tests/data/doc.json @@ -11222,6 +11222,102 @@ } ] }, + { + "api_validation": "NOT_AVAILABLE", + "ml_validation": "NOT_AVAILABLE", + "ml_probability": null, + "rule": "PASSWD_PAIR", + "severity": "medium", + "line_data_list": [ + { + "line": "Password = WeR15tr0n6", + "line_num": 1, + "path": "tests/samples/sample.docx", + "info": "tests/samples/sample.docx|DOCX", + "value": "WeR15tr0n6", + "value_start": 11, + "value_end": 21, + "variable": "Password", + "entropy_validation": { + "iterator": "BASE64_CHARS", + "entropy": 3.321928094887362, + "valid": false + } + } + ] + }, + { + "api_validation": "NOT_AVAILABLE", + "ml_validation": "NOT_AVAILABLE", + "ml_probability": null, + "rule": "Github Token", + "severity": "high", + "line_data_list": [ + { + "line": "ghr_Ku7ikDwqD1Ge2u3Wf1UM3z2SLN230RP1I8Wf", + "line_num": 2, + "path": "tests/samples/sample.docx", + "info": "tests/samples/sample.docx|DOCX", + "value": "ghr_Ku7ikDwqD1Ge2u3Wf1UM3z2SLN230RP1I8Wf", + "value_start": 0, + "value_end": 40, + "variable": null, + "entropy_validation": { + "iterator": "BASE64_CHARS", + "entropy": 4.632263329852917, + "valid": true + } + } + ] + }, + { + "api_validation": "NOT_AVAILABLE", + "ml_validation": "NOT_AVAILABLE", + 
"ml_probability": null, + "rule": "PASSWD_PAIR", + "severity": "medium", + "line_data_list": [ + { + "line": "password = Xdj@jcN834b", + "line_num": 1, + "path": "tests/samples/sample.pdf", + "info": "tests/samples/sample.pdf|PDF", + "value": "Xdj@jcN834b", + "value_start": 11, + "value_end": 22, + "variable": "password", + "entropy_validation": { + "iterator": "BASE64_CHARS", + "entropy": 2.963119653306635, + "valid": false + } + } + ] + }, + { + "api_validation": "NOT_AVAILABLE", + "ml_validation": "NOT_AVAILABLE", + "ml_probability": null, + "rule": "Github Token", + "severity": "high", + "line_data_list": [ + { + "line": "ghr_Ku7ikDwqD1Ge2u3Wf1UM3z2CLN230RP1I8Vd", + "line_num": 3, + "path": "tests/samples/sample.pdf", + "info": "tests/samples/sample.pdf|PDF", + "value": "ghr_Ku7ikDwqD1Ge2u3Wf1UM3z2CLN230RP1I8Vd", + "value_start": 0, + "value_end": 40, + "variable": null, + "entropy_validation": { + "iterator": "BASE64_CHARS", + "entropy": 4.732263329852917, + "valid": true + } + } + ] + }, { "api_validation": "NOT_AVAILABLE", "ml_validation": "NOT_AVAILABLE", @@ -11342,6 +11438,30 @@ } ] }, + { + "api_validation": "NOT_AVAILABLE", + "ml_validation": "NOT_AVAILABLE", + "ml_probability": null, + "rule": "Azure Secret Value", + "severity": "high", + "line_data_list": [ + { + "line": "qpF8Q~PCM5MhMoyTFc5TYEomnzRUKim9UJhe8a2P", + "line_num": 1, + "path": "tests/samples/small.pdf", + "info": "tests/samples/small.pdf|PDF", + "value": "qpF8Q~PCM5MhMoyTFc5TYEomnzRUKim9UJhe8a2P", + "value_start": 0, + "value_end": 40, + "variable": null, + "entropy_validation": { + "iterator": "BASE64_CHARS", + "entropy": 4.620007704961091, + "valid": true + } + } + ] + }, { "api_validation": "NOT_AVAILABLE", "ml_validation": "NOT_AVAILABLE", diff --git a/tests/file_handler/test_file_path_extractor.py b/tests/file_handler/test_file_path_extractor.py index 16fb5236c..49bcfaf13 100644 --- a/tests/file_handler/test_file_path_extractor.py +++ b/tests/file_handler/test_file_path_extractor.py 
@@ -1,9 +1,11 @@ import os.path +import re import tempfile +import unittest +from typing import List from unittest import mock import git -import pytest from humanfriendly import parse_size from credsweeper.config import Config @@ -11,20 +13,62 @@ from tests import AZ_STRING -class TestFilePathExtractor: +class TestFilePathExtractor(unittest.TestCase): + + def setUp(self): + config_dict = { + "size_limit": None, + "find_by_ext": False, + "find_by_ext_list": [], + "doc": False, + "depth": 0, + "exclude": { + "path": [], + "pattern": [], + "containers": [], + "documents": [], + "extension": [] + }, + "source_ext": [], + "source_quote_ext": [], + "check_for_literals": [], + "validation": { + "api_validation": False + }, + "use_filters": False, + "line_data_output": [], + "candidate_output": [], + "min_keyword_value_length": 0, + "min_pattern_value_length": 0, + } + self.config = Config(config_dict) + + # excluded always not_allowed_path_pattern + self.paths_not = ["dummy.css", "tmp/dummy.css", "c:\\temp\\dummy.css"] + # pattern + self.paths_reg = ["tmp/Magic/dummy.Number", "/tmp/log/MagicNumber.txt"] + # "/.git/" + self.paths_git = ["C:\\.git\\dummy", "./.git/dummy.sample", "~/.git\\dummy.txt"] + # not excluded + self.paths_src = ["dummy.py", "/tmp/dummy.py", "tmp/dummy.py", "C:\\dummy.py", "temp\\dummy.py"] + # not excluded when --depth are set + self.paths_pak = ["dummy.gz", "/tmp/dummy.gz", "tmp/dummy.gz", "C:\\dummy.gz", "temp\\dummy.gz"] + # not excluded when --doc or --depth are set + self.paths_doc = ["dummy.pdf", "/tmp/dummy.pdf", "tmp/dummy.pdf", "C:\\dummy.pdf", "temp\\dummy.pdf"] + # extension to be excluded always + self.paths_ext = ["dummy.so", "dummy.so", "/tmp/dummy.so", "tmp/dummy.so", "C:\\dummy.so", "temp\\dummy.so"] + + def tearDown(self): + del self.config def test_apply_gitignore_p(self) -> None: """Evaluate that code files would be included after filtering with .gitignore""" - files = ["file.py", "src/file.py", "src/dir/file.py"] - filtered_files 
= FilePathExtractor.apply_gitignore(files) - - assert set(filtered_files) == set(files) + self.assertSetEqual(set(files), set(filtered_files)) def test_apply_gitignore_n(self) -> None: """Evaluate that .gitignore correctly filters out files from project""" - with tempfile.TemporaryDirectory() as tmp_dir: git.Repo.init(tmp_dir) with open(os.path.join(tmp_dir, ".gitignore"), "w") as f: @@ -40,53 +84,97 @@ def test_apply_gitignore_n(self) -> None: ] filtered_files = FilePathExtractor.apply_gitignore(files) - assert len(filtered_files) == 1 - assert filtered_files[0] == os.path.join(tmp_dir, "src", "dir", "file.cpp") - - @pytest.mark.parametrize("file_path", [ - "/tmp/test/dummy.p12", - "C:\\Users\\RUNNER~1\\AppData\\Local\\Temp\\tmptjz2p1zk\\test\\dummy.p12", - "C:\\Users\\RUNNER~1\\AppData\\Local\\Temp\\tmptjz2p1zk\\TarGet\\dummy.p12", - ]) - def test_check_exclude_file_p(self, config: Config, file_path: pytest.fixture) -> None: - config.find_by_ext = True - assert not FilePathExtractor.check_exclude_file(config, file_path), f"{file_path}" - - @pytest.mark.parametrize("file_path", [ - "dummy.JPG", - "/tmp/target/dummy.p12", - "C:\\Users\\RUNNER~1\\AppData\\Local\\Temp\\tmptjz2p1zk\\target\\dummy.p12", - ]) - def test_check_exclude_file_n(self, config: Config, file_path: pytest.fixture) -> None: - config.find_by_ext = True - assert FilePathExtractor.check_exclude_file(config, file_path) - - @pytest.mark.parametrize("file_type", [".inf", ".txt"]) - def test_find_by_ext_file_p(self, config: Config, file_type: pytest.fixture) -> None: - config.find_by_ext = True - assert FilePathExtractor.is_find_by_ext_file(config, file_type) - - @pytest.mark.parametrize("file_type", [".bmp", ".doc"]) - def test_find_by_ext_file_n(self, config: Config, file_type: pytest.fixture) -> None: - assert not FilePathExtractor.is_find_by_ext_file(config, file_type) - config.find_by_ext = False - assert not FilePathExtractor.is_find_by_ext_file(config, file_type) + self.assertEqual(1, 
len(filtered_files)) + expected_path = os.path.join(tmp_dir, "src", "dir", "file.cpp") + self.assertEqual(expected_path, filtered_files[0]) + + def assert_true_check_exclude_file(self, paths: List[str]): + for i in paths: + self.assertTrue(FilePathExtractor.check_exclude_file(self.config, i), i) + + def assert_false_check_exclude_file(self, paths: List[str]): + for i in paths: + self.assertFalse(FilePathExtractor.check_exclude_file(self.config, i), i) + + def test_check_exclude_file_p(self) -> None: + # matched only not_allowed_path_pattern + self.config.exclude_containers = [".gz"] + self.config.exclude_documents = [".pdf"] + self.config.exclude_extensions = [".so"] + self.config.exclude_paths = ["/.git/"] + self.config.exclude_patterns = [re.compile(r".*magic.*number.*")] + self.config.depth = 1 + self.config.doc = False + self.assert_true_check_exclude_file(self.paths_not) + self.assert_true_check_exclude_file(self.paths_reg) + self.assert_true_check_exclude_file(self.paths_git) + self.assert_false_check_exclude_file(self.paths_src) + self.assert_false_check_exclude_file(self.paths_pak) + self.assert_false_check_exclude_file(self.paths_doc) + self.assert_true_check_exclude_file(self.paths_ext) + + # pdf should be not filtered + self.config.depth = 0 + self.config.doc = True + self.assert_true_check_exclude_file(self.paths_not) + self.assert_true_check_exclude_file(self.paths_reg) + self.assert_true_check_exclude_file(self.paths_git) + self.assert_false_check_exclude_file(self.paths_src) + self.assert_true_check_exclude_file(self.paths_pak) + self.assert_false_check_exclude_file(self.paths_doc) + self.assert_true_check_exclude_file(self.paths_ext) + + def test_check_exclude_file_n(self) -> None: + # none of extension are in config, only not_allowed_path_pattern matches + self.assert_true_check_exclude_file(self.paths_not) + self.assert_false_check_exclude_file(self.paths_reg) + self.assert_false_check_exclude_file(self.paths_git) + 
self.assert_false_check_exclude_file(self.paths_src) + self.assert_false_check_exclude_file(self.paths_pak) + self.assert_false_check_exclude_file(self.paths_doc) + self.assert_false_check_exclude_file(self.paths_ext) + + # matched only exclude_extensions + self.config.exclude_containers = [".gz"] + self.config.exclude_documents = [".pdf"] + self.config.exclude_extensions = [".so"] + self.assert_true_check_exclude_file(self.paths_not) + self.assert_false_check_exclude_file(self.paths_reg) + self.assert_false_check_exclude_file(self.paths_git) + self.assert_false_check_exclude_file(self.paths_src) + self.assert_true_check_exclude_file(self.paths_pak) + self.assert_true_check_exclude_file(self.paths_doc) + self.assert_true_check_exclude_file(self.paths_ext) + + def test_find_by_ext_file_p(self) -> None: + self.config.find_by_ext = True + self.config.find_by_ext_list = [".p12", ".jpg"] + self.assertTrue(FilePathExtractor.is_find_by_ext_file(self.config, ".p12")) + self.assertTrue(FilePathExtractor.is_find_by_ext_file(self.config, ".jpg")) + self.assertFalse(FilePathExtractor.is_find_by_ext_file(self.config, ".bmp")) + + def test_find_by_ext_file_n(self) -> None: + self.config.find_by_ext = False + self.config.find_by_ext_list = [".p12", ".bmp"] + self.assertFalse(FilePathExtractor.is_find_by_ext_file(self.config, ".p12")) + self.assertFalse(FilePathExtractor.is_find_by_ext_file(self.config, ".bmp")) + self.assertFalse(FilePathExtractor.is_find_by_ext_file(self.config, ".jpg")) @mock.patch("os.path.getsize") - def test_check_file_size_p(self, mock_getsize, config: Config) -> None: + def test_check_file_size_p(self, mock_getsize) -> None: mock_getsize.return_value = parse_size("11MiB") - config.size_limit = parse_size("10MiB") - assert FilePathExtractor.check_file_size(config, "") + self.config.size_limit = parse_size("10MiB") + self.assertTrue(FilePathExtractor.check_file_size(self.config, "")) @mock.patch("os.path.getsize") - def test_check_file_size_n(self, 
mock_getsize, config: Config) -> None: + def test_check_file_size_n(self, mock_getsize) -> None: mock_getsize.return_value = parse_size("11MiB") - config.size_limit = None - assert not FilePathExtractor.check_file_size(config, "") - config.size_limit = parse_size("11MiB") - assert not FilePathExtractor.check_file_size(config, "") + self.config.size_limit = None + self.assertFalse(FilePathExtractor.check_file_size(self.config, "")) + self.config.size_limit = parse_size("11MiB") + self.assertFalse(FilePathExtractor.check_file_size(self.config, "")) - def test_skip_symlink_n(self, config: Config) -> None: + def test_skip_symlink_n(self) -> None: with tempfile.TemporaryDirectory() as tmp_dir: sub_dir = os.path.join(tmp_dir, "sub_dir") os.mkdir(sub_dir) @@ -103,9 +191,9 @@ def test_skip_symlink_n(self, config: Config) -> None: for root, dirs, files in os.walk(tmp_dir): files_walked.update(files) dirs_walked.update(dirs) - assert dirs_walked == {"sub_dir", "s_dir_link"} - assert files_walked == {"target", "s_link"} + self.assertEqual({"sub_dir", "s_dir_link"}, dirs_walked) + self.assertEqual({"target", "s_link"}, files_walked) - paths = FilePathExtractor.get_file_paths(config, tmp_dir) - assert len(paths) == 1 - assert paths[0] == target_path + paths = FilePathExtractor.get_file_paths(self.config, tmp_dir) + self.assertEqual(1, len(paths)) + self.assertEqual(target_path, paths[0]) diff --git a/tests/samples/password.docx b/tests/samples/password.docx deleted file mode 100644 index 6d6db3a52..000000000 Binary files a/tests/samples/password.docx and /dev/null differ diff --git a/tests/samples/sample.docx b/tests/samples/sample.docx new file mode 100644 index 000000000..a8762b33f Binary files /dev/null and b/tests/samples/sample.docx differ diff --git a/tests/samples/sample.docx.gz b/tests/samples/sample.docx.gz new file mode 100644 index 000000000..7c4c56012 Binary files /dev/null and b/tests/samples/sample.docx.gz differ diff --git a/tests/samples/sample.pdf 
b/tests/samples/sample.pdf index e4b0a8229..d4a1c8acf 100644 Binary files a/tests/samples/sample.pdf and b/tests/samples/sample.pdf differ diff --git a/tests/samples/sample_bad_empty.docx b/tests/samples/sample_bad_empty.docx new file mode 100644 index 000000000..3b60e1683 Binary files /dev/null and b/tests/samples/sample_bad_empty.docx differ diff --git a/tests/samples/small.pdf b/tests/samples/small.pdf new file mode 100644 index 000000000..7b6e0bf5f Binary files /dev/null and b/tests/samples/small.pdf differ diff --git a/tests/test_main.py b/tests/test_main.py index 6619cc1a8..22412c89a 100644 --- a/tests/test_main.py +++ b/tests/test_main.py @@ -463,9 +463,9 @@ def test_pdf_p(self) -> None: cred_sweeper = CredSweeper(depth=33) cred_sweeper.run(content_provider=content_provider) found_credentials = cred_sweeper.credential_manager.get_credentials() - self.assertEqual(2, len(found_credentials)) - self.assertSetEqual({"AWS Client ID", "Password"}, set(i.rule_name for i in found_credentials)) - self.assertSetEqual({"Xdj@jcN834b", "AKIAGIREOGIAWSKEY123"}, + self.assertEqual(3, len(found_credentials)) + self.assertSetEqual({"AWS Client ID", "Password", "Github Token"}, set(i.rule_name for i in found_credentials)) + self.assertSetEqual({"Xdj@jcN834b", "AKIAGIREOGIAWSKEY123", "ghr_Ku7ikDwqD1Ge2u3Wf1UM3z2CLN230RP1I8Vd"}, set(i.line_data_list[0].value for i in found_credentials)) # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # @@ -556,19 +556,20 @@ def test_encoded_p(self) -> None: def test_docx_p(self) -> None: # test for finding credentials in docx - content_provider: FilesProvider = TextProvider([SAMPLES_PATH / "password.docx"]) - cred_sweeper = CredSweeper(depth=5) + content_provider: FilesProvider = TextProvider([SAMPLES_PATH / "sample.docx"]) + cred_sweeper = CredSweeper(doc=True) cred_sweeper.run(content_provider=content_provider) found_credentials = cred_sweeper.credential_manager.get_credentials() - self.assertEqual(1, 
len(found_credentials)) - self.assertEqual("Xdj@jcN834b.", found_credentials[0].line_data_list[0].value) + self.assertEqual(2, len(found_credentials)) + self.assertEqual("WeR15tr0n6", found_credentials[0].line_data_list[0].value) + self.assertEqual("ghr_Ku7ikDwqD1Ge2u3Wf1UM3z2SLN230RP1I8Wf", found_credentials[1].line_data_list[0].value) # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # def test_docx_n(self) -> None: - # test docx - no credential should be found without 'depth' - content_provider: FilesProvider = TextProvider([SAMPLES_PATH / "password.docx"]) - cred_sweeper = CredSweeper() + # test docx - no credential should be found without 'doc' + content_provider: FilesProvider = TextProvider([SAMPLES_PATH / "sample.docx"]) + cred_sweeper = CredSweeper(doc=False) cred_sweeper.run(content_provider=content_provider) found_credentials = cred_sweeper.credential_manager.get_credentials() self.assertEqual(0, len(found_credentials))