Samsung · babenek · Nov 1, 2023 · Oct 21, 2023 · Oct 23, 2023 · Oct 23, 2023
@@ -45,3 +45,6 @@ ignore_missing_imports = True
 
 [mypy-password_strength.*]
 ignore_missing_imports = True
+
+[mypy-docx.*]
+ignore_missing_imports = True
@@ -19,6 +19,7 @@ def __init__(self, config: Dict[str, Any]) -> None:
         self.exclude_patterns: List[re.Pattern] = [re.compile(pattern) for pattern in config["exclude"]["pattern"]]
         self.exclude_paths: List[str] = config["exclude"]["path"]
         self.exclude_containers: List[str] = config["exclude"]["containers"]
+        self.exclude_documents: List[str] = config["exclude"]["documents"]
         self.exclude_extensions: List[str] = config["exclude"]["extension"]
         self.exclude_lines: Set[str] = set(config["exclude"].get("lines", []))
         self.exclude_values: Set[str] = set(config["exclude"].get("values", []))

@@ -16,6 +16,7 @@
 from credsweeper.utils import Util
 from .byte_scanner import ByteScanner
 from .bzip2_scanner import Bzip2Scanner
+from .docx_scanner import DocxScanner
 from .encoder_scanner import EncoderScanner
 from .gzip_scanner import GzipScanner
 from .html_scanner import HtmlScanner
@@ -34,6 +35,7 @@
 class DeepScanner(
     ByteScanner,  #
     Bzip2Scanner,  #
+    DocxScanner,  #
     EncoderScanner,  #
     GzipScanner,  #
     HtmlScanner,  #
@@ -71,6 +73,7 @@ def get_deep_scanners(data: bytes) -> List[Any]:
             deep_scanners.append(ZipScanner)
             # probably, there might be a docx, xlsx and so on.
             # It might be scanned with text representation in third-party libraries.
+            deep_scanners.append(DocxScanner)
         elif Util.is_bzip2(data):
             deep_scanners.append(Bzip2Scanner)
         elif Util.is_tar(data):

@@ -0,0 +1,44 @@
+import io
+import logging
+from abc import ABC
+from typing import List
+
+import docx
+
+from credsweeper.credentials import Candidate
+from credsweeper.deep_scanner.abstract_scanner import AbstractScanner
+from credsweeper.file_handler.data_content_provider import DataContentProvider
+from credsweeper.file_handler.string_content_provider import StringContentProvider
+
+logger = logging.getLogger(__name__)
+
+
+class DocxScanner(AbstractScanner, ABC):
+    """Implements pdf scanning"""
+
+    def data_scan(
+            self,  #
+            data_provider: DataContentProvider,  #
+            depth: int,  #
+            recursive_limit_size: int) -> List[Candidate]:
+        """Tries to scan PDF elements recursively and the whole text on page as strings"""
+        candidates = []
+
+        try:
+            docx_lines = []
+
+            doc = docx.Document(io.BytesIO(data_provider.data))
+            for paragraph in doc.paragraphs:
+                for line in paragraph.text.splitlines():
+                    if line:
+                        docx_lines.append(line)
+
+            string_data_provider = StringContentProvider(lines=docx_lines,
+                                                         file_path=data_provider.file_path,
+                                                         file_type=data_provider.file_type,
+                                                         info=f"{data_provider.info}|DOCX")
+            pdf_candidates = self.scanner.scan(string_data_provider)
+            candidates.extend(pdf_candidates)
+        except Exception as docx_exc:
+            logger.debug(f"{data_provider.file_path}:{docx_exc}")
+        return candidates
@@ -143,6 +143,9 @@ def check_exclude_file(config: Config, path: str) -> bool:
             return True
         if not config.depth and file_extension in config.exclude_containers:
             return True
+        # --depth or --doc enables scan for all documents extensions
+        if not (config.depth or config.doc) and file_extension in config.exclude_documents:
+            return True
         return False
 
     @staticmethod

@@ -4,13 +4,15 @@
         "containers": [
             ".apk",
             ".bz2",
-            ".docx",
             ".gz",
-            ".pdf",
             ".tar",
-            ".xlsx",
             ".zip"
         ],
+        "documents": [
+            ".docx",
+            ".pdf",
+            ".xlsx"
+        ],
         "extension": [
             ".7z",
             ".aac",
@@ -71,6 +73,7 @@
             "/__pycache__/",
             "/node_modules/",
             "/target/",
+            "/.venv/",
             "/venv/"
         ],
         "lines": [],

@@ -15,6 +15,7 @@ When paths to scan are entered, get the files in that paths and the files are ex
 - exclude
    - pattern: Regex patterns to exclude scan.
    - containers: Extensions in lower case of container files which might be scan with --depth option
+   - documents: Extensions in lower case of document files which might be scanned with the --doc or --depth option
    - extension: Extensions in lower case to exclude scan.
    - path: Paths to exclude scan.
 - source_ext: List of extensions for scanning categorized as source files.
@@ -36,6 +37,11 @@ When paths to scan are entered, get the files in that paths and the files are ex
             ".zip",
             ...
         ],
+        "documents": [
+            ".docx",
+            ".pdf",
+            ...
+        ],
         "extension": [
             ".7z",
             ".jpg",

@@ -9,6 +9,7 @@ openpyxl==3.1.2
 pandas==2.0.3
 # ^ the version supports by python 3.8
 PyYAML==6.0.1
+python-docx==1.0.1
 requests==2.31.0
 schwifty==2023.9.0
 typing_extensions==4.8.0
@@ -48,3 +49,4 @@ types-python-dateutil
 types-regex
 types-humanfriendly
 yapf
+
@@ -15,6 +15,7 @@
     "password-strength",  #
     "pdfminer.six",  #
     "PyYAML",  #
+    "python-docx",  #
     "requests",  #
     "scipy",  #
     "schwifty",  #

@@ -1,7 +1,7 @@
 from pathlib import Path
 
 # total number of files in test samples
-SAMPLES_FILES_COUNT: int = 120
+SAMPLES_FILES_COUNT: int = 123
 
 # credentials count after scan
 SAMPLES_CRED_COUNT: int = 383
@@ -11,10 +11,10 @@
 SAMPLES_POST_CRED_COUNT: int = 293
 
 # with option --doc
-SAMPLES_IN_DOC = 426
+SAMPLES_IN_DOC = 431
 
 # archived credentials that are not found without --depth
-SAMPLES_IN_DEEP_1 = SAMPLES_POST_CRED_COUNT + 16
+SAMPLES_IN_DEEP_1 = SAMPLES_POST_CRED_COUNT + 21
 SAMPLES_IN_DEEP_2 = SAMPLES_IN_DEEP_1 + 16
 SAMPLES_IN_DEEP_3 = SAMPLES_IN_DEEP_2 + 3
 

@@ -38,6 +38,9 @@ def config() -> Config:
     config_dict["validation"]["api_validation"] = False
     config_dict["use_filters"] = True
     config_dict["find_by_ext"] = False
+    config_dict["exclude"]["containers"] = [".gz", ".zip"]
+    config_dict["exclude"]["documents"] = [".docx", ".pdf"]
+    config_dict["exclude"]["extension"] = [".jpg", ".bmp"]
     config_dict["depth"] = 0
     config_dict["doc"] = False
     config_dict["find_by_ext_list"] = [".txt", ".inf"]

@@ -6266,30 +6266,6 @@
             }
         ]
     },
-    {
-        "api_validation": "NOT_AVAILABLE",
-        "ml_validation": "VALIDATED_KEY",
-        "ml_probability": 0.97709,
-        "rule": "Password",
-        "severity": "medium",
-        "line_data_list": [
-            {
-                "line": "password = Xdj@jcN834b.",
-                "line_num": 2,
-                "path": "tests/samples/password.docx",
-                "info": "tests/samples/password.docx|ZIP|word/document.xml|HTML",
-                "value": "Xdj@jcN834b.",
-                "value_start": 11,
-                "value_end": 23,
-                "variable": "password",
-                "entropy_validation": {
-                    "iterator": "BASE64_CHARS",
-                    "entropy": 2.8208020839342964,
-                    "valid": false
-                }
-            }
-        ]
-    },
     {
         "api_validation": "NOT_AVAILABLE",
         "ml_validation": "VALIDATED_KEY",
@@ -8144,6 +8120,102 @@
             }
         ]
     },
+    {
+        "api_validation": "NOT_AVAILABLE",
+        "ml_validation": "VALIDATED_KEY",
+        "ml_probability": 0.94412,
+        "rule": "Password",
+        "severity": "medium",
+        "line_data_list": [
+            {
+                "line": "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}t : Password = WeR15tr0n6",
+                "line_num": 1,
+                "path": "tests/samples/sample.docx",
+                "info": "tests/samples/sample.docx|ZIP|word/document.xml|XML",
+                "value": "WeR15tr0n6",
+                "value_start": 77,
+                "value_end": 87,
+                "variable": "Password",
+                "entropy_validation": {
+                    "iterator": "BASE64_CHARS",
+                    "entropy": 3.321928094887362,
+                    "valid": false
+                }
+            }
+        ]
+    },
+    {
+        "api_validation": "NOT_AVAILABLE",
+        "ml_validation": "NOT_AVAILABLE",
+        "ml_probability": null,
+        "rule": "Github Token",
+        "severity": "high",
+        "line_data_list": [
+            {
+                "line": "ghr_Ku7ikDwqD1Ge2u3Wf1UM3z2SLN230RP1I8Wf",
+                "line_num": 2,
+                "path": "tests/samples/sample.docx",
+                "info": "tests/samples/sample.docx|DOCX",
+                "value": "ghr_Ku7ikDwqD1Ge2u3Wf1UM3z2SLN230RP1I8Wf",
+                "value_start": 0,
+                "value_end": 40,
+                "variable": null,
+                "entropy_validation": {
+                    "iterator": "BASE64_CHARS",
+                    "entropy": 4.632263329852917,
+                    "valid": true
+                }
+            }
+        ]
+    },
+    {
+        "api_validation": "NOT_AVAILABLE",
+        "ml_validation": "VALIDATED_KEY",
+        "ml_probability": 0.94412,
+        "rule": "Password",
+        "severity": "medium",
+        "line_data_list": [
+            {
+                "line": "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}t : Password = WeR15tr0n6",
+                "line_num": 1,
+                "path": "tests/samples/sample.docx.gz",
+                "info": "tests/samples/sample.docx.gz|GZIP|tests/samples/sample.docx|ZIP|word/document.xml|XML",
+                "value": "WeR15tr0n6",
+                "value_start": 77,
+                "value_end": 87,
+                "variable": "Password",
+                "entropy_validation": {
+                    "iterator": "BASE64_CHARS",
+                    "entropy": 3.321928094887362,
+                    "valid": false
+                }
+            }
+        ]
+    },
+    {
+        "api_validation": "NOT_AVAILABLE",
+        "ml_validation": "NOT_AVAILABLE",
+        "ml_probability": null,
+        "rule": "Github Token",
+        "severity": "high",
+        "line_data_list": [
+            {
+                "line": "ghr_Ku7ikDwqD1Ge2u3Wf1UM3z2SLN230RP1I8Wf",
+                "line_num": 2,
+                "path": "tests/samples/sample.docx.gz",
+                "info": "tests/samples/sample.docx.gz|GZIP|tests/samples/sample.docx|DOCX",
+                "value": "ghr_Ku7ikDwqD1Ge2u3Wf1UM3z2SLN230RP1I8Wf",
+                "value_start": 0,
+                "value_end": 40,
+                "variable": null,
+                "entropy_validation": {
+                    "iterator": "BASE64_CHARS",
+                    "entropy": 4.632263329852917,
+                    "valid": true
+                }
+            }
+        ]
+    },
     {
         "api_validation": "NOT_AVAILABLE",
         "ml_validation": "VALIDATED_KEY",
@@ -8192,6 +8264,30 @@
             }
         ]
     },
+    {
+        "api_validation": "NOT_AVAILABLE",
+        "ml_validation": "NOT_AVAILABLE",
+        "ml_probability": null,
+        "rule": "Github Token",
+        "severity": "high",
+        "line_data_list": [
+            {
+                "line": "ghr_Ku7ikDwqD1Ge2u3Wf1UM3z2CLN230RP1I8Vd",
+                "line_num": 1,
+                "path": "tests/samples/sample.pdf",
+                "info": "tests/samples/sample.pdf|PDF:1|RAW",
+                "value": "ghr_Ku7ikDwqD1Ge2u3Wf1UM3z2CLN230RP1I8Vd",
+                "value_start": 0,
+                "value_end": 40,
+                "variable": null,
+                "entropy_validation": {
+                    "iterator": "BASE64_CHARS",
+                    "entropy": 4.732263329852917,
+                    "valid": true
+                }
+            }
+        ]
+    },
     {
         "api_validation": "NOT_AVAILABLE",
         "ml_validation": "VALIDATED_KEY",
@@ -8336,6 +8432,30 @@
             }
         ]
     },
+    {
+        "api_validation": "NOT_AVAILABLE",
+        "ml_validation": "NOT_AVAILABLE",
+        "ml_probability": null,
+        "rule": "Azure Secret Value",
+        "severity": "high",
+        "line_data_list": [
+            {
+                "line": "qpF8Q~PCM5MhMoyTFc5TYEomnzRUKim9UJhe8a2P",
+                "line_num": 1,
+                "path": "tests/samples/small.pdf",
+                "info": "tests/samples/small.pdf|PDF:1|RAW",
+                "value": "qpF8Q~PCM5MhMoyTFc5TYEomnzRUKim9UJhe8a2P",
+                "value_start": 0,
+                "value_end": 40,
+                "variable": null,
+                "entropy_validation": {
+                    "iterator": "BASE64_CHARS",
+                    "entropy": 4.620007704961091,
+                    "valid": true
+                }
+            }
+        ]
+    },
     {
         "api_validation": "NOT_AVAILABLE",
         "ml_validation": "VALIDATED_KEY",