Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Parse docx in --doc mode #439

Merged
merged 7 commits into from
Nov 1, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .mypy.ini
Original file line number Diff line number Diff line change
Expand Up @@ -45,3 +45,6 @@ ignore_missing_imports = True

[mypy-password_strength.*]
ignore_missing_imports = True

[mypy-docx.*]
ignore_missing_imports = True
1 change: 1 addition & 0 deletions credsweeper/config/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ def __init__(self, config: Dict[str, Any]) -> None:
self.exclude_patterns: List[re.Pattern] = [re.compile(pattern) for pattern in config["exclude"]["pattern"]]
self.exclude_paths: List[str] = config["exclude"]["path"]
self.exclude_containers: List[str] = config["exclude"]["containers"]
self.exclude_documents: List[str] = config["exclude"]["documents"]
self.exclude_extensions: List[str] = config["exclude"]["extension"]
self.exclude_lines: Set[str] = set(config["exclude"].get("lines", []))
self.exclude_values: Set[str] = set(config["exclude"].get("values", []))
Expand Down
3 changes: 3 additions & 0 deletions credsweeper/deep_scanner/deep_scanner.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
from credsweeper.utils import Util
from .byte_scanner import ByteScanner
from .bzip2_scanner import Bzip2Scanner
from .docx_scanner import DocxScanner
from .encoder_scanner import EncoderScanner
from .gzip_scanner import GzipScanner
from .html_scanner import HtmlScanner
Expand All @@ -34,6 +35,7 @@
class DeepScanner(
ByteScanner, #
Bzip2Scanner, #
DocxScanner, #
EncoderScanner, #
GzipScanner, #
HtmlScanner, #
Expand Down Expand Up @@ -71,6 +73,7 @@ def get_deep_scanners(data: bytes) -> List[Any]:
deep_scanners.append(ZipScanner)
# probably, there might be a docx, xlsx and so on.
# It might be scanned with text representation in third-party libraries.
deep_scanners.append(DocxScanner)
elif Util.is_bzip2(data):
deep_scanners.append(Bzip2Scanner)
elif Util.is_tar(data):
Expand Down
43 changes: 43 additions & 0 deletions credsweeper/deep_scanner/docx_scanner.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
import io
import logging
from abc import ABC
from typing import List

import docx

from credsweeper.credentials import Candidate
from credsweeper.deep_scanner.abstract_scanner import AbstractScanner
from credsweeper.file_handler.data_content_provider import DataContentProvider
from credsweeper.file_handler.string_content_provider import StringContentProvider

logger = logging.getLogger(__name__)


class DocxScanner(AbstractScanner, ABC):
    """Deep scanner that reads paragraph text out of DOCX data and scans it line by line"""

    def data_scan(
            self,  #
            data_provider: DataContentProvider,  #
            depth: int,  #
            recursive_limit_size: int) -> List[Candidate]:
        """Scan the text of a DOCX document for credentials.

        The raw bytes of ``data_provider`` are opened with ``docx.Document``; the text of
        every item in ``doc.paragraphs`` is split into lines and the non-empty lines are
        handed to ``self.scanner`` through a ``StringContentProvider`` whose ``info`` is
        the original info with ``|DOCX`` appended.

        Args:
            data_provider: provider whose ``data`` holds the raw bytes of the document
            depth: not used by this scanner
            recursive_limit_size: not used by this scanner

        Return:
            Whatever ``self.scanner.scan`` returns for the extracted lines, or an empty
            list when any exception is raised while parsing or scanning

        """
        try:
            document = docx.Document(io.BytesIO(data_provider.data))

            text_lines: List[str] = []
            for paragraph in document.paragraphs:
                # filter(None, ...) keeps only the non-empty lines of the paragraph
                text_lines.extend(filter(None, paragraph.text.splitlines()))

            text_provider = StringContentProvider(
                lines=text_lines,
                file_path=data_provider.file_path,
                file_type=data_provider.file_type,
                info=f"{data_provider.info}|DOCX",
            )
            return self.scanner.scan(text_provider)
        except Exception as docx_exc:
            # best effort: a failure is only reported at debug level and yields no candidates
            logger.debug(f"{data_provider.file_path}:{docx_exc}")
        return []
3 changes: 3 additions & 0 deletions credsweeper/file_handler/file_path_extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -143,6 +143,9 @@ def check_exclude_file(config: Config, path: str) -> bool:
return True
if not config.depth and file_extension in config.exclude_containers:
return True
# --depth or --doc enables scan for all documents extensions
if not (config.depth or config.doc) and file_extension in config.exclude_documents:
return True
return False

@staticmethod
Expand Down
7 changes: 5 additions & 2 deletions credsweeper/secret/config.json
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,15 @@
"containers": [
".apk",
".bz2",
".docx",
".gz",
".pdf",
".tar",
".xlsx",
".zip"
],
"documents": [
".docx",
".pdf"
],
"extension": [
".7z",
".aac",
Expand Down Expand Up @@ -71,6 +73,7 @@
"/__pycache__/",
"/node_modules/",
"/target/",
"/.venv/",
csh519 marked this conversation as resolved.
Show resolved Hide resolved
"/venv/"
],
"lines": [],
Expand Down
6 changes: 6 additions & 0 deletions docs/source/overall_architecture.rst
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ When paths to scan are entered, get the files in that paths and the files are ex
- exclude
- pattern: Regex patterns to exclude scan.
- containers: Extensions in lower case of container files which might be scanned with the --depth option
- documents: Extensions in lower case of document files which might be scanned with the --doc and/or --depth option
- extension: Extensions in lower case to exclude scan.
- path: Paths to exclude scan.
- source_ext: List of extensions for scanning categorized as source files.
Expand All @@ -36,6 +37,11 @@ When paths to scan are entered, get the files in that paths and the files are ex
".zip",
...
],
"documents": [
".docx",
".pdf",
...
],
"extension": [
".7z",
".jpg",
Expand Down
2 changes: 2 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ openpyxl==3.1.2
pandas==2.0.3
# ^ the version supports by python 3.8
PyYAML==6.0.1
python-docx==1.0.1
requests==2.31.0
schwifty==2023.9.0
typing_extensions==4.8.0
Expand Down Expand Up @@ -48,3 +49,4 @@ types-python-dateutil
types-regex
types-humanfriendly
yapf

1 change: 1 addition & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
"password-strength", #
"pdfminer.six", #
"PyYAML", #
"python-docx", #
"requests", #
"scipy", #
"schwifty", #
Expand Down
6 changes: 3 additions & 3 deletions tests/__init__.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from pathlib import Path

# total number of files in test samples
SAMPLES_FILES_COUNT: int = 120
SAMPLES_FILES_COUNT: int = 123

# credentials count after scan
SAMPLES_CRED_COUNT: int = 383
Expand All @@ -11,10 +11,10 @@
SAMPLES_POST_CRED_COUNT: int = 296

# with option --doc
SAMPLES_IN_DOC = 422
SAMPLES_IN_DOC = 427

# archived credentials that are not found without --depth
SAMPLES_IN_DEEP_1 = SAMPLES_POST_CRED_COUNT + 16
SAMPLES_IN_DEEP_1 = SAMPLES_POST_CRED_COUNT + 21
SAMPLES_IN_DEEP_2 = SAMPLES_IN_DEEP_1 + 16
SAMPLES_IN_DEEP_3 = SAMPLES_IN_DEEP_2 + 3

Expand Down
3 changes: 3 additions & 0 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,9 @@ def config() -> Config:
config_dict["validation"]["api_validation"] = False
config_dict["use_filters"] = True
config_dict["find_by_ext"] = False
config_dict["exclude"]["containers"] = [".gz", ".zip"]
config_dict["exclude"]["documents"] = [".docx", ".pdf"]
config_dict["exclude"]["extension"] = [".jpg", ".bmp"]
config_dict["depth"] = 0
config_dict["doc"] = False
config_dict["find_by_ext_list"] = [".txt", ".inf"]
Expand Down
168 changes: 144 additions & 24 deletions tests/data/depth_3.json
Original file line number Diff line number Diff line change
Expand Up @@ -6338,30 +6338,6 @@
}
]
},
{
"api_validation": "NOT_AVAILABLE",
"ml_validation": "VALIDATED_KEY",
"ml_probability": 0.97709,
"rule": "Password",
"severity": "medium",
"line_data_list": [
{
"line": "password = Xdj@jcN834b.",
"line_num": 2,
"path": "tests/samples/password.docx",
"info": "tests/samples/password.docx|ZIP|word/document.xml|HTML",
"value": "Xdj@jcN834b.",
"value_start": 11,
"value_end": 23,
"variable": "password",
"entropy_validation": {
"iterator": "BASE64_CHARS",
"entropy": 2.8208020839342964,
"valid": false
}
}
]
},
{
"api_validation": "NOT_AVAILABLE",
"ml_validation": "VALIDATED_KEY",
Expand Down Expand Up @@ -8216,6 +8192,102 @@
}
]
},
{
"api_validation": "NOT_AVAILABLE",
"ml_validation": "VALIDATED_KEY",
"ml_probability": 0.94412,
"rule": "Password",
"severity": "medium",
"line_data_list": [
{
"line": "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}t : Password = WeR15tr0n6",
"line_num": 1,
"path": "tests/samples/sample.docx",
"info": "tests/samples/sample.docx|ZIP|word/document.xml|XML",
"value": "WeR15tr0n6",
"value_start": 77,
"value_end": 87,
"variable": "Password",
"entropy_validation": {
"iterator": "BASE64_CHARS",
"entropy": 3.321928094887362,
"valid": false
}
}
]
},
{
"api_validation": "NOT_AVAILABLE",
"ml_validation": "NOT_AVAILABLE",
"ml_probability": null,
"rule": "Github Token",
"severity": "high",
"line_data_list": [
{
"line": "ghr_Ku7ikDwqD1Ge2u3Wf1UM3z2SLN230RP1I8Wf",
"line_num": 2,
"path": "tests/samples/sample.docx",
"info": "tests/samples/sample.docx|DOCX",
"value": "ghr_Ku7ikDwqD1Ge2u3Wf1UM3z2SLN230RP1I8Wf",
"value_start": 0,
"value_end": 40,
"variable": null,
"entropy_validation": {
"iterator": "BASE64_CHARS",
"entropy": 4.632263329852917,
"valid": true
}
}
]
},
{
"api_validation": "NOT_AVAILABLE",
"ml_validation": "VALIDATED_KEY",
"ml_probability": 0.94412,
"rule": "Password",
"severity": "medium",
"line_data_list": [
{
"line": "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}t : Password = WeR15tr0n6",
"line_num": 1,
"path": "tests/samples/sample.docx.gz",
"info": "tests/samples/sample.docx.gz|GZIP|tests/samples/sample.docx|ZIP|word/document.xml|XML",
"value": "WeR15tr0n6",
"value_start": 77,
"value_end": 87,
"variable": "Password",
"entropy_validation": {
"iterator": "BASE64_CHARS",
"entropy": 3.321928094887362,
"valid": false
}
}
]
},
{
"api_validation": "NOT_AVAILABLE",
"ml_validation": "NOT_AVAILABLE",
"ml_probability": null,
"rule": "Github Token",
"severity": "high",
"line_data_list": [
{
"line": "ghr_Ku7ikDwqD1Ge2u3Wf1UM3z2SLN230RP1I8Wf",
"line_num": 2,
"path": "tests/samples/sample.docx.gz",
"info": "tests/samples/sample.docx.gz|GZIP|tests/samples/sample.docx|DOCX",
"value": "ghr_Ku7ikDwqD1Ge2u3Wf1UM3z2SLN230RP1I8Wf",
"value_start": 0,
"value_end": 40,
"variable": null,
"entropy_validation": {
"iterator": "BASE64_CHARS",
"entropy": 4.632263329852917,
"valid": true
}
}
]
},
{
"api_validation": "NOT_AVAILABLE",
"ml_validation": "NOT_AVAILABLE",
Expand Down Expand Up @@ -8264,6 +8336,30 @@
}
]
},
{
"api_validation": "NOT_AVAILABLE",
"ml_validation": "NOT_AVAILABLE",
"ml_probability": null,
"rule": "Github Token",
"severity": "high",
"line_data_list": [
{
"line": "ghr_Ku7ikDwqD1Ge2u3Wf1UM3z2CLN230RP1I8Vd",
"line_num": 1,
"path": "tests/samples/sample.pdf",
"info": "tests/samples/sample.pdf|PDF:1|RAW",
"value": "ghr_Ku7ikDwqD1Ge2u3Wf1UM3z2CLN230RP1I8Vd",
"value_start": 0,
"value_end": 40,
"variable": null,
"entropy_validation": {
"iterator": "BASE64_CHARS",
"entropy": 4.732263329852917,
"valid": true
}
}
]
},
{
"api_validation": "NOT_AVAILABLE",
"ml_validation": "VALIDATED_KEY",
Expand Down Expand Up @@ -8408,6 +8504,30 @@
}
]
},
{
"api_validation": "NOT_AVAILABLE",
"ml_validation": "NOT_AVAILABLE",
"ml_probability": null,
"rule": "Azure Secret Value",
"severity": "high",
"line_data_list": [
{
"line": "qpF8Q~PCM5MhMoyTFc5TYEomnzRUKim9UJhe8a2P",
"line_num": 1,
"path": "tests/samples/small.pdf",
"info": "tests/samples/small.pdf|PDF:1|RAW",
"value": "qpF8Q~PCM5MhMoyTFc5TYEomnzRUKim9UJhe8a2P",
"value_start": 0,
"value_end": 40,
"variable": null,
"entropy_validation": {
"iterator": "BASE64_CHARS",
"entropy": 4.620007704961091,
"valid": true
}
}
]
},
{
"api_validation": "NOT_AVAILABLE",
"ml_validation": "NOT_AVAILABLE",
Expand Down
Loading