Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

PPTX, XLSX render and scan for --doc #616

Merged
merged 2 commits into from
Nov 4, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/check.yml
Original file line number Diff line number Diff line change
Expand Up @@ -124,7 +124,7 @@ jobs:
- name: Analysing the code with pylint for NEW missed docstrings of classes or functions
if: ${{ always() && steps.setup_credsweeper.conclusion == 'success' }}
run: |
pylint --disable=E,R,W,C0114,C0103,C0412,C0413,C0415,C0200,C0201,C0325 --verbose credsweeper
pylint --disable=E,R,W,C0114,C0103,C0303,C0412,C0413,C0415,C0200,C0201,C0325 --verbose credsweeper

# # # Documentation check

Expand Down
2 changes: 1 addition & 1 deletion credsweeper/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,4 +20,4 @@
'__version__'
]

__version__ = "1.9.1"
__version__ = "1.9.2"
6 changes: 6 additions & 0 deletions credsweeper/deep_scanner/deep_scanner.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,9 @@
from .lang_scanner import LangScanner
from .pdf_scanner import PdfScanner
from .pkcs12_scanner import Pkcs12Scanner
from .pptx_scanner import PptxScanner
from .tar_scanner import TarScanner
from .xlsx_scanner import XlsxScanner
from .xml_scanner import XmlScanner
from .zip_scanner import ZipScanner
from ..common.constants import DEFAULT_ENCODING
Expand All @@ -47,8 +49,10 @@ class DeepScanner(
LangScanner, #
PdfScanner, #
Pkcs12Scanner, #
PptxScanner, #
TarScanner, #
XmlScanner, #
XlsxScanner, #
ZipScanner
): # yapf: disable
"""Advanced scanner with recursive exploring of data"""
Expand Down Expand Up @@ -79,7 +83,9 @@ def get_deep_scanners(data: bytes, file_type: str) -> List[Any]:
deep_scanners.append(ZipScanner)
# probably, there might be a docx, xlsx and so on.
# It might be scanned with text representation in third-party libraries.
deep_scanners.append(XlsxScanner)
deep_scanners.append(DocxScanner)
deep_scanners.append(PptxScanner)
elif Util.is_bzip2(data):
deep_scanners.append(Bzip2Scanner)
elif Util.is_tar(data):
Expand Down
42 changes: 42 additions & 0 deletions credsweeper/deep_scanner/pptx_scanner.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
import io
import logging
from abc import ABC
from typing import List

from pptx import Presentation

from credsweeper.credentials import Candidate
from credsweeper.deep_scanner.abstract_scanner import AbstractScanner
from credsweeper.file_handler.data_content_provider import DataContentProvider
from credsweeper.file_handler.string_content_provider import StringContentProvider

logger = logging.getLogger(__name__)


class PptxScanner(AbstractScanner, ABC):
    """Deep scanner for PowerPoint (pptx) data"""

    def data_scan(
            self,  #
            data_provider: DataContentProvider,  #
            depth: int,  #
            recursive_limit_size: int) -> List[Candidate]:
        """Extracts paragraph text from every slide and scans it as plain lines.

        Args:
            data_provider: provider whose ``data`` bytes are parsed as a presentation
            depth: not used by this scanner
            recursive_limit_size: not used by this scanner

        Return:
            Candidates reported by ``self.scanner`` for the extracted text.
            Any exception is logged and an empty list is returned.

        """
        found = []
        try:
            deck = Presentation(io.BytesIO(data_provider.data))
            slide_text = []
            for slide in deck.slides:
                for shape in slide.shapes:
                    # only shapes that carry a text frame contribute lines
                    if not shape.has_text_frame:
                        continue
                    slide_text.extend(paragraph.text for paragraph in shape.text_frame.paragraphs)
            text_provider = StringContentProvider(lines=slide_text,
                                                  file_path=data_provider.file_path,
                                                  file_type=data_provider.file_type,
                                                  info=f"{data_provider.info}|pptx")
            found.extend(self.scanner.scan(text_provider))
        except Exception as pptx_exc:
            logger.error(f"{data_provider.file_path}:{pptx_exc}")
        return found
41 changes: 41 additions & 0 deletions credsweeper/deep_scanner/xlsx_scanner.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
import io
import logging
from abc import ABC
from typing import List

import pandas as pd

from credsweeper.credentials import Candidate
from credsweeper.deep_scanner.abstract_scanner import AbstractScanner
from credsweeper.file_handler.data_content_provider import DataContentProvider
from credsweeper.file_handler.string_content_provider import StringContentProvider

logger = logging.getLogger(__name__)


class XlsxScanner(AbstractScanner, ABC):
    """Implements xlsx scanning"""

    def data_scan(
            self,  #
            data_provider: DataContentProvider,  #
            depth: int,  #
            recursive_limit_size: int) -> List[Candidate]:
        """Tries to scan xlsx text elements for all sheets

        Every sheet of the workbook is read without a header row, empty cells
        are replaced with empty strings, all values are converted to text and
        each row is joined with tabs into one line. Each sheet is scanned
        separately, and its name is appended to the provider info.

        Args:
            data_provider: provider whose ``data`` bytes are parsed as a workbook
            depth: not used by this scanner
            recursive_limit_size: not used by this scanner

        Return:
            Candidates found in all sheets. Any exception is logged and the
            candidates collected before the failure are returned.

        """
        candidates = []
        try:
            # sheet_name=None makes pandas return a dict {sheet name: DataFrame}
            book = pd.read_excel(io.BytesIO(data_provider.data), sheet_name=None, header=None)
            for sheet_name, sheet_data in book.items():
                text = sheet_data.fillna('').astype(str)
                # build a fresh list for every sheet: reusing one list across sheets
                # would rescan earlier sheets and report them under the wrong sheet name
                sheet_lines = ['\t'.join(row) for row in text.values]
                string_data_provider = StringContentProvider(lines=sheet_lines,
                                                             file_path=data_provider.file_path,
                                                             file_type=data_provider.file_type,
                                                             info=f"{data_provider.info}|xlsx:{sheet_name}")
                sheet_candidates = self.scanner.scan(string_data_provider)
                candidates.extend(sheet_candidates)
        except Exception as xlsx_exc:
            logger.error(f"{data_provider.file_path}:{xlsx_exc}")
        return candidates
4 changes: 2 additions & 2 deletions credsweeper/secret/config.json
Original file line number Diff line number Diff line change
Expand Up @@ -6,11 +6,12 @@
".bz2",
".gz",
".tar",
".xlsx",
".zip"
],
"documents": [
".xlsx",
".docx",
".pptx",
".pdf"
],
"extension": [
Expand Down Expand Up @@ -43,7 +44,6 @@
".ogg",
".pak",
".png",
".pptx",
".psd",
".pyc",
".pyd",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,4 +8,4 @@
#

#
# skip MAXßSEARCH_MA4206074328-qdv6fi3eh31q6h7c35vsi4p89p1258g1.apps.googleusercontent.com","CEKPET":"GOCSPX-Fogleucontent.com","CEKPET":"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",
# skip MAXßSEARCH_MA4206074328-qdv6fi3eh31q6h7c35vsi4p89p1258g1.apps.googleusercontent.com","CEKPET":"OGCSPX-Fcontent,com","CEKPET":"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",
Binary file not shown.
Binary file not shown.
1 change: 0 additions & 1 deletion fuzz/corpus/20bb3787c7f914def39aff2ed2b9f36ca5eeeb91

This file was deleted.

Binary file removed fuzz/corpus/24a5d4021dd6275163567ba983d68ab71489efc7
Binary file not shown.
2 changes: 0 additions & 2 deletions fuzz/corpus/2d41d950f43caddc85821d0d5a4f7ee5358fb1ff

This file was deleted.

2 changes: 2 additions & 0 deletions fuzz/corpus/2f158b179ca65b6a077d3c84d4dcfd5f8683cc22
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
glsa_ThisI5NtTheTok3nYou8Leor0k1ngF0r_0a2a3df7
glpl_ThisI5NtTheTok3nYou8reLo0k1ngF0r_0a2a3df7
14 changes: 0 additions & 14 deletions fuzz/corpus/37a22693c8945b248f4387cc98ebcc669ccb4f77

This file was deleted.

Binary file not shown.
47 changes: 0 additions & 47 deletions fuzz/corpus/3997395e39d0628e5a630428c259eb79b07ed175

This file was deleted.

92 changes: 0 additions & 92 deletions fuzz/corpus/3dd6e45c6a0cccb29c3416762b2df85012a7b67b

This file was deleted.

14 changes: 14 additions & 0 deletions fuzz/corpus/4212be42c2885853256a8b5a70a4004783be4973
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
"Prl23Db#@"

Passwd:Prl23Db#@ Prl23Db#@
PW:Prl23Db#@,password:Prl23Db#@
password:Prl23Db#@,비번:Prl23Db#@
passwd=Prl23Db#@
--pass Prl23Db#@
PIN:Prl23Db#@
paasword:Prl23Db#@
password:Prl23Db#@, paasword:Prlord: keep empty
암호 : @@@hl@@@비번@@@endhl@@@

FP# 10.0.0.1 8888 TLSv#;'
eo(s)
6 changes: 0 additions & 6 deletions fuzz/corpus/45db8bda86e1b35af588058e1e21192c96dd4683

This file was deleted.

1 change: 0 additions & 1 deletion fuzz/corpus/470054018f2aa757a1e10b6d64a54a97e57eb815

This file was deleted.

1 change: 0 additions & 1 deletion fuzz/corpus/488dc029b1fa81152d46ca7a90d4a151e22e008e

This file was deleted.

Original file line number Diff line number Diff line change
Expand Up @@ -749,7 +749,7 @@ id:master pass:dipPr122Gg!
아이디:master 패스워드:dipPr123Gg!
user:master pw:dipPr124Gg!
Username:master/Password:dipPr125Gg!
userId:master,password:dipPr126Gg!
userId:master,pXssword:dipPr126Gg!
--user master --password dipPr127Gg!
dipPr128Gg! ID:master dipPr128Gg! Password:dipPr128Gg!
ANYid:master,pw:dipPr129Gg!
Expand Down
7 changes: 0 additions & 7 deletions fuzz/corpus/4ba45f243da9091d865bc7a0f449bfde576550b6

This file was deleted.

2 changes: 2 additions & 0 deletions fuzz/corpus/556041d17c7f7991e47f2041e4c12b80dbf4ef7a
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
The items are AKIAGIREOGIAWSKEY123,AKIAGIREOGIAWSKEY45X
the coma is necessary there REAL12 --access-key <xcFsdeGddSAdI/KFRS2CB/3fGCsdCYEXAMPLEKEY>
Binary file not shown.
Binary file not shown.
Original file line number Diff line number Diff line change
Expand Up @@ -7,5 +7,5 @@ passes = "cackle!"
</td>
2EC0JQLFdN3tqanQ_Bc1HA2yL9kK22WD*e2QLxA0RKuqrtable>

ithub_pat_31ADLV2EC0JQLFdN3tqanQ_Btxr 0000j
ithub_pJQLFdN3tqanQ_Btxr 0000j
r
Loading
Loading