From a6622b6c9c160a8e86c50fe03ec38072c34fb3a8 Mon Sep 17 00:00:00 2001 From: Roman Babenko Date: Thu, 1 Aug 2024 13:41:32 +0300 Subject: [PATCH] binary file detection for latin1 --- credsweeper/utils/util.py | 9 ++++++++- tests/utils/test_util.py | 19 ++++++++++++------- 2 files changed, 20 insertions(+), 8 deletions(-) diff --git a/credsweeper/utils/util.py b/credsweeper/utils/util.py index 87e888a6f..3f51d18d5 100644 --- a/credsweeper/utils/util.py +++ b/credsweeper/utils/util.py @@ -161,7 +161,14 @@ def is_binary(data: bytes) -> bool: return True if b"\0\0" in data: return True - return False + non_ascii_cnt = 0 + for i in data[:MAX_LINE_LENGTH]: + if 0x20 > i and i not in (0x09, 0x0A, 0x0D) or 0x7E < i < 0xA0: + # control byte below space (except tab, line feed, carriage return) or a byte in the 0x7F..0x9F range + non_ascii_cnt += 1 + chunk_len = float(MAX_LINE_LENGTH if MAX_LINE_LENGTH < len(data) else len(data)) + # an experiment on 255217 binary files showed avg = 0.268264 ± 0.168767, so a minimal threshold is chosen + return 0.1 < non_ascii_cnt / chunk_len @staticmethod def read_file(path: Union[str, Path], encodings: Optional[List[str]] = None) -> List[str]: diff --git a/tests/utils/test_util.py b/tests/utils/test_util.py index e46f91dc8..2d67cccde 100644 --- a/tests/utils/test_util.py +++ b/tests/utils/test_util.py @@ -11,7 +11,7 @@ from lxml.etree import XMLSyntaxError from credsweeper.common.constants import Chars, DEFAULT_ENCODING, UTF_8, MAX_LINE_LENGTH, CHUNK_STEP_SIZE, CHUNK_SIZE, \ - OVERLAP_SIZE + OVERLAP_SIZE, LATIN_1, UTF_16 from credsweeper.utils import Util from tests import AZ_DATA, AZ_STRING, SAMPLES_PATH @@ -309,14 +309,19 @@ def test_is_elf_n(self): self.assertFalse(Util.is_elf(data)) def test_is_binary_p(self): - self.assertFalse(Util.is_elf(AZ_STRING.encode("utf_32"))) - self.assertFalse(Util.is_elf(AZ_STRING.encode("utf_32_le"))) - self.assertFalse(Util.is_elf(AZ_STRING.encode("utf_32_be"))) + self.assertTrue(Util.is_binary(b"\0\0\0\0")) + # unsupported encoding + 
self.assertTrue(Util.is_binary(AZ_STRING.encode("utf_32"))) + self.assertTrue(Util.is_binary(AZ_STRING.encode("utf_32_le"))) + self.assertTrue(Util.is_binary(AZ_STRING.encode("utf_32_be"))) + # utf-16 is supported but must be decoded before Util.is_binary() + self.assertTrue(Util.is_binary(AZ_STRING.encode(UTF_16))) + self.assertTrue(Util.is_binary(AZ_STRING.encode("utf_16_le"))) + self.assertTrue(Util.is_binary(AZ_STRING.encode("utf_16_be"))) def test_is_binary_n(self): - self.assertFalse(Util.is_elf(AZ_STRING.encode("utf_16"))) - self.assertFalse(Util.is_elf(AZ_STRING.encode("utf_16_le"))) - self.assertFalse(Util.is_elf(AZ_STRING.encode("utf_16_be"))) + self.assertFalse(Util.is_binary("Üben von Xylophon und Querflöte ist ja zweckmäßig".encode(LATIN_1))) + self.assertFalse(Util.is_binary(b"\x7Ffew unprintable letters\x00")) def test_is_ascii_entropy_validate_p(self): self.assertTrue(Util.is_ascii_entropy_validate(b''))