diff --git a/scripts/native_libraries/get_emba_db.py b/scripts/native_libraries/get_emba_db.py
new file mode 100644
index 00000000..8c37b53a
--- /dev/null
+++ b/scripts/native_libraries/get_emba_db.py
@@ -0,0 +1,98 @@
+"""Convert EMBA's bin_version_strings.cfg into the native_lib_patterns JSON database."""
+
+import json
+import os
+import re
+
+import requests
+
+from surfactant.configmanager import ConfigManager
+
+
+def load_database(url):
+    """Download ``url`` and return the response body as text.
+
+    Raises ``requests.HTTPError`` when the server answers with an error status.
+    """
+    # Without a timeout, requests would wait indefinitely on a stalled connection
+    response = requests.get(url, timeout=60)
+    response.raise_for_status()
+    return response.text
+
+
+def parse_cfg_file(content):
+    """Parse the semicolon-separated EMBA config text into a pattern database.
+
+    Returns a dict mapping each library name to a dict with a ``filename`` list
+    and a ``filecontent`` list of regular expressions. Only rows whose second
+    field is empty or ``strict`` are used.
+    """
+    database = {}
+
+    for line in content.splitlines():
+        # Skip comments and the column header row
+        if line.startswith("#") or line.startswith("identifier"):
+            continue
+
+        # Split by semicolons
+        fields = line.strip().split(";")
+
+        # Blank or truncated rows have no pattern field; skip them rather than crash
+        if len(fields) < 4:
+            continue
+
+        # Name of library
+        lib_name = fields[0]
+
+        # Check if it starts with one double quote and ends with two double quotes
+        if fields[3].startswith('"') and fields[3].endswith('""'):
+            filecontent = fields[3][1:-1]
+        elif fields[3].endswith('""'):
+            filecontent = fields[3][:-1]
+        else:
+            filecontent = fields[3].strip('"')
+
+        if fields[1] == "strict":
+            # Strict mode is deprecated so those entries will be matched just by filename
+            entry = database.setdefault(lib_name, {"filename": [], "filecontent": []})
+            if lib_name not in entry["filename"]:
+                entry["filename"].append(lib_name)
+        elif fields[1] == "":
+            # Only keep patterns that compile as bytes regexes
+            try:
+                re.compile(filecontent.encode("utf-8"))
+            except re.error as e:
+                print(f"Error parsing file content regexp {filecontent}: {e}")
+                continue
+            # Empty filename list because EMBA doesn't need filename patterns
+            entry = database.setdefault(lib_name, {"filename": [], "filecontent": []})
+            entry["filecontent"].append(filecontent)
+
+    return database
+
+
+def strip_anchors(pattern):
+    """Return ``pattern`` without a leading ``^`` and without an unescaped trailing ``$``."""
+    if pattern.startswith("^"):
+        pattern = pattern[1:]
+    # A trailing "\$" is an escaped literal dollar sign, not an anchor, so keep it
+    if pattern.endswith("$") and not pattern.endswith("\\$"):
+        pattern = pattern[:-1]
+    return pattern
+
+
+# Use database from this specific commit
+emba_database_url = "https://raw.githubusercontent.com/e-m-b-a/emba/11d6c281189c3a14fc56f243859b0bccccce8b9a/config/bin_version_strings.cfg"
+json_file_path = ConfigManager().get_data_dir_path() / "native_lib_patterns" / "emba.json"
+
+file_content = load_database(emba_database_url)
+
+parsed_data = parse_cfg_file(file_content)
+
+# Both anchors are removed from every pattern; a pattern carrying "^" and "$" loses both
+for value in parsed_data.values():
+    value["filecontent"] = [strip_anchors(pattern) for pattern in value["filecontent"]]
+
+os.makedirs(os.path.dirname(json_file_path), exist_ok=True)
+with open(json_file_path, "w", encoding="utf-8") as json_file:
+    json.dump(parsed_data, json_file, indent=4)
diff --git a/surfactant/infoextractors/native_lib_file.py b/surfactant/infoextractors/native_lib_file.py
new file mode 100644
index 00000000..b25b9e19
--- /dev/null
+++ b/surfactant/infoextractors/native_lib_file.py
@@ -0,0 +1,135 @@
+import functools
+import json
+import os
+import re
+from typing import Any, Dict, List, Optional, Union
+
+from loguru import logger
+
+import surfactant.plugin
+from surfactant.configmanager import ConfigManager
+from surfactant.sbomtypes import SBOM, Software
+
+
+@surfactant.plugin.hookimpl
+def short_name() -> Optional[str]:
+    """Return this plugin's short name."""
+    return "native_lib_patterns"
+
+
+def load_pattern_db():
+    """Load the JSON pattern database from the ``native_lib_patterns`` path.
+
+    Returns the parsed JSON, or ``None`` when the file is missing or is not valid JSON.
+    """
+    try:
+        with open(native_lib_patterns, "r", encoding="utf-8") as regex:
+            return json.load(regex)
+    except FileNotFoundError:
+        logger.warning(f"File not found for native library detection: {native_lib_patterns}")
+    except json.JSONDecodeError as e:
+        # A corrupt database disables detection instead of failing at import time
+        logger.warning(f"Invalid native library pattern file {native_lib_patterns}: {e}")
+    return None
+
+
+# Load the pattern database once at module import
+native_lib_patterns = ConfigManager().get_data_dir_path() / "native_lib_patterns" / "emba.json"
+database = load_pattern_db()
+
+
+def supports_file(filetype) -> bool:
+    """Return True when ``filetype`` is one of the binary formats listed below."""
+    return filetype in ("PE", "ELF", "MACHOFAT", "MACHOFAT64", "MACHO32", "MACHO64")
+
+
+@surfactant.plugin.hookimpl
+def extract_file_info(sbom: SBOM, software: Software, filename: str, filetype: str) -> object:
+    """Return native library info for supported file types, otherwise ``None``."""
+    if not supports_file(filetype):
+        return None
+    return extract_native_lib_info(filename)
+
+
+def extract_native_lib_info(filename):
+    """Match ``filename`` against the pattern database by name and by content.
+
+    Returns ``{"nativeLibraries": [...]}`` holding at most one ``isLibrary`` entry
+    (filename matches) and one ``containsLibrary`` entry (content matches), or
+    ``None`` when the pattern database is missing or empty.
+    """
+    if not database:
+        return None
+
+    native_lib_info: Dict[str, Any] = {"nativeLibraries": []}
+    found_libraries = set()
+    library_names = []
+    contains_library_names = []
+
+    # Match based on filename
+    base_filename = os.path.basename(filename)
+    for match in match_by_attribute("filename", base_filename, database):
+        library_name = match["isLibrary"]
+        if library_name not in found_libraries:
+            library_names.append(library_name)
+            found_libraries.add(library_name)
+
+    # Match based on filecontent
+    try:
+        with open(filename, "rb") as native_file:
+            filecontent = native_file.read()
+    except FileNotFoundError:
+        logger.warning(f"File not found: {filename}")
+    else:
+        # A library already matched by filename is not repeated as a content match
+        for match in match_by_attribute("filecontent", filecontent, database):
+            library_name = match["containsLibrary"]
+            if library_name not in found_libraries:
+                contains_library_names.append(library_name)
+                found_libraries.add(library_name)
+
+    # Create the single entry for isLibrary
+    if library_names:
+        native_lib_info["nativeLibraries"].append({"isLibrary": library_names})
+
+    # Create the single entry for containsLibrary
+    if contains_library_names:
+        native_lib_info["nativeLibraries"].append({"containsLibrary": contains_library_names})
+
+    return native_lib_info
+
+
+@functools.lru_cache(maxsize=None)
+def _compile_pattern(pattern: str) -> Optional[re.Pattern]:
+    """Compile ``pattern`` as a bytes regex, cached; return ``None`` if it is invalid."""
+    try:
+        return re.compile(pattern.encode("utf-8"))
+    except re.error as e:
+        logger.warning(f"Skipping invalid native library pattern {pattern}: {e}")
+        return None
+
+
+def match_by_attribute(
+    attribute: str, content: Union[str, bytes], patterns_database: Dict
+) -> List[Dict]:
+    """Return one match dict per library whose ``attribute`` patterns match ``content``.
+
+    For ``"filename"``, ``content`` is a str compared case-insensitively and matches
+    are ``{"isLibrary": name}``. For ``"filecontent"``, ``content`` is bytes searched
+    with each regex and matches are ``{"containsLibrary": name}``.
+    """
+    libs = []
+    for lib_name, lib_info in patterns_database.items():
+        for pattern in lib_info.get(attribute, []):
+            if attribute == "filename":
+                if pattern.lower() == content.lower():
+                    libs.append({"isLibrary": lib_name})
+                    # One hit per library is enough; callers only use the name
+                    break
+            elif attribute == "filecontent":
+                compiled = _compile_pattern(pattern)
+                if compiled is not None and compiled.search(content):
+                    libs.append({"containsLibrary": lib_name})
+                    # Skip the remaining patterns to avoid rescanning the file
+                    break
+    return libs
diff --git a/surfactant/plugin/manager.py b/surfactant/plugin/manager.py
index 6d83b223..1936ff3b 100644
--- a/surfactant/plugin/manager.py
+++ b/surfactant/plugin/manager.py
@@ -23,6 +23,7 @@ def _register_plugins(pm: pluggy.PluginManager) -> None:
         java_file,
         js_file,
         mach_o_file,
+        native_lib_file,
         ole_file,
         pe_file,
     )
@@ -62,6 +63,7 @@ def _register_plugins(pm: pluggy.PluginManager) -> None:
         cyclonedx_writer,
         spdx_writer,
         cytrics_reader,
+        native_lib_file,
     )
     for plugin in internal_plugins:
         pm.register(plugin)