Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Native library detection plugin #267

Merged
merged 42 commits into from
Dec 17, 2024
Merged
Show file tree
Hide file tree
Changes from 41 commits
Commits
Show all changes
42 commits
Select commit Hold shift + click to select a range
c58782d
created new files for native lib detection
wangmot Jul 25, 2024
2fafa6d
using filetypeid plugin
wangmot Jul 26, 2024
efc1721
using filetypeid plugin
wangmot Jul 26, 2024
3362a88
adding as a new plugin
wangmot Jul 26, 2024
2db9dd3
adding script to parse EMBA database and save as json
wangmot Jul 29, 2024
e8da550
adding script to parse EMBA database and save as json
wangmot Jul 29, 2024
1c6a771
change format of native lib patterns database
wangmot Jul 30, 2024
c546a12
updating parsing of emba db
wangmot Jul 31, 2024
c384649
matching native libs
wangmot Aug 6, 2024
0957c21
testing with more native libs
wangmot Aug 12, 2024
8616fcc
testing with more libraries
wangmot Aug 12, 2024
7635f8e
more general updates
wangmot Aug 12, 2024
ea9be51
matching native libs
wangmot Aug 13, 2024
459886d
dealing with 'strict' mode
wangmot Aug 14, 2024
65236c3
making some updates to matching script
wangmot Aug 14, 2024
f630d70
testing pattern matching with a sample file
wangmot Aug 15, 2024
2134c60
incorporating tar file decompression
wangmot Aug 20, 2024
45654f1
moving tar decompression to new file
wangmot Aug 20, 2024
a6e6a55
handling decompression infoextractor
wangmot Aug 22, 2024
781fc02
adding different compression types
wangmot Aug 24, 2024
3456621
adding different compression types
wangmot Aug 26, 2024
3f76daf
native lib plugin
wangmot Oct 9, 2024
fe86f6a
Remove unnecessary native libraries
wangmot Oct 9, 2024
31d742b
cleaning native lib detection
wangmot Oct 14, 2024
3189b04
deleted sample library
wangmot Oct 14, 2024
747500a
delete example lib
wangmot Oct 14, 2024
4037a7a
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Oct 14, 2024
9ee1835
works now with updated emba DB
wangmot Nov 13, 2024
43f2fdf
make separate entries for isLibrary and containsLibrary
wangmot Nov 25, 2024
8e37513
cleaning up code
wangmot Nov 25, 2024
7d5718d
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Nov 25, 2024
33f7ac8
fixing pylint errors
wangmot Nov 25, 2024
b95d42e
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Nov 25, 2024
c9b9793
fixing precommit errors
wangmot Nov 25, 2024
edc0d64
add short_name hookimpl and move loading databse outside of hook
wangmot Dec 2, 2024
3813f5c
fixing linter errors
wangmot Dec 2, 2024
eead4ed
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Dec 2, 2024
407f1f2
switching file name matching
wangmot Dec 16, 2024
689608d
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Dec 16, 2024
abbe592
updating filename string comparison
wangmot Dec 16, 2024
ef0919f
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Dec 16, 2024
1126d3c
updating var names for clarity
wangmot Dec 17, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
92 changes: 92 additions & 0 deletions scripts/native_libraries/get_emba_db.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
import json
wangmot marked this conversation as resolved.
Show resolved Hide resolved
import os
import re

import requests

wangmot marked this conversation as resolved.
Show resolved Hide resolved
from surfactant.configmanager import ConfigManager


def load_database(url, timeout=30):
    """Download a text resource and return its body.

    Args:
        url: Address of the file to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely on a stalled connection.

    Returns:
        The response body decoded as text.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not respond within ``timeout``.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of parsing an error page as a database.
    response.raise_for_status()
    return response.text


def parse_cfg_file(content):
    """Parse the text of a semicolon-separated EMBA version-string config.

    Each usable line is read as ``name;mode;<unused>;pattern;...``. Lines that
    start with ``#`` or ``identifier`` are ignored, and so are blank lines or
    lines with fewer than four fields.

    Args:
        content: Full text of the config file.

    Returns:
        A dict mapping each library name to
        ``{"filename": [...], "filecontent": [...]}``. Entries whose mode is
        ``strict`` are recorded by filename only; entries with an empty mode
        contribute their regular expression to ``filecontent``. Entries with
        any other mode are ignored.
    """
    database = {}

    for raw_line in content.splitlines():
        # Comment lines and the column-header line carry no patterns.
        if raw_line.startswith(("#", "identifier")):
            continue

        fields = raw_line.strip().split(";")

        # Blank or truncated lines have no pattern column (index 3); skip
        # them instead of failing with IndexError.
        if len(fields) < 4:
            continue

        lib_name, mode, raw_pattern = fields[0], fields[1], fields[3]

        # Strict mode is deprecated so those entries will be matched just by filename
        if mode == "strict":
            database.setdefault(lib_name, {"filename": [lib_name], "filecontent": []})
            continue

        # Only entries with an empty mode contribute content patterns.
        if mode != "":
            continue

        # Remove the quoting around the pattern. A field ending in two double
        # quotes keeps one of them (presumably the inner quote belongs to the
        # regular expression itself).
        if raw_pattern.startswith('"') and raw_pattern.endswith('""'):
            filecontent = raw_pattern[1:-1]
        elif raw_pattern.endswith('""'):
            filecontent = raw_pattern[:-1]
        else:
            filecontent = raw_pattern.strip('"')

        # Keep only patterns that compile as bytes regular expressions.
        try:
            re.compile(filecontent.encode("utf-8"))
        except re.error as e:
            print(f"Error parsing file content regexp {filecontent}: {e}")
            continue

        if lib_name not in database:
            # Empty filename because EMBA doesn't need filename patterns
            database[lib_name] = {"filename": [], "filecontent": [filecontent]}
        else:
            database[lib_name]["filecontent"].append(filecontent)

    return database


# Use database from this specific commit
emba_database_url = "https://raw.githubusercontent.com/e-m-b-a/emba/11d6c281189c3a14fc56f243859b0bccccce8b9a/config/bin_version_strings.cfg"
json_file_path = ConfigManager().get_data_dir_path() / "native_lib_patterns" / "emba.json"


def _strip_regex_anchors(pattern):
    """Return ``pattern`` without a leading ``^`` and an unescaped trailing ``$``.

    A trailing ``\\$`` (a literal dollar sign) is left untouched. Both anchors
    are removed from the same working string, so a pattern such as ``^foo$``
    becomes ``foo`` rather than getting its ``^`` restored by the second step.
    """
    if pattern.startswith("^"):
        pattern = pattern[1:]
    if pattern.endswith("$") and not pattern.endswith("\\$"):
        pattern = pattern[:-1]
    return pattern


file_content = load_database(emba_database_url)

parsed_data = parse_cfg_file(file_content)

# Drop start/end anchors from every content pattern, updating each list in place.
for value in parsed_data.values():
    value["filecontent"][:] = [_strip_regex_anchors(p) for p in value["filecontent"]]

os.makedirs(os.path.dirname(json_file_path), exist_ok=True)
with open(json_file_path, "w", encoding="utf-8") as json_file:
    json.dump(parsed_data, json_file, indent=4)
104 changes: 104 additions & 0 deletions surfactant/infoextractors/native_lib_file.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,104 @@
import json
import os
import re
from typing import Any, Dict, List, Optional

from loguru import logger

import surfactant.plugin
from surfactant.configmanager import ConfigManager
from surfactant.sbomtypes import SBOM, Software


wangmot marked this conversation as resolved.
Show resolved Hide resolved
@surfactant.plugin.hookimpl
def short_name() -> Optional[str]:
    """Return the short identifier this plugin registers under."""
    return "native_lib_patterns"


def load_pattern_db():
    """Load the native library pattern database from ``native_lib_patterns``.

    Returns:
        The parsed JSON content, or ``None`` when the file is missing or
        cannot be decoded. A warning is logged in either failure case so that
        a bad database does not raise while this module is being imported.
    """
    try:
        with open(native_lib_patterns, "r", encoding="utf-8") as regex:
            return json.load(regex)
    except FileNotFoundError:
        logger.warning(f"File not found for native library detection: {native_lib_patterns}")
    except (json.JSONDecodeError, UnicodeDecodeError) as err:
        logger.warning(
            f"Could not parse native library pattern database {native_lib_patterns}: {err}"
        )
    return None


# Load the pattern database once at module import.
# native_lib_patterns must be assigned before load_pattern_db() runs because
# that function reads it as a module-level name. database is None when the
# pattern file could not be loaded.
native_lib_patterns = ConfigManager().get_data_dir_path() / "native_lib_patterns" / "emba.json"
database = load_pattern_db()


def supports_file(filetype) -> bool:
    """Tell whether ``filetype`` is one of the binary formats this plugin scans."""
    handled_types = (
        "PE",
        "ELF",
        "MACHOFAT",
        "MACHOFAT64",
        "MACHO32",
        "MACHO64",
    )
    return filetype in handled_types


@surfactant.plugin.hookimpl
def extract_file_info(sbom: SBOM, software: Software, filename: str, filetype: str) -> object:
    """Hook implementation: collect native library info for supported file types.

    Returns the result of ``extract_native_lib_info(filename)`` when
    ``supports_file(filetype)`` is true, otherwise ``None``. The ``sbom`` and
    ``software`` arguments are not used here.
    """
    if supports_file(filetype):
        return extract_native_lib_info(filename)
    return None


def extract_native_lib_info(filename):
    """Identify native libraries from a file's name and from its contents.

    Returns ``None`` when no pattern database is loaded. Otherwise returns
    ``{"nativeLibraries": [...]}`` holding at most one ``{"isLibrary": [...]}``
    entry (filename matches) followed by at most one
    ``{"containsLibrary": [...]}`` entry (content matches). Each library name is
    reported once; a name already matched by filename is not repeated under
    ``containsLibrary``.
    """
    if not database:
        return None

    seen = set()
    is_library = []
    contains_library = []

    def collect(matches, key, bucket):
        # Append each newly seen library name, preserving match order.
        for entry in matches:
            lib = entry[key]
            if lib not in seen:
                bucket.append(lib)
                seen.add(lib)

    # Match based on filename (directory part removed)
    name_matches = match_by_attribute("filename", os.path.basename(filename), database)
    collect(name_matches, "isLibrary", is_library)

    # Match based on the raw bytes of the file
    try:
        with open(filename, "rb") as binary:
            data = binary.read()
            content_matches = match_by_attribute("filecontent", data, database)
            collect(content_matches, "containsLibrary", contains_library)
    except FileNotFoundError:
        logger.warning(f"File not found: {filename}")

    entries = []
    if is_library:
        entries.append({"isLibrary": is_library})
    if contains_library:
        entries.append({"containsLibrary": contains_library})

    result: Dict[str, Any] = {"nativeLibraries": entries}
    return result


def match_by_attribute(attribute: str, content: str, patterns_database: Dict) -> List[Dict]:
    """Collect the libraries in ``patterns_database`` that match ``content``.

    For ``attribute == "filename"`` the library name is compared to ``content``
    case-insensitively, once per listed filename pattern, and each hit is
    reported as ``{"isLibrary": name}``. For ``attribute == "filecontent"``
    each pattern is UTF-8 encoded and searched in ``content`` (bytes are passed
    for this attribute by the caller in this module), and each hit is reported
    as ``{"containsLibrary": name}``. Any other attribute yields no matches.
    The result may contain the same library more than once.
    """
    hits = []
    for lib_name, entry in patterns_database.items():
        if attribute not in entry:
            continue
        for candidate in entry[attribute]:
            if attribute == "filename":
                key = "isLibrary"
                matched = lib_name.lower() == content.lower()
            elif attribute == "filecontent":
                key = "containsLibrary"
                matched = bool(re.search(candidate.encode("utf-8"), content))
            else:
                continue
            if matched:
                hits.append({key: lib_name})
    return hits
nightlark marked this conversation as resolved.
Show resolved Hide resolved
2 changes: 2 additions & 0 deletions surfactant/plugin/manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ def _register_plugins(pm: pluggy.PluginManager) -> None:
java_file,
js_file,
mach_o_file,
native_lib_file,
ole_file,
pe_file,
)
Expand Down Expand Up @@ -62,6 +63,7 @@ def _register_plugins(pm: pluggy.PluginManager) -> None:
cyclonedx_writer,
spdx_writer,
cytrics_reader,
native_lib_file,
)
for plugin in internal_plugins:
pm.register(plugin)
Expand Down
Loading