Skip to content

Commit

Permalink
Use update_db hook for updating patterns database for JS libraries (#298)
Browse files Browse the repository at this point in the history
  • Loading branch information
willis89pr authored Dec 18, 2024
1 parent bff6a2f commit e471c5f
Show file tree
Hide file tree
Showing 5 changed files with 91 additions and 603 deletions.
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@ dependencies = [
"loguru==0.7.*",
"flask==3.*",
"tomlkit==0.13.*",
"requests>=2.32.3",
]
dynamic = ["version"]

Expand Down
41 changes: 0 additions & 41 deletions scripts/js_libraries/get_retirejs_db.py

This file was deleted.

6 changes: 4 additions & 2 deletions scripts/js_libraries/match_javascript.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@

import requests

from surfactant.configmanager import ConfigManager


def get_test_file():
url = "https://cdnjs.cloudflare.com/ajax/libs/select2/3.5.4/select2.min.js"
Expand Down Expand Up @@ -32,8 +34,8 @@ def find_js_match(expressions: dict, filename: str) -> str:


get_test_file()

with open("js_library_patterns.json", "r") as f:
json_file_path = ConfigManager().get_data_dir_path() / "infoextractors" / "js_library_patterns.json"
with open(json_file_path, "r") as f:
patterns = json.load(f)

library_name = find_js_match(patterns, "testFile.js")
Expand Down
96 changes: 86 additions & 10 deletions surfactant/infoextractors/js_file.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,13 +3,15 @@
#
# SPDX-License-Identifier: MIT
import json
import pathlib
import re
from typing import Any, Dict, List

import click
import requests
from loguru import logger

import surfactant.plugin
from surfactant.configmanager import ConfigManager
from surfactant.sbomtypes import SBOM, Software


Expand All @@ -26,18 +28,12 @@ def extract_file_info(sbom: SBOM, software: Software, filename: str, filetype: s

def extract_js_info(filename: str) -> object:
js_info: Dict[str, Any] = {"jsLibraries": []}
js_lib_file = pathlib.Path(__file__).parent / "js_library_patterns.json"

# Load expressions from retire.js, should move this file elsewhere
try:
with open(js_lib_file, "r") as regex:
database = json.load(regex)
except FileNotFoundError:
logger.warning(f"File not found: {js_lib_file}")
if js_lib_database is None:
return None

# Try to match file name
libs = match_by_attribute("filename", filename, database)
libs = match_by_attribute("filename", filename, js_lib_database)
if len(libs) > 0:
js_info["jsLibraries"] = libs
return js_info
Expand All @@ -46,7 +42,7 @@ def extract_js_info(filename: str) -> object:
try:
with open(filename, "r") as js_file:
filecontent = js_file.read()
libs = match_by_attribute("filecontent", filecontent, database)
libs = match_by_attribute("filecontent", filecontent, js_lib_database)
js_info["jsLibraries"] = libs
except FileNotFoundError:
logger.warning(f"File not found: {filename}")
Expand All @@ -67,3 +63,83 @@ def match_by_attribute(attribute: str, content: str, database: Dict) -> List[Dic
# skip remaining patterns, move on to the next library
break
return libs


def download_database() -> dict:
    """Download the retire.js repository JSON and return it parsed.

    Progress and failure messages are reported with ``click.echo``.

    Returns:
        The decoded JSON document when the server answers with HTTP 200.
        ``None`` when the request fails (connection error, timeout, ...) or
        the server answers with any other status code.
    """
    url = "https://raw.githubusercontent.com/RetireJS/retire.js/master/repository/jsrepository-master.json"
    try:
        # requests never times out by default; a stalled connection would
        # otherwise block the update command indefinitely.
        response = requests.get(url, timeout=30)
    except requests.RequestException as err:
        # Treat transport-level failures like any other unsuccessful fetch.
        click.echo(f"An error occurred: {err}")
        return None

    if response.status_code == 200:
        click.echo("Request successful!")
        return json.loads(response.text)

    if response.status_code == 404:
        click.echo("Resource not found.")
    else:
        click.echo("An error occurred.")

    return None


def strip_irrelevant_data(retirejs_db: dict) -> dict:
    """Reduce a retire.js repository dict to its detection patterns.

    Only libraries that have an "extractors" section are kept. For each of
    them, the "filename", "filecontent" and "hashes" extractor entries (when
    present) are copied as lists, with the retire.js version placeholder in
    every item replaced by a regular expression matching a dotted version
    number.

    Args:
        retirejs_db: Mapping of library name to its retire.js entry.

    Returns:
        Mapping of library name to a dict of the kept extractor lists.
    """
    placeholder = "\u00a7\u00a7version\u00a7\u00a7"
    version_pattern = r"\d+(?:\.\d+)*"
    kept_keys = ("filename", "filecontent", "hashes")

    stripped = {}
    for name, details in retirejs_db.items():
        if "extractors" not in details:
            continue
        extractors = details["extractors"]
        # Libraries with an empty "extractors" section still get an (empty) entry.
        stripped[name] = {
            key: [item.replace(placeholder, version_pattern) for item in extractors[key]]
            for key in kept_keys
            if key in extractors
        }
    return stripped


@surfactant.plugin.hookimpl
def update_db():
    """Refresh the local JavaScript library pattern database from retire.js.

    Retrieves the javascript library CVE database used by retire.js
    (https://github.com/RetireJS/retire.js/blob/master/repository/jsrepository-master.json)
    and only keeps the contents under each library's "extractors" section,
    which contains file hashes and regexes relevant for detecting a specific
    javascript library by its file name or contents. The resulting smaller
    json is written to ``js_library_patterns.json`` inside the
    ``infoextractors`` folder of the data directory reported by
    ``ConfigManager``; that file is read later on to perform the checks.

    Returns:
        "Update complete." when the file was written, or
        "No update occurred." when the download produced no data.
    """
    retirejs = download_database()
    if retirejs is None:
        return "No update occurred."

    cleaned = strip_irrelevant_data(retirejs)
    # Build the destination directory once and reuse it for the file path.
    db_dir = ConfigManager().get_data_dir_path() / "infoextractors"
    db_dir.mkdir(parents=True, exist_ok=True)
    with open(db_dir / "js_library_patterns.json", "w") as f:
        json.dump(cleaned, f, indent=4)
    return "Update complete."


@surfactant.plugin.hookimpl
def short_name():
    """Return this plugin's short name, "js_file"."""
    # NOTE(review): presumably the name used in
    # `surfactant plugin update-db js_file` -- confirm against the plugin CLI.
    return "js_file"


def load_db():
    """Load the JavaScript library pattern database from the data directory.

    The file read is ``js_library_patterns.json`` in the ``infoextractors``
    folder of the data directory reported by ``ConfigManager``.

    Returns:
        The parsed JSON contents, or ``None`` (after logging a warning) when
        the file does not exist or does not contain valid JSON.
    """
    js_lib_file = (
        ConfigManager().get_data_dir_path() / "infoextractors" / "js_library_patterns.json"
    )

    try:
        with open(js_lib_file, "r") as regex:
            return json.load(regex)
    except FileNotFoundError:
        logger.warning(
            "Javascript library pattern database could not be loaded. Run `surfactant plugin update-db js_file` to fetch the pattern database."
        )
    except json.JSONDecodeError as err:
        # This function is called at module import time; a corrupt file must
        # degrade to "no database" instead of breaking the import.
        logger.warning(
            f"Javascript library pattern database {js_lib_file} is not valid JSON ({err}). Run `surfactant plugin update-db js_file` to fetch the pattern database."
        )
    return None


# Loaded once at import time and shared by extract_js_info; this is None when
# load_db could not load the pattern file, in which case extract_js_info
# returns None without attempting any matching.
js_lib_database = load_db()
Loading

0 comments on commit e471c5f

Please sign in to comment.