From adfdfc78936faaff75aa97357fc8febd6d4b6568 Mon Sep 17 00:00:00 2001 From: Zion Leonahenahe Basque Date: Tue, 31 Oct 2023 15:19:38 -0700 Subject: [PATCH] Remove builtin Joern, depend on PyJoern (#2) * Remove builtin Joern, depend on PyJoern * Update container for new pyjoern * update setup * Fix setup again * update java version * fix counting bug --- .github/workflows/release.yml | 40 ++ .gitignore | 1 - Dockerfile | 4 +- sailreval/__init__.py | 2 +- sailreval/analysis/counting.py | 5 +- sailreval/analysis/measure.py | 8 +- sailreval/joern/__init__.py | 16 - sailreval/joern/bin/.keep | 0 sailreval/joern/cfg/__init__.py | 0 sailreval/joern/cfg/cfged.py | 520 -------------- sailreval/joern/cfg/ged.py | 129 ---- sailreval/joern/cfg/graph_region.py | 333 --------- sailreval/joern/cfg/jil/__init__.py | 0 sailreval/joern/cfg/jil/block.py | 103 --- sailreval/joern/cfg/jil/lifter.py | 155 ---- sailreval/joern/cfg/jil/statement.py | 208 ------ sailreval/joern/cfg/region_identifier.py | 868 ----------------------- sailreval/joern/cfg/utils.py | 233 ------ sailreval/joern/client.py | 254 ------- sailreval/joern/server.py | 86 --- sailreval/metrics/ged_to_source.py | 18 +- sailreval/utils/compile.py | 5 +- scripts/run_cfged_on_file.py | 13 +- setup.cfg | 7 +- setup.py | 69 +- setup.sh | 6 +- tests/test_cfged.py | 12 +- tests/test_readability.py | 6 +- 28 files changed, 79 insertions(+), 3022 deletions(-) create mode 100644 .github/workflows/release.yml delete mode 100755 sailreval/joern/__init__.py delete mode 100644 sailreval/joern/bin/.keep delete mode 100644 sailreval/joern/cfg/__init__.py delete mode 100644 sailreval/joern/cfg/cfged.py delete mode 100644 sailreval/joern/cfg/ged.py delete mode 100644 sailreval/joern/cfg/graph_region.py delete mode 100644 sailreval/joern/cfg/jil/__init__.py delete mode 100644 sailreval/joern/cfg/jil/block.py delete mode 100644 sailreval/joern/cfg/jil/lifter.py delete mode 100644 sailreval/joern/cfg/jil/statement.py delete mode 100644 sailreval/joern/cfg/region_identifier.py delete mode 100644 sailreval/joern/cfg/utils.py delete mode 100755 sailreval/joern/client.py delete mode 100755 sailreval/joern/server.py diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml new file mode 100644 index 0000000..f21b385 --- /dev/null +++ b/.github/workflows/release.yml @@ -0,0 +1,40 @@ +name: Release + +on: + push: + tags: + - "v**" + +jobs: + + release-github: + name: Create Github Release + permissions: write-all + runs-on: ubuntu-latest + steps: + - name: Checkout code + uses: actions/checkout@v2 + - name: Create Release + uses: ncipollo/release-action@v1 + with: + generateReleaseNotes: true + + release-pypi: + name: Release pypi package + runs-on: ubuntu-latest + steps: + - name: Checkout source + uses: actions/checkout@v2 + - name: Setup Python + uses: actions/setup-python@v2 + with: + python-version: '3.8' + - name: Install build + run: pip install build + - name: Build dists + run: python -m build + - name: Release to PyPI + uses: pypa/gh-action-pypi-publish@27b31702a0e7fc50959f5ad993c78deac1bdfc29 + with: + user: __token__ + password: ${{ secrets.PYPI_API_TOKEN }} diff --git a/.gitignore b/.gitignore index d3f3736..d8de52d 100755 --- a/.gitignore +++ b/.gitignore @@ -6,7 +6,6 @@ testing/ /final_results/* tests/cfged/angr_sailr* tests/cfged/*.dwarf.linemaps -sailreval/joern/bin/joern-cli/* # Byte-compiled / optimized / DLL files __pycache__/ diff --git a/Dockerfile b/Dockerfile index 9a5f488..c874386 100644 --- a/Dockerfile +++ b/Dockerfile @@ -27,8 
+27,8 @@ RUN apt-get update && apt-get -o APT::Immediate-Configure=0 install -y \ COPY ./sailreval /SAILR/sailreval COPY ./*.py /SAILR/ COPY ./setup.cfg /SAILR/ -# also installs joern -RUN pip3 install -e ./SAILR +# also inits the pyjoern project to avoid later download in init +RUN pip3 install -e ./SAILR && pyjoern --install # # diff --git a/sailreval/__init__.py b/sailreval/__init__.py index 030aad8..b5a858c 100755 --- a/sailreval/__init__.py +++ b/sailreval/__init__.py @@ -1,4 +1,4 @@ -__version__ = "0.0.0" +__version__ = "1.0.0" # create loggers import logging diff --git a/sailreval/analysis/counting.py b/sailreval/analysis/counting.py index a9655d9..19e5d13 100644 --- a/sailreval/analysis/counting.py +++ b/sailreval/analysis/counting.py @@ -1,6 +1,7 @@ import itertools import os import sys +from copy import deepcopy from pathlib import Path from collections import defaultdict from typing import List, Dict, Tuple, Set @@ -12,7 +13,7 @@ from sailreval.utils import load_tomls_by_bin_name, bcolors from sailreval.utils.sailr_target import SAILRTarget from sailreval.utils.compile import DEFAULT_OPTIMIZATION_LEVELS, OPTIMIZATION_LEVELS -from sailreval.joern import JoernServer, JoernClient +from pyjoern import JoernServer, JoernClient from tqdm import tqdm import toml @@ -529,7 +530,7 @@ def summarize_sailr_targets( toml_dirs, decompilers, metrics, unique_funcs=unique_funcs, min_func_size=min_func_size, max_func_size=max_func_size, ) - full_summary_doc += save_data_as_markdown_table(_summary, show_stats=show_stats) + full_summary_doc += save_data_as_markdown_table(deepcopy(_summary), show_stats=show_stats) #pkg_name = list(_summary.keys())[0] for name_with_opt in _summary: for opt_level in opt_levels: diff --git a/sailreval/analysis/measure.py b/sailreval/analysis/measure.py index 9d9a529..28fe995 100644 --- a/sailreval/analysis/measure.py +++ b/sailreval/analysis/measure.py @@ -10,10 +10,10 @@ from typing import List import toml +from pyjoern import JoernClient, JoernServer, fast_cfgs_from_source +from pyjoern.mapping import cfg_root_node, correct_source_cfg_addrs from sailreval import ALL_DECOMPILERS, ALL_METRICS, SAILR_DECOMPILERS, SAILR_METRICS, JOERNLESS_SERVER_METRICS -from sailreval.joern import JoernClient, JoernServer -from sailreval.joern.cfg.utils import cfgs_from_source, correct_source_cfg_addrs from sailreval.metrics import get_metric_function, POST_METRICS from sailreval.metrics.ged_to_source import has_cfged_required_files, has_cfged_required_src_files from sailreval.utils import bcolors, SAILR_DECOMPILATION_RESULTS_DIR, timeout, SAILR_MEASURE_RESULTS_DIR, WorkDirContext @@ -192,7 +192,7 @@ def measure_files(file_dir: Path, basename: str, decompilers=None, metrics=None, if require_cfgs: if has_cfged_required_src_files(tfile, target_binary): linemaps_path = tfile.with_suffix(".linemaps") - extracted_cfgs = cfgs_from_source(tfile.absolute()) + extracted_cfgs = fast_cfgs_from_source(tfile.absolute()) if extracted_cfgs: source_cfgs = correct_source_cfg_addrs( extracted_cfgs, @@ -227,7 +227,7 @@ def measure_files(file_dir: Path, basename: str, decompilers=None, metrics=None, # extract cfgs if needed if require_cfgs and has_cfged_required_files(target_file.absolute()): try: - dec_cfgs = cfgs_from_source(target_file.absolute()) if dec_name != "source" else source_cfgs + dec_cfgs = fast_cfgs_from_source(target_file.absolute()) if dec_name != "source" else source_cfgs except Exception: dec_cfgs = {} diff --git a/sailreval/joern/__init__.py b/sailreval/joern/__init__.py deleted file 
mode 100755
index 926be34..0000000
--- a/sailreval/joern/__init__.py
+++ /dev/null
@@ -1,16 +0,0 @@
-from pathlib import Path
-import logging
-
-from .client import JoernClient
-from .server import JoernServer
-
-_l = logging.getLogger(__name__)
-
-JOERN_BIN_DIR_PATH = Path(Path(__file__).parent / "bin" / "joern-cli").absolute()
-JOERN_SERVER_PATH = JOERN_BIN_DIR_PATH / "joern"
-JOERN_EXPORT_PATH = JOERN_BIN_DIR_PATH / "joern-export"
-JOERN_PARSE_PATH = JOERN_BIN_DIR_PATH / "joern-parse"
-
-if not JOERN_BIN_DIR_PATH.exists():
-    raise FileNotFoundError(f"Joern bin directory not found at {JOERN_BIN_DIR_PATH}, please reinstall!")
-
diff --git a/sailreval/joern/bin/.keep b/sailreval/joern/bin/.keep
deleted file mode 100644
index e69de29..0000000
diff --git a/sailreval/joern/cfg/__init__.py b/sailreval/joern/cfg/__init__.py
deleted file mode 100644
index e69de29..0000000
diff --git a/sailreval/joern/cfg/cfged.py b/sailreval/joern/cfg/cfged.py
deleted file mode 100644
index eceb9ee..0000000
--- a/sailreval/joern/cfg/cfged.py
+++ /dev/null
@@ -1,520 +0,0 @@
-import itertools
-from collections import defaultdict
-from pathlib import Path
-from typing import Dict, Union, List, Set, Optional
-import logging
-
-import networkx as nx
-
-from .region_identifier import RegionIdentifier
-from .graph_region import GraphRegion
-from .ged import graph_edit_distance_core_analysis, MAX_NODES_FOR_EXACT_GED
-from .utils import addr_to_node_map, save_as_png
-from .jil.block import Block, make_merge_block, MergedRegionStart
-from ...utils.binary_debug_info import gen_dwarf_addr_to_line_map, read_line_maps
-from ...utils import bcolors, timeout
-
-l = logging.getLogger(__name__)
-_DEBUG = False
-if _DEBUG:
-    l.setLevel(logging.DEBUG)
-
-
-def _ged_upperbound_approx(dec_cfg, src_cfg, with_timeout=3):
-    try:
-        with timeout(seconds=with_timeout):
-            score = next(nx.optimize_graph_edit_distance(dec_cfg, src_cfg))
-    except TimeoutError:
-        score = None
-
-    return score
-
-
-def find_dst_mismatches(
-    dec_edges, src_edges,
-    dec_line_to_addr_map: Dict, src_addr_to_line_map: Dict,
-):
-    """
-    In this algorithm, we always assume the decompiler graph is the one we are transforming into the source
-    graph. The basic idea is that after we've completed a region collapse, we want to find the edges that were
-    not included in the score computation.
-
-    Conceptually, every edge that _enters_ the region can only enter at one place: the head. This means that when
-    the region is collapsed, all those edges still go to the right place (since both graphs
-    pointed to the head).
-
-    However, the outgoing edges of the region can go from any node in the region to the outside. This means that
-    when we collapse the region we lose info. Here is an example of turning G1 into G2:
-    G1:
-    A -> B -> C
-    |         ^
-    ----------|
-
-    G2:
-    A -> B -> C
-    |         ^
-    ---> D ---|
-
-    The region GED is 2. After the collapse, we have the following:
-    G1:
-    R1 -> C
-
-    G2:
-    R1 -> C
-
-    The final CFGED score of these graphs would be 2, but the real GED is 4.
-    The edits needed for G1 -> G2 are: [del: (A, C), add: (D), add: (D, C), add: (A, D)]
-
-    When we collapse the region, we lose the information that (D, C), not (A, C), was the edge that was
-    supposed to exist. Knowing that would mean we need to del: (A, C) and then add: (D, C), worth 2 points.
-
-    This leads to how we finally compute this:
-    1. Collect all the edges leaving both graphs' regions
-    2. For each edge, find the corresponding edge in the other graph by its src node
-    3. Remove them from the mismatch list
-    4. Sum the number of remaining edges; that is the mismatch score
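-
-    A worked sketch of steps 1-4 on the G1/G2 example above (hypothetical and
-    simplified: bare edge tuples stand in for the real Block-to-Block edges):
-
-        g1_out = {("A", "C")}  # step 1: edges leaving G1's collapsed region
-        g2_out = {("D", "C")}  # step 1: edges leaving G2's collapsed region
-        # steps 2-3: ("A", "C") and ("D", "C") have different src nodes,
-        # so neither edge is matched and removed from the mismatch list
-        mismatch = len(g1_out) + len(g2_out)  # step 4: == 2, the edits lost in the collapse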
-    """
-    dec_edges = dec_edges["out"].copy()
-    src_edges = src_edges["out"].copy()
-    matches = set()
-    already_matched_src_edge = set()
-    for dec_src, dec_dst in dec_edges:
-        dec_src_addrs = dec_line_to_addr_map.get(dec_src.addr, set())
-        if not dec_src_addrs:
-            continue
-
-        src_src_addrs = set(itertools.chain.from_iterable([src_addr_to_line_map.get(addr, set()) for addr in dec_src_addrs]))
-        if not src_src_addrs:
-            continue
-
-        for src_src, src_dst in src_edges:
-            if (src_src, src_dst) in already_matched_src_edge:
-                continue
-
-            if src_src.addr in src_src_addrs:
-                already_matched_src_edge.add((src_src, src_dst))
-                matches.add((dec_src, dec_dst))
-                matches.add((src_src, src_dst))
-                break
-
-    unmatched_dec_edges = list(filter(lambda x: x not in matches, dec_edges))
-    unmatched_src_edges = list(filter(lambda x: x not in matches, src_edges))
-    total_edits = len(unmatched_dec_edges) + len(unmatched_src_edges)
-
-    return total_edits
-
-
-def destroy_old_region(cfg: nx.DiGraph, expanded_region_graph: nx.DiGraph, r_head: Block):
-    extra_edges = defaultdict(list)
-    r_nodes = list(expanded_region_graph.nodes)
-    if r_head not in r_nodes:
-        r_nodes.append(r_head)
-
-    r_preds = list()
-    r_succs = list()
-
-    for r_node in r_nodes:
-        for suc in cfg.successors(r_node):
-            if suc not in r_nodes:
-                r_succs.append(suc)
-                extra_edges["out"].append((r_node, suc))
-        for pred in cfg.predecessors(r_node):
-            if pred not in r_nodes:
-                r_preds.append(pred)
-                extra_edges["in"].append((pred, r_node))
-
-    cfg.remove_nodes_from(r_nodes)
-    merged_node = make_merge_block(r_head.addr, r_nodes)
-    cfg.add_node(merged_node, node=merged_node)
-    for pred in r_preds:
-        cfg.add_edge(pred, merged_node, src=pred, dst=merged_node)
-
-    for suc in r_succs:
-        cfg.add_edge(merged_node, suc, src=merged_node, dst=suc)
-
-    return extra_edges
-
-
-def expand_region_to_block_graph(region: GraphRegion, graph: nx.DiGraph):
-    def _expand_region_to_blocks(_region: GraphRegion):
-        all_nodes = list()
-        for node in _region.graph.nodes:
-            if isinstance(node, Block):
-                all_nodes.append(node)
-            elif isinstance(node, GraphRegion):
-                all_nodes += _expand_region_to_blocks(node)
-
-        return all_nodes
-
-    region_blocks = _expand_region_to_blocks(region)
-    return nx.subgraph(graph, region_blocks)
-
-
-def expand_region_head_to_block(region: GraphRegion):
-    region_head = region.head
-    if isinstance(region_head, Block):
-        return region_head
-
-    if isinstance(region_head, GraphRegion):
-        return expand_region_head_to_block(region_head)
-
-    raise ValueError(f"Invalid region head type {type(region_head)}")
-
-def is_only_blocks(region: GraphRegion):
-    for node in region.graph.nodes:
-        if isinstance(node, Block):
-            continue
-        if isinstance(node, GraphRegion):
-            return False
-
-    return True
-
-
-def find_containing_block_addrs(graph: nx.DiGraph, lines: Set):
-    containing_addrs = set()
-    line_has_container = set()
-    graph_nodes = list(graph.nodes)
-    for node in graph_nodes:
-        for line in lines:
-            if node.contains_addr(line):
-                line_has_container.add(line)
-                containing_addrs.add(node.addr)
-
-    for line in lines:
-        if line in line_has_container:
-            continue
-
-        closest_node = min(graph_nodes, key=lambda x: x.addr)
-        for node in graph_nodes:
-            if line >= node.addr >= closest_node.addr:
-                closest_node = node
-
-        containing_addrs.add(closest_node.addr)
-    return containing_addrs
-
-
-def find_matching_regions_with_lines(region: GraphRegion, lines: Set):
-    if region.head.addr 
in lines: - yield region - - for node in region.graph.nodes: - if isinstance(node, Block): - continue - if isinstance(node, GraphRegion): - yield from find_matching_regions_with_lines(node, lines) - - -def dfs_region_for_parent(region: GraphRegion, child: Block): - for node in region.graph.nodes: - if isinstance(node, GraphRegion): - if node.head.addr == child.addr: - yield region - - yield from dfs_region_for_parent(node, child) - - -def dfs_region_for_leafs(region: GraphRegion): - has_block = False - only_blocks = True - for node in region.graph.nodes: - if isinstance(node, GraphRegion): - yield from dfs_region_for_leafs(node) - only_blocks = False - elif isinstance(node, Block): - has_block = True - - if only_blocks and has_block: - yield region - - -def find_some_leaf_region(region: Union[Block, GraphRegion], node_blacklist, og_cfg: nx.DiGraph) -> Optional[Union[GraphRegion, Block]]: - # sanity check - if isinstance(region, Block): - return region - elif not isinstance(region, GraphRegion): - return None - - leaf_regions = list(dfs_region_for_leafs(region)) - if not leaf_regions: - return None - - # find a region we did not blacklist - leaf_regions = sorted(leaf_regions, key=lambda x: x.head.addr, reverse=True) - for leaf_region in leaf_regions: - if leaf_region.head.addr not in node_blacklist: - return leaf_region - - # if we are all out of non blacklisted regions, let's try to find a parent of each leaf - #for leaf_region in leaf_regions: - # head, graph = leaf_region - # parents = list(dfs_region_for_parent(region, head)) - # if not parents: - # continue - - # parents = sorted(parents, key=lambda x: x.head.addr) - # for parent in parents: - # if parent.head.addr in node_blacklist: - # continue - - # expanded_parent = expand_region_to_block_graph(parent, og_cfg) - # return parent.head, expanded_parent - - return None - - -def cfg_edit_distance( - dec_cfg: nx.DiGraph, - src_cfg: nx.DiGraph, - dec_line_to_addr_map: Dict, - src_addr_to_line_map: Dict, - max_region_collapse=200, - max_region_estimates=3, - check_upperbound_approx=True, -): - """ - src_addr_to_line_map[address] = set(line1, line2, ...) - dec_line_to_addr_map[line_num] = set(addr1, addr2, ...) - - :param src_cfg: - :param dec_cfg: - :param src_addr_to_line_map: - :param dec_line_to_addr_map: - :param max_region_collapse: - :return: - """ - src_cfg, dec_cfg = nx.DiGraph(src_cfg), nx.DiGraph(dec_cfg) - cfged_score = 0 - curr_region_collapse = 0 - curr_region_estimates = 0 - redo_structuring = True - unable_to_approx = False - region_blacklist = set() - - #if len(src_cfg.nodes) <= MAX_EXACT_GED_SIZE and len(dec_cfg.nodes) <= MAX_EXACT_GED_SIZE: - # l.debug(f"Graph small enough for exact! Running it...") - # return graph_edit_distance(dec_cfg, src_cfg, with_timeout=10, max_on_timeout=False) - - # the max possible CFGED score for any two graphs - max_cfged_score = len(src_cfg.nodes) + len(dec_cfg.nodes) + len(src_cfg.edges) + len(dec_cfg.edges) - upperbound = max_cfged_score - if check_upperbound_approx: - approx = _ged_upperbound_approx(dec_cfg, src_cfg) - if approx is not None: - upperbound = approx - - while True: - if unable_to_approx or curr_region_collapse >= max_region_collapse: - if unable_to_approx: - l.debug(f"Unable to approximate the rest of the graph. Exiting early with max score!") - else: - l.debug(f"Max region collapse limit hit. 
Exiting early with max score!") - return cfged_score + len(src_cfg.nodes) + len(dec_cfg.nodes) + len(src_cfg.edges) + len(dec_cfg.edges) - - l.debug(f"Running region collapse round {curr_region_collapse}...") - # supergraph it! - if redo_structuring: - #src_cfg, dec_cfg = to_jil_supergraph(src_cfg), to_jil_supergraph(dec_cfg) - src_nodes, dec_nodes = addr_to_node_map(src_cfg), addr_to_node_map(dec_cfg) - l.debug(f"Decompiler nodes: {len(dec_cfg.nodes)}") - l.debug(f"Source nodes: {len(src_cfg.nodes)}") - - # compute the regions - src_regions, dec_regions = RegionIdentifier(src_cfg).region, RegionIdentifier(dec_cfg).region - - if _DEBUG: - save_as_png(src_cfg, Path(f"./src_cfg_{curr_region_collapse}.png")) - save_as_png(dec_cfg, Path(f"./dec_cfg_{curr_region_collapse}.png")) - - # Now that you have regions we want to iteratively enter the lower region we can find on one graph - # then find that same region on the other graph, then do a graph edit distance. In this case - # we start with the src cfg - dec_region = find_some_leaf_region(dec_regions, region_blacklist, dec_cfg) - if dec_region is None: - dec_r_head, dec_r_cfg = None, None - elif isinstance(dec_region, Block): - dec_r_head, dec_r_cfg = dec_region, nx.DiGraph() - dec_r_cfg.add_node(dec_region) - else: - dec_r_head, dec_r_cfg = dec_region.head, dec_region.graph - - if dec_r_head is None: - l.debug(f"We are unable to match anymore small region heads, which means we must now approximate the rest.") - score = graph_edit_distance_core_analysis(dec_cfg, src_cfg, with_timeout=8) - if score is None: - unable_to_approx = True - continue - - cfged_score += score - break - - # map the decompilation line to an address (reported by the decompiler - addrs = dec_line_to_addr_map.get(dec_r_head.addr, None) - if not addrs: - # quick check if you can find the line in the addr map in a proximity - up_addrs = dec_line_to_addr_map.get(dec_r_head.addr - 1, None) - down_addrs = dec_line_to_addr_map.get(dec_r_head.addr + 1, None) - if not up_addrs and not down_addrs: - l.debug(f"Unable to find any line-addr map for region head {dec_r_head.addr}! Skipping...") - region_blacklist.add(dec_r_head.addr) - continue - - addrs = up_addrs if up_addrs else down_addrs - - all_lines = set( - itertools.chain.from_iterable( - [src_addr_to_line_map[addr] for addr in addrs if addr in src_addr_to_line_map] - ) - ) - lines_in_src = set(filter(lambda x: x in src_nodes, all_lines)) - - # We were unable to find a real region start, probably because a node got consumed - if not lines_in_src: - lines_in_src = find_containing_block_addrs(src_cfg, all_lines) - if not lines_in_src and addrs: - # If we still have nothing, walk backwards! 
- for i in range(1, 0x14): - all_lines = set( - itertools.chain.from_iterable( - [src_addr_to_line_map[addr-i] for addr in addrs if addr-i in src_addr_to_line_map] - ) - ) - lines_in_src = set(filter(lambda x: x in src_nodes, all_lines)) - if lines_in_src: - break - - - lines_in_src = sorted(lines_in_src) - matching_src_regions = list(find_matching_regions_with_lines(src_regions, lines_in_src)) - if not matching_src_regions: - region_blacklist.add(dec_r_head.addr) - redo_structuring = False - curr_region_collapse += 1 - l.debug(f"Unable to find a pairing region for {dec_r_head.addr}: no src addrs found") - continue - - # - # try to find the base matching region - # - - # gather sizes - matches_by_size = {} - lowest_size = 10000 - for src_region in matching_src_regions: - src_block_region = expand_region_to_block_graph(src_region, src_cfg) - src_r_head = src_region.head - size_diff = abs(len(src_block_region.nodes) - len(dec_r_cfg.nodes)) - matches_by_size[src_r_head] = (src_region, size_diff) - if size_diff < lowest_size: - lowest_size = size_diff - - # filter out the ones that are too big - matching_src_regions = list(filter(lambda x: x[1][1] <= lowest_size, matches_by_size.items())) - if not matching_src_regions: - region_blacklist.add(dec_r_head.addr) - redo_structuring = False - curr_region_collapse += 1 - l.debug(f"Unable to find a pairing region for {dec_r_head.addr}") - continue - - # if we have more than one, tie break with statement size - best_match = None - if len(matching_src_regions) > 1: - smallest_stmt_size = 10000 - for (head, (region, size)) in matching_src_regions: - if not isinstance(head, Block): - continue - - has_merged_region = isinstance(head.statements[0], MergedRegionStart) - if len(head.statements) < smallest_stmt_size: - smallest_stmt_size = len(head.statements) - best_match = region - - if has_merged_region: - best_match = region - break - - if best_match is None: - best_match = matching_src_regions[0][1][0] - - - l.debug(f"Collapsing (Dec, Src) region pair: {(dec_r_head, best_match.head)}") - # - # compute GED of the expanded region - # - - src_r_cfg = expand_region_to_block_graph(best_match, src_cfg) - - dec_r_size, src_r_size = len(dec_r_cfg.nodes), len(src_r_cfg.nodes) - if dec_r_size > MAX_NODES_FOR_EXACT_GED or src_r_size > MAX_NODES_FOR_EXACT_GED: - l.debug(f"Encountered a region too large (dec,src): ({len(dec_r_cfg.nodes), len(src_r_cfg.nodes)} nodes) for an exact score, estimating it...") - size_diff = abs(dec_r_size - src_r_size) - curr_region_collapse += 1 - if size_diff > dec_r_size*2 or size_diff > src_r_size*2: - l.debug(f"Difference in regions size too large to approximate, skipping for now...") - region_blacklist.add(dec_r_head.addr) - redo_structuring = False - continue - - #if curr_region_estimates >= max_region_estimates: - # l.warning(f"Exceeded the max region GED approximizaition limit. 
This function can't be computed.")
-        curr_region_estimates += 1
-
-        distance = graph_edit_distance_core_analysis(dec_r_cfg, src_r_cfg, with_timeout=4)
-        if distance is None:
-            l.debug(f"Unable to compute the GED of the region, skipping...")
-            region_blacklist.add(dec_r_head.addr)
-            redo_structuring = False
-            continue
-
-        cfged_score += distance
-        l.debug(f"Region distance: {distance}")
-
-        #
-        # cleanup, destroy measured nodes, and restart loop
-        #
-
-        extra_src_edges = destroy_old_region(
-            src_cfg,
-            expand_region_to_block_graph(best_match, src_cfg),
-            expand_region_head_to_block(best_match)
-        )
-        extra_dec_edges = destroy_old_region(
-            dec_cfg,
-            expand_region_to_block_graph(dec_region, dec_cfg) if not isinstance(dec_region, Block) else dec_region,
-            expand_region_head_to_block(dec_region) if not isinstance(dec_region, Block) else dec_region
-        )
-        edge_diff = find_dst_mismatches(extra_dec_edges, extra_src_edges, dec_line_to_addr_map, src_addr_to_line_map)
-        if edge_diff:
-            l.debug(f"In/Out Edge Diff: {edge_diff}")
-            cfged_score += edge_diff
-
-        if len(dec_cfg.nodes) <= 1 or len(src_cfg.nodes) <= 1:
-            distance = graph_edit_distance_core_analysis(dec_cfg, src_cfg, with_timeout=10)
-            if distance is None:
-                unable_to_approx = True
-                continue
-
-            cfged_score += distance
-            break
-
-        region_blacklist = set()
-        curr_region_collapse += 1
-        redo_structuring = True
-
-    l.debug(f"{bcolors.WARNING}Final CFGED Score: {bcolors.BOLD}{cfged_score}{bcolors.ENDC}")
-
-    # handle error cases.
-    # this will only make sure scores from CFGED are always right, but these cases should all
-    # be investigated and fixed if encountered.
-    if cfged_score > max_cfged_score:
-        l.critical(f"CFGED score of {cfged_score} is somehow larger than the max possible score! DEBUG THIS!")
-        return max_cfged_score
-    elif check_upperbound_approx and cfged_score > upperbound:
-        l.debug(f"CFGED score {cfged_score} is larger than upperbound {upperbound}, returning upperbound")
-        return upperbound
-    else:
-        return cfged_score
-
-
-
diff --git a/sailreval/joern/cfg/ged.py b/sailreval/joern/cfg/ged.py
deleted file mode 100644
index fcb2604..0000000
--- a/sailreval/joern/cfg/ged.py
+++ /dev/null
@@ -1,129 +0,0 @@
-import re
-from collections import defaultdict
-import logging
-
-import networkx as nx
-
-from .jil.block import Block
-from .jil.statement import (
-    Statement, Assignment, Compare, Call, Nop
-)
-from .utils import find_function_root_node
-from sailreval.utils import timeout
-
-_l = logging.getLogger(__name__)
-MAX_NODES_FOR_EXACT_GED = 10
-INVALID_CHOICE_PENALTY = 100000
-
-#
-# Helpers
-#
-
-
-def _collect_graph_roots(g1, g2):
-    g1_start, g2_start = find_function_root_node(g1), find_function_root_node(g2)
-    if g1_start is not None and g2_start is not None:
-        roots = (g1_start, g2_start,)
-    else:
-        roots = None
-
-    return roots
-
-
-#
-# Edit distance
-#
-
-def ged_max(g1, g2):
-    return len(g1.nodes) + len(g1.edges) + len(g2.nodes) + len(g2.edges)
-
-
-def ged_exact(g1, g2, with_timeout=10):
-    """
-    Computes the exact Graph Edit Distance for two graphs. In the event of a timeout,
-    a score of None is returned.
-    """
-    if len(g1.nodes) > MAX_NODES_FOR_EXACT_GED or len(g2.nodes) > MAX_NODES_FOR_EXACT_GED:
-        return None
-
-    return graph_edit_distance_core_analysis(g1, g2, with_timeout=with_timeout, exact_score=True)
-
-
-def ged_upperbound(g1, g2, with_timeout=5):
-    """
-    Does a single iteration of the GED algorithm and returns the upperbound.
-    Note: this is not the max possible score.
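-
-    An illustrative sketch of what this upperbound means, using networkx
-    directly on hypothetical toy graphs (not part of the original API):
-
-        import networkx as nx
-        g1 = nx.path_graph(3, create_using=nx.DiGraph)  # 0 -> 1 -> 2
-        g2 = nx.path_graph(4, create_using=nx.DiGraph)  # 0 -> 1 -> 2 -> 3
-        # the first yield is a fast upperbound on the true GED;
-        # later yields (not taken here) would refine it downward
-        upperbound = next(nx.optimize_graph_edit_distance(g1, g2))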
- """ - return graph_edit_distance_core_analysis(g1, g2, upperbound_approx=True, with_timeout=with_timeout) - - -def graph_edit_distance_core_analysis( - g1: nx.DiGraph, g2: nx.DiGraph, is_cfg=True, upperbound_approx=False, exact_score=False, with_timeout=10, - penalize_root_exit_edits=True, recover_on_invalid_edits=True -): - roots = _collect_graph_roots(g1, g2) if is_cfg else None - - # edge insertion cost - def _edge_ins_cost(*args): return 1 - # node deletion cost - def _node_del_cost(*args): return 1 - - if is_cfg: - def _edge_ins_cost(*args): - """ - Makes it illegal to add edges to pred of function start or succ of function end - (with exception of self loops) - """ - attrs = args[0] - src = attrs.get('src', None) - dst = attrs.get('dst', None) - if penalize_root_exit_edits: - if src and src.statements: - last_stmt = src.statements[-1] - if isinstance(last_stmt, Nop) and last_stmt.type == Nop.FUNC_END and src is not dst: - return INVALID_CHOICE_PENALTY - elif dst and dst.statements: - first_stmt = dst.statements[0] - if isinstance(first_stmt, Nop) and first_stmt.type == Nop.FUNC_START and dst is not src: - return INVALID_CHOICE_PENALTY - - return 1 - - def _node_del_cost(*args): - """ - Makes it illegal to delete function start nodes or end nodes - """ - node = args[0].get('node', None) - if penalize_root_exit_edits and node and node.statements: - first_stmt = node.statements[0] - last_stmt = node.statements[-1] - if isinstance(first_stmt, Nop) and first_stmt.type == Nop.FUNC_START: - return INVALID_CHOICE_PENALTY - elif isinstance(last_stmt, Nop) and last_stmt.type == Nop.FUNC_END: - return INVALID_CHOICE_PENALTY - - return 1 - - if exact_score or upperbound_approx: - try: - with timeout(seconds=with_timeout): - if upperbound_approx: - dist = next(nx.optimize_graph_edit_distance(g1, g2, node_del_cost=_node_del_cost, edge_ins_cost=_edge_ins_cost)) - else: - dist = nx.graph_edit_distance(g1, g2, roots=roots, node_del_cost=_node_del_cost, edge_ins_cost=_edge_ins_cost) - except TimeoutError: - dist = None - else: - dist = nx.graph_edit_distance( - g1, g2, roots=roots, node_del_cost=_node_del_cost, edge_ins_cost=_edge_ins_cost, timeout=with_timeout - ) - - # sometimes the score can be computed wrong, which we can fix with a recompute ONCE - if dist is not None and dist > INVALID_CHOICE_PENALTY and recover_on_invalid_edits: - dist = graph_edit_distance_core_analysis( - g1, g2, is_cfg=is_cfg, upperbound_approx=upperbound_approx, exact_score=exact_score, - with_timeout=with_timeout, penalize_root_exit_edits=False, recover_on_invalid_edits=False - ) - - return dist - diff --git a/sailreval/joern/cfg/graph_region.py b/sailreval/joern/cfg/graph_region.py deleted file mode 100644 index b3dbd5b..0000000 --- a/sailreval/joern/cfg/graph_region.py +++ /dev/null @@ -1,333 +0,0 @@ -import logging -from typing import Optional, List, Set - -import networkx - -from .jil.block import Block - -l = logging.getLogger(name=__name__) - - -class GraphRegion: - """ - GraphRegion represents a region of nodes. - - :ivar head: The head of the region. - :ivar graph: The region graph. - :ivar successors: A set of successors of nodes in the graph. These successors do not belong to the current - region. - :ivar graph_with_successors: The region graph that includes successor nodes. 
- """ - - __slots__ = ( - "head", - "graph", - "successors", - "graph_with_successors", - "cyclic", - "full_graph", - ) - - def __init__( - self, - head, - graph, - successors: Optional[Set], - graph_with_successors: Optional[networkx.DiGraph], - cyclic, - full_graph: Optional[networkx.DiGraph], - ): - self.head = head - self.graph = graph - self.successors = set(successors) if successors is not None else None - # successors inside graph_with_successors should be treated as read-only. when deep-copying GraphRegion objects, - # successors inside graph_with_successors are *not* deep copied. therefore, you should never modify any - # successor node in graph_with_successors. to avoid potential programming errors, just treat - # graph_with_successors as read-only. - self.graph_with_successors = graph_with_successors - - self.full_graph = full_graph - self.cyclic = cyclic - - def __repr__(self): - addrs: List[int] = [] - s = "" - if self.graph is None: - # only head is available - return "" % self.head - - for node in self.graph.nodes(): - if hasattr(node, "addr"): - addrs.append(node.addr) - if addrs: - s = f": {min(addrs):#x}-{max(addrs):#x}" - - return "" % (self.head, self.graph.number_of_nodes(), s) - - def copy(self) -> "GraphRegion": - return GraphRegion( - self.head, - networkx.DiGraph(self.graph) if self.graph is not None else None, - set(self.successors) if self.successors is not None else None, - networkx.DiGraph(self.graph_with_successors) if self.graph_with_successors is not None else None, - self.cyclic, - networkx.DiGraph(self.full_graph) if self.full_graph is not None else None, - ) - - def recursive_copy(self, nodes_map=None): - - if nodes_map is None: - nodes_map = {} - new_graph = self._recursive_copy(self.graph, nodes_map) - - if self.graph_with_successors is not None: - successors = set() - for succ in self.successors: - if succ not in nodes_map: - if isinstance(succ, GraphRegion): - nodes_map[succ] = succ.recursive_copy(nodes_map=nodes_map) - else: - nodes_map[succ] = succ - successors.add(nodes_map[succ]) - - new_graph_with_successors = self._recursive_copy(self.graph_with_successors, nodes_map) - else: - new_graph_with_successors = None - successors = None - - if self.full_graph is not None: - new_full_graph = self._recursive_copy(self.full_graph, nodes_map) - else: - new_full_graph = None - - return GraphRegion( - nodes_map[self.head], new_graph, successors, new_graph_with_successors, self.cyclic, new_full_graph - ) - - @staticmethod - def _recursive_copy(old_graph, nodes_map, ignored_nodes=None) -> networkx.DiGraph: - new_graph = networkx.DiGraph() - - # make copy of each node and add the mapping from old nodes to new nodes into nodes_map - for node in old_graph.nodes(): - if node in nodes_map: - new_graph.add_node(nodes_map[node]) - elif ignored_nodes is not None and node in ignored_nodes: - # do not copy. 
use the reference instead - new_graph.add_node(node) - # drop it into the nodes_map - nodes_map[node] = node - else: - # make recursive copies - if type(node) is GraphRegion: - new_node = node.recursive_copy(nodes_map=nodes_map) - nodes_map[node] = new_node - elif type(node) is Block: - new_node = node.copy() - nodes_map[node] = new_node - else: - new_node = node - nodes_map[node] = new_node - new_graph.add_node(new_node) - - # add all edges - for src, dst, edge_data in old_graph.edges(data=True): - new_graph.add_edge(nodes_map[src], nodes_map[dst], **edge_data) - - return new_graph - - @property - def addr(self): - return self.head.addr - - @staticmethod - def dbg_get_repr(obj, ident=0): - if type(obj) is GraphRegion: - s = obj.dbg_print(ident=ident) - else: - s = " " * ident + str(obj) - - return s - - def dbg_print(self, ident=0): - - s = self.dbg_get_repr(self.head, ident=ident) + "\n" - - successors = list(self.graph.successors(self.head)) - if len(successors) == 2: - left_kid, right_kid = successors - s += ( - " " * ident - + "if (...) {\n" - + self.dbg_get_repr(left_kid, ident=ident + 2) - + "\n" - + " " * ident - + "}\n" - + " " * ident - + "else if (...) {\n" - + self.dbg_get_repr(right_kid, ident=ident + 2) - + "\n" - + " " * ident - + "}" - ) - # TODO: other nodes - elif len(successors) == 1: - s += self.dbg_get_repr(successors[0], ident=ident) - - return s - - def replace_region(self, sub_region, replace_with): - - if sub_region not in self.graph: - l.error("The sub-region to replace must be in the current region. Note that this method is not recursive.") - raise Exception() - - if sub_region is self.head: - self.head = replace_with - - self._replace_node_in_graph(self.graph, sub_region, replace_with) - if self.graph_with_successors is not None: - self._replace_node_in_graph(self.graph_with_successors, sub_region, replace_with) - - def replace_region_with_region(self, sub_region: "GraphRegion", replace_with: "GraphRegion"): - - if sub_region not in self.graph: - l.error("The sub-region to replace must be in the current region. Note that this method is not recursive.") - raise Exception() - - if sub_region is self.head: - self.head = replace_with.head - - # special case: a successor in replace_with.successors is a normal AIL block while the corresponding - # successor in self.successors is a graph region (with the AIL block as its head). 
we handle this case here by - # creating a new graph_with_successors for the replace_with region - successor_map = {} - if self.successors: - if any(succ not in self.successors for succ in replace_with.successors): - for succ in replace_with.successors: - if succ not in self.successors: - for succ_ in self.successors: - if isinstance(succ_, GraphRegion) and succ_.head == succ: - successor_map[succ] = succ_ - if successor_map: - replace_with_graph_with_successors = networkx.DiGraph() - for nn in replace_with.graph_with_successors: - replace_with_graph_with_successors.add_node(successor_map.get(nn, nn)) - for n0, n1 in replace_with.graph_with_successors.edges: - n0 = successor_map.get(n0, n0) - n1 = successor_map.get(n1, n1) - replace_with_graph_with_successors.add_edge(n0, n1) - else: - replace_with_graph_with_successors = replace_with.graph_with_successors - - self._replace_node_in_graph_with_subgraph( - self.graph, - self.successors, - self.full_graph, - sub_region, - replace_with_graph_with_successors, - replace_with.head, - ) - if self.graph_with_successors is not None: - self._replace_node_in_graph_with_subgraph( - self.graph_with_successors, - None, - self.full_graph, - sub_region, - replace_with_graph_with_successors, - replace_with.head, - ) - - @staticmethod - def _replace_node_in_graph(graph: networkx.DiGraph, node, replace_with): - - in_edges = list(graph.in_edges(node)) - out_edges = list(graph.out_edges(node)) - - graph.remove_node(node) - graph.add_node(replace_with) - - for src, _ in in_edges: - if src is node: - graph.add_edge(replace_with, replace_with) - else: - graph.add_edge(src, replace_with) - - for _, dst in out_edges: - if dst is node: - graph.add_edge(replace_with, replace_with) - else: - graph.add_edge(replace_with, dst) - - assert node not in graph - - @staticmethod - def _replace_node_in_graph_with_subgraph( - graph: networkx.DiGraph, - known_successors: Optional[List], - reference_full_graph: Optional[networkx.DiGraph], - node, - sub_graph: networkx.DiGraph, - sub_graph_head, - ): - - in_edges = list(graph.in_edges(node)) - out_edges = list(graph.out_edges(node)) - - graph.remove_node(node) - sub_graph_nodes = list(sub_graph.nodes) - sub_graph_edges = list(sub_graph.edges) - - for src, _ in in_edges: - if src is node: - graph.add_edge(sub_graph_head, sub_graph_head) - else: - graph.add_edge(src, sub_graph_head) - - for _, dst in out_edges: - if dst is node: - # ignore all self-loops - continue - if known_successors is not None and dst in known_successors: - continue - # find the correct source - if isinstance(dst, GraphRegion) and dst not in sub_graph: - # GraphRegion.successors may not store GraphRegion objects. Instead, the heads of GraphRegion objects - # are stored. 
-                for src in sub_graph.predecessors(dst.head):
-                    graph.add_edge(src, dst)
-                # replace the corresponding nodes in sub_graph_nodes and sub_graph_edges
-                for i in range(len(sub_graph_nodes)):  # pylint:disable=consider-using-enumerate
-                    if sub_graph_nodes[i] is dst.head:
-                        sub_graph_nodes[i] = dst
-                for i in range(len(sub_graph_edges)):  # pylint:disable=consider-using-enumerate
-                    if sub_graph_edges[i][0] is dst.head:
-                        sub_graph_edges[i] = (dst, sub_graph_edges[i][1])
-                    if sub_graph_edges[i][1] is dst.head:
-                        sub_graph_edges[i] = (sub_graph_edges[i][0], dst)
-            else:
-                if dst in sub_graph:
-                    for src in sub_graph.predecessors(dst):
-                        graph.add_edge(src, dst)
-                elif reference_full_graph is not None and dst in reference_full_graph:
-                    for src in reference_full_graph.predecessors(dst):
-                        if src in graph:
-                            graph.add_edge(src, dst)
-                else:
-                    # it may happen that the dst node does not exist in sub_graph
-                    # fallback
-                    l.info("Node dst is not found in sub_graph. Enter the fall back logic.")
-                    for src in sub_graph.nodes:
-                        if sub_graph.out_degree[src] == 0:
-                            graph.add_edge(src, dst)
-
-        graph.add_nodes_from(sub_graph_nodes)
-        graph.add_edges_from(sub_graph_edges)
-        # finally, remove all nodes from the graph in known_successors. they are only supposed to be in
-        # graph_with_successors.
-        if known_successors is not None:
-            for nn in known_successors:
-                if nn in graph:
-                    graph.remove_node(nn)
-
-        assert node not in graph
diff --git a/sailreval/joern/cfg/jil/__init__.py b/sailreval/joern/cfg/jil/__init__.py
deleted file mode 100644
index e69de29..0000000
diff --git a/sailreval/joern/cfg/jil/block.py b/sailreval/joern/cfg/jil/block.py
deleted file mode 100644
index 2f13b6e..0000000
--- a/sailreval/joern/cfg/jil/block.py
+++ /dev/null
@@ -1,103 +0,0 @@
-from textwrap import dedent
-from typing import Union, List
-
-import networkx as nx
-
-from .statement import MergedRegionStart
-
-#
-# Blocks
-#
-
-class Block:
-    def __init__(self, addr, statements=None, idx=None):
-        self.addr = addr
-        self.idx = idx
-        self.statements = statements or list()
-        self.idx_str = "" if self.idx is None else f".{self.idx}"
-
-    def __eq__(self, other):
-        return isinstance(other, Block) and self.addr == other.addr and self.statements == other.statements
-
-    def __hash__(self):
-        return hash(f"{self.addr}{self.idx}{[stmt for stmt in self.statements]}")
-
-    def __repr__(self):
-        return f"<Block: {self.addr}{self.idx_str}>"
-
-    def __str__(self):
-        output = f"{self.addr}{self.idx_str}:\n"
-        for line in self.statements:
-            output += f"{line}\n"
-
-        return output
-
-    def copy(self):
-        return Block(
-            self.addr,
-            statements=[stmt.copy() for stmt in self.statements],
-            idx=self.idx
-        )
-
-    def contains_addr(self, addr):
-        if self.addr == addr:
-            return True
-
-        for stmt in self.statements:
-            if stmt.source_line_number == addr:
-                return True
-
-        return False
-
-    @staticmethod
-    def merge_blocks(block1: "Block", block2: "Block", graph: nx.DiGraph = None, update_graph=True) -> \
-            Union["Block", nx.DiGraph]:
-        """
-        Merges two JIL blocks. When update_graph is True, the graph will be modified to reflect this.
-        This implementation assumes block1 is the parent of block2 for graph updates. 
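-
-        A minimal usage sketch (hypothetical blocks and graph, not from the
-        original test suite):
-
-            g = nx.DiGraph()
-            parent, child = Block(10), Block(11)
-            g.add_edge(parent, child)
-            merged = Block.merge_blocks(parent, child, graph=g, update_graph=True)
-            # `merged` is block1 with block2's statements appended; block2's
-            # out-edges are re-routed to come from the merged node.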
-
-        :param block1:
-        :param block2:
-        :param graph:
-        :param update_graph:
-        :return:
-        """
-
-        new_node = block1
-        new_node.statements += block2.statements
-        if not update_graph or graph is None:
-            return new_node
-
-        in_edges = list(graph.in_edges(block1))
-        out_edges = list(graph.out_edges(block2))
-        graph.remove_node(block1)
-        graph.remove_node(block2)
-
-        if new_node is not None:
-            graph.add_node(block1)
-
-        for src, _ in in_edges:
-            if src is block2:
-                src = new_node
-            graph.add_edge(src, new_node)
-
-        for _, dst in out_edges:
-            if dst is block1:
-                dst = new_node
-            graph.add_edge(new_node, dst)
-
-        return new_node
-
-
-def make_merge_block(addr, nodes: List[Block]):
-    node_count = 0
-    stmts = list()
-    for node in nodes:
-        for stmt in node.statements:
-            node_count += stmt.total_nodes if isinstance(stmt, MergedRegionStart) else 1
-            stmts.append(stmt)
-
-    return Block(
-        addr,
-        statements=[MergedRegionStart(source_line_number=addr, total_nodes=node_count)] + stmts
-    )
diff --git a/sailreval/joern/cfg/jil/lifter.py b/sailreval/joern/cfg/jil/lifter.py
deleted file mode 100644
index 95dec70..0000000
--- a/sailreval/joern/cfg/jil/lifter.py
+++ /dev/null
@@ -1,155 +0,0 @@
-import re
-from typing import Dict, List
-from collections import defaultdict
-import logging
-
-from .block import Block
-from .statement import (
-    UnsupportedStmt, UnknownStmt, Assignment, Compare, BinOp, Return, Call, Ternary, Nop
-)
-
-import networkx as nx
-
-l = logging.getLogger(__name__)
-
-STMT_MAP = {
-    "assignment": (Assignment, None),
-    "minus": (BinOp, BinOp.SUB),
-    "plus": (BinOp, BinOp.ADD),
-    "logicalAnd": (BinOp, BinOp.AND),
-    "logicalOr": (BinOp, BinOp.OR),
-    "equals": (Compare, Compare.EQ),
-    "greaterEqualsThan": (Compare, Compare.GTE),
-    "greaterThan": (Compare, Compare.GT),
-    "lessEqualsThan": (Compare, Compare.LTE),
-    "lessThan": (Compare, Compare.LT),
-    "notEquals": (Compare, Compare.NE),
-    "return": (Return, None),
-    "call": (Call, None),
-    "conditional": (Ternary, None),
-}
-
-
-def lift_graph(graph: nx.DiGraph):
-    lifted_graph = nx.DiGraph()
-    edges_by_node_addr: Dict[int, List[int]] = defaultdict(list)
-    for (in_node, out_node) in graph.edges:
-        edges_by_node_addr[in_node].append(out_node)
-
-    addr_uses = defaultdict(int)
-    new_blocks = {
-        node: lift_block(node, graph, addr_dict=addr_uses) for node in graph.nodes
-    }
-
-    for node_name, new_node in new_blocks.items():
-        for out_node in edges_by_node_addr[node_name]:
-            new_out_block = new_blocks[out_node]
-            lifted_graph.add_edge(new_node, new_out_block)
-
-    return lifted_graph
-
-
-def lift_block(node, graph, addr_dict=None):
-    node_data = graph._node[node].get('label', None)
-    if not node_data:
-        new_addr = 0 - int(node, 10)
-        new_idx = addr_dict[new_addr]
-        addr_dict[new_addr] += 1
-        return Block(0 - int(node, 10), statements=[Nop("", Nop.NOP)], idx=new_idx)
-
-    statements = []
-    for raw_stmt in node_data.split("\n"):
-        try:
-            lifted_stmt = lift_statement(raw_stmt)
-        except Exception as e:
-            l.debug(f"Encountered error while lifting: {e}")
-            lifted_stmt = UnsupportedStmt(raw_stmt)
-
-        statements.append(lifted_stmt)
-
-    new_addr = statements[0].source_line_number
-    new_idx = addr_dict[new_addr]
-    addr_dict[new_addr] += 1
-    return Block(new_addr, statements=statements, idx=new_idx)
-
-
-def lift_statement(raw_data: str):
-    source_line_number = re.findall("([0-9]+)", raw_data)
-    if source_line_number:
-        source_line_number = int(source_line_number[0], 10)
-    else:
-        l.debug(f"Failed to parse line number of node!")
-        source_line_number = None
-
line_num_kwarg = {"source_line_number": source_line_number} - - _t = "(".join(raw_data.split("(")[1:]) - raw_stmt = "".join(_t.split(")")[0:-1]).replace(" ", "") - - # statement - is_stmt = True - stmt_type = None - if raw_stmt.startswith("<operator>."): - # <operator>.lessEqualsThan,v0 <= line_buf - t = raw_stmt.split("<operator>.")[1] - t = t.split(",") - stmt_type = t[0] - elif raw_stmt.startswith("RETURN"): - t = raw_stmt.split("RETURN")[1] - t = t.split(",") - stmt_type = "return" - elif raw_stmt.startswith("METHOD_RETURN"): - return Nop(raw_stmt, Nop.FUNC_END, **line_num_kwarg) - elif raw_stmt.startswith("METHOD"): - return Nop(raw_stmt, Nop.FUNC_START, **line_num_kwarg) - else: - is_stmt = False - - if is_stmt: - # [lessEqualsThan], [v0 <= line_buf],... - raw_ops = ",".join(t[1:]) - # v0 <= line_buf ... - - - (jil_stmt_cls, sub_type) = STMT_MAP.get(stmt_type, (UnsupportedStmt, None)) - ops = [] - if jil_stmt_cls != UnsupportedStmt: - sep = jil_stmt_cls.op_seperator(sub_type) - if sep: - out = raw_ops.split(sep) - if not out[0]: - ops = [f"{sep}".join(out[1:])] - else: - ops = out[0:1] + [f"{sep}".join(out[1:])] - - # specific parsers - if jil_stmt_cls in (Compare, BinOp): - return jil_stmt_cls(t, sub_type, *ops, **line_num_kwarg) - # SPECIAL CASE: ternary - elif jil_stmt_cls == Ternary: - t = raw_ops.split("?") - cond = t[0] - t = "?".join(t) - t.split(":") - true = t[0] - false = t[1] - return Ternary(raw_stmt, cond, true, false, **line_num_kwarg) - else: - return jil_stmt_cls(t, *ops, **line_num_kwarg) - else: - t = raw_stmt.replace(" ", "") - - # check if call - try: - t_no_comma = t.split(",") - is_call = t_no_comma and t_no_comma[0] == t_no_comma[1].split("(")[0] - except (KeyError, IndexError): - is_call = False - if is_call: - raw_call = t_no_comma[1] - func = raw_call.split("(")[0] - raw_args = "(".join(raw_call.split("(")[1:]) - raw_args = ")".join(raw_args.split(")")[:-1]) - args = raw_args.split(",") - return Call(raw_stmt, func, args, **line_num_kwarg) - - return UnsupportedStmt(raw_stmt, **line_num_kwarg) diff --git a/sailreval/joern/cfg/jil/statement.py b/sailreval/joern/cfg/jil/statement.py deleted file mode 100644 index ba78a51..0000000 --- a/sailreval/joern/cfg/jil/statement.py +++ /dev/null @@ -1,208 +0,0 @@ -# JIL: -# JOERN Intermediate Language -# - -from copy import deepcopy - -class Statement: - def __init__(self, raw_text, source_line_number=None, **kwargs): - self.raw_text = raw_text - self.source_line_number = source_line_number - - @classmethod - def op_seperator(cls, sub_type): - return None - - def __str__(self): - return self.raw_text - - def __repr__(self): - return f"<{self.__class__.__name__}: {self.__str__()}>" - - def copy(self): - # TODO make this better in the future! 
- return deepcopy(self) - - -class Assignment(Statement): - def __init__(self, raw_text, src, dst, **kwargs): - super().__init__(raw_text, **kwargs) - self.src = src - self.dst = dst - - @classmethod - def op_seperator(cls, sub_type): - return "=" - - def __str__(self): - return f"{self.src} = {self.dst}" - - -class Return(Statement): - def __init__(self, raw_text, ret, **kwargs): - super().__init__(raw_text, **kwargs) - self.ret = ret - - @classmethod - def op_seperator(cls, sub_type): - return "return" - - def __str__(self): - return f"return {self.ret}" - - -class Call(Statement): - def __init__(self, raw_text, func, args, **kwargs): - super().__init__(raw_text, **kwargs) - self.func = func - self.args = args - - @classmethod - def op_seperator(cls, sub_type): - return "return" - - def __str__(self): - return f"{self.func}({''.join(self.args)})" - - -class Compare(Statement): - EQ = 0 - GTE = 1 - GT = 2 - LTE = 3 - LT = 4 - NE = 5 - - def __init__(self, raw_text, type_, arg1, arg2, **kwargs): - super().__init__(raw_text, **kwargs) - self.type = type_ - self.arg1 = arg1 - self.arg2 = arg2 - - def _pretty_seperator(self): - op_map = { - self.EQ: "==", - self.GTE: ">=", - self.GT: ">", - self.LT: "<", - self.LTE: "<=", - self.NE: "!=", - } - - return op_map[self.type] - - @classmethod - def op_seperator(cls, sub_type): - op_map = { - cls.EQ: "==", - cls.GTE: ">=", - cls.GT: ">", - cls.LT: "<", - cls.LTE: "<=", - cls.NE: "!=" - } - - return op_map.get(sub_type, None) - - def __str__(self): - return f"{self.arg1} {self._pretty_seperator()} {self.arg2}" - - -class Ternary(Statement): - def __init__(self, raw_text, cond, true, false, **kwargs): - super().__init__(raw_text, **kwargs) - self.cond = cond - self.true = true - self.false = false - - @classmethod - def op_seperator(cls, sub_type): - return "" - - def __str__(self): - return f"{self.cond} ? 
{self.true} : {self.false}" - - -class UnaryOp(Statement): - pass - - -class BinOp(Statement): - SUB = 0 - ADD = 1 - AND = 2 - OR = 3 - - def __init__(self, raw_text, type_, arg1, arg2, **kwargs): - super().__init__(raw_text, **kwargs) - self.type = type_ - self.arg1 = arg1 - self.arg2 = arg2 - - def _pretty_seperator(self): - op_map = { - self.SUB: "-", - self.ADD: "+", - self.AND: "&&", - self.OR: "||", - } - - return op_map[self.type] - - @classmethod - def op_seperator(cls, sub_type): - op_map = { - cls.SUB: "minus", - cls.ADD: "plus", - cls.AND: "logicalAnd", - cls.OR: "logicalOr", - } - - return op_map.get(sub_type, None) - - def __str__(self): - return f"{self.arg1} {self._pretty_seperator()} {self.arg2}" - - -class Nop(Statement): - FUNC_START = 0 - FUNC_END = 1 - NOP = 2 - - str_map = { - FUNC_START: "FUNCTION_START", - FUNC_END: "FUNCTION_END", - NOP: "NOP" - } - - def __init__(self, raw_text, type_, **kwargs): - super().__init__(raw_text, **kwargs) - self.type = type_ - - def __str__(self): - return f"{self.str_map.get(self.type, None)}" - - -class UnknownStmt(Statement): - def __str__(self): - return f"<{self.__class__.__name__}: {self.raw_text}>" - - def __repr__(self): - return self.__str__() - - -class UnsupportedStmt(Statement): - def __str__(self): - return f"<{self.__class__.__name__}: {self.raw_text}>" - - def __repr__(self): - return self.__str__() - - -class MergedRegionStart(Statement): - def __init__(self, source_line_number=None, total_nodes=0, **kwargs): - super().__init__("", source_line_number=source_line_number) - self.total_nodes = total_nodes - - def __str__(self): - return f"<{self.__class__.__name__}: {self.source_line_number}, {self.total_nodes} Nodes>" diff --git a/sailreval/joern/cfg/region_identifier.py b/sailreval/joern/cfg/region_identifier.py deleted file mode 100644 index d13e7f7..0000000 --- a/sailreval/joern/cfg/region_identifier.py +++ /dev/null @@ -1,868 +0,0 @@ -from collections import defaultdict -import logging -from typing import List, Optional, Union - -import networkx -from angr.utils.graph import dfs_back_edges, subgraph_between_nodes, dominates, shallow_reverse -from angr.utils.graph import GraphUtils - -from .graph_region import GraphRegion -from .jil.block import Block - -l = logging.getLogger(name=__name__) - - -class RegionIdentifier: - """ - Identifies regions within a function. - """ - - def __init__( - self, - graph, - largest_successor_tree_outside_loop=True, - complete_successors=False, - ): - self._graph = graph - self.regions_by_block_addrs = [] - - self.region = None - self._start_node = None - self._loop_headers: Optional[List] = None - self._largest_successor_tree_outside_loop = largest_successor_tree_outside_loop - self._complete_successors = complete_successors - - self._analyze() - - @staticmethod - def slice_graph(graph, node, frontier, include_frontier=False): - """ - Generate a slice of the graph from the head node to the given frontier. - - :param networkx.DiGraph graph: The graph to work on. - :param node: The starting node in the graph. - :param frontier: A list of frontier nodes. - :param bool include_frontier: Whether the frontier nodes are included in the slice or not. - :return: A subgraph. 
- :rtype: networkx.DiGraph - """ - - subgraph = subgraph_between_nodes(graph, node, frontier, include_frontier=include_frontier) - if not list(subgraph.nodes): - # HACK: FIXME: for infinite loop nodes, this would return an empty set, so we include the loop body itself - # Make sure this makes sense (EDG thinks it does) - if (node, node) in graph.edges: - subgraph.add_edge(node, node, src=node, dst=node) - return subgraph - - def _analyze(self): - - # make a copy of the graph - graph = networkx.DiGraph(self._graph) - - # preprocess: make it a super graph - self._make_supergraph(graph) - - self._start_node = self._get_start_node(graph) - - # preprocess: find loop headers - self._loop_headers = self._find_loop_headers(graph) - - self.region = self._make_regions(graph) - - # make regions into block address lists - self.regions_by_block_addrs = self._make_regions_by_block_addrs() - - def _make_regions_by_block_addrs(self) -> List[List[int]]: - """ - Creates a list of addr lists representing each region without recursion. A single region is defined - as a set of only blocks, no Graphs containing nested regions. The list contains the address of each - block in the region, including the heads of each recursive region. - - @return: List of addr lists - """ - - work_list = [self.region] - block_only_regions = [] - seen_regions = set() - while work_list: - children_regions = [] - for region in work_list: - children_blocks = [] - for node in region.graph.nodes: - if isinstance(node, Block): - children_blocks.append(node.addr) - elif isinstance(node, GraphRegion): - if node not in seen_regions: - children_regions.append(node) - children_blocks.append(node.head.addr) - seen_regions.add(node) - else: - continue - - if children_blocks: - block_only_regions.append(children_blocks) - - work_list = children_regions - - return block_only_regions - - def _get_start_node(self, graph: networkx.DiGraph): - try: - return next(n for n in graph.nodes() if graph.in_degree(n) == 0) - except StopIteration: - pass - - try: - return next(n for n in graph.nodes() if n.addr == self.function.addr) - except StopIteration as ex: - raise RuntimeError("Cannot find the start node from the graph!") from ex - - def _test_reducibility(self): - - # make a copy of the graph - graph = networkx.DiGraph(self._graph) - - # preprocess: make it a super graph - self._make_supergraph(graph) - - while True: - - changed = False - - # find a node with a back-edge, remove the edge (deleting the loop), and replace it with a MultiNode - changed |= self._remove_self_loop(graph) - - # find a node that has only one predecessor, and merge it with its predecessor (replace them with a - # MultiNode) - changed |= self._merge_single_entry_node(graph) - - if not changed: - # a fixed-point is reached - break - - # Flow graph reducibility, Hecht and Ullman - if len(graph.nodes) == 1: - return True - - return False - - def _make_supergraph(self, graph: networkx.DiGraph): - #return graph - while True: - for src, dst, data in graph.edges(data=True): - type_ = data.get("type", None) - if type_ == "fake_return": - if len(list(graph.successors(src))) == 1 and len(list(graph.predecessors(dst))) == 1: - self._merge_nodes(graph, src, dst, force_multinode=True) - break - elif type_ == "call": - graph.remove_node(dst) - break - else: - break - - def _find_loop_headers(self, graph: networkx.DiGraph) -> List: - - heads = {t for _, t in dfs_back_edges(graph, self._start_node)} - return GraphUtils.quasi_topological_sort_nodes(graph, heads) - - def 
_find_initial_loop_nodes(self, graph: networkx.DiGraph, head): - # TODO optimize - latching_nodes = {s for s, t in dfs_back_edges(graph, self._start_node) if t == head} - loop_subgraph = self.slice_graph(graph, head, latching_nodes, include_frontier=True) - - # special case: any node with more than two non-self successors are probably the head of a switch-case. we - # should include all successors into the loop subgraph. - while True: - updated = False - for node in list(loop_subgraph): - nonself_successors = [succ for succ in graph.successors(node) if succ is not node] - if len(nonself_successors) > 2: - for succ in nonself_successors: - if not loop_subgraph.has_edge(node, succ): - updated = True - loop_subgraph.add_edge(node, succ, src=node, dst=succ) - if not updated: - break - - nodes = set(loop_subgraph) - return nodes - - def _refine_loop(self, graph: networkx.DiGraph, head, initial_loop_nodes, initial_exit_nodes): - if len(initial_exit_nodes) <= 1: - return initial_loop_nodes, initial_exit_nodes - - refined_loop_nodes = initial_loop_nodes.copy() - refined_exit_nodes = initial_exit_nodes.copy() - - # simple optimization: include all single-in-degree successors of existing loop nodes - while True: - added = set() - for exit_node in list(refined_exit_nodes): - if graph.in_degree[exit_node] == 1 and graph.out_degree[exit_node] <= 1: - added.add(exit_node) - refined_loop_nodes.add(exit_node) - refined_exit_nodes |= { - succ for succ in graph.successors(exit_node) if succ not in refined_loop_nodes - } - refined_exit_nodes.remove(exit_node) - if not added: - break - - if len(refined_exit_nodes) <= 1: - return refined_loop_nodes, refined_exit_nodes - - idom = networkx.immediate_dominators(graph, head) - - new_exit_nodes = refined_exit_nodes - # a graph with only initial exit nodes and new loop nodes that are reachable from at least one initial exit - # node. 
-        # a graph with only initial exit nodes and new loop nodes that are reachable from at least one initial exit
-        # node.
-        subgraph = networkx.DiGraph()
-
-        sorted_refined_exit_nodes = GraphUtils.quasi_topological_sort_nodes(graph, refined_exit_nodes)
-        while len(sorted_refined_exit_nodes) > 1 and new_exit_nodes:
-            # visit each node in refined_exit_nodes once and determine which nodes to consider as loop nodes
-            candidate_nodes = {}
-            for n in list(sorted_refined_exit_nodes):
-                if all((pred is n or pred in refined_loop_nodes) for pred in graph.predecessors(n)) and dominates(
-                    idom, head, n
-                ):
-                    to_add = set(graph.successors(n)) - refined_loop_nodes
-                    candidate_nodes[n] = to_add
-
-            # visit all candidate nodes and only consider candidates that will not be added as exit nodes
-            all_new_exit_candidates = set()
-            for new_exit_candidates in candidate_nodes.values():
-                all_new_exit_candidates |= new_exit_candidates
-
-            # to guarantee progress, we must ensure all_new_exit_candidates cannot contain all candidate nodes
-            if all(n in all_new_exit_candidates for n in candidate_nodes):
-                all_new_exit_candidates = set()
-
-            # do the actual work
-            new_exit_nodes = set()
-            for n in candidate_nodes:
-                if n in all_new_exit_candidates:
-                    continue
-                refined_loop_nodes.add(n)
-                sorted_refined_exit_nodes.remove(n)
-                to_add = set(graph.successors(n)) - refined_loop_nodes
-                new_exit_nodes |= to_add
-                for succ in to_add:
-                    subgraph.add_edge(n, succ, src=n, dst=succ)
-
-            sorted_refined_exit_nodes += list(new_exit_nodes)
-            sorted_refined_exit_nodes = list(set(sorted_refined_exit_nodes))
-            sorted_refined_exit_nodes = GraphUtils.quasi_topological_sort_nodes(graph, sorted_refined_exit_nodes)
-
-        refined_exit_nodes = set(sorted_refined_exit_nodes)
-        refined_loop_nodes = refined_loop_nodes - refined_exit_nodes
-
-        if self._largest_successor_tree_outside_loop and not refined_exit_nodes:
-            # figure out the new successor tree with the highest number of nodes
-            initial_exit_to_newnodes = defaultdict(set)
-            newnode_to_initial_exits = defaultdict(set)
-            for initial_exit in initial_exit_nodes:
-                if initial_exit in subgraph:
-                    for _, succs in networkx.bfs_successors(subgraph, initial_exit):
-                        initial_exit_to_newnodes[initial_exit] |= set(succs)
-                        for succ in succs:
-                            newnode_to_initial_exits[succ].add(initial_exit)
-
-            for newnode, exits in newnode_to_initial_exits.items():
-                for exit_ in exits:
-                    initial_exit_to_newnodes[exit_].add(newnode)
-            if initial_exit_to_newnodes:
-                tree_sizes = {exit_: len(initial_exit_to_newnodes[exit_]) for exit_ in initial_exit_to_newnodes}
-                max_tree_size = max(tree_sizes.values())
-                if list(tree_sizes.values()).count(max_tree_size) == 1:
-                    tree_size_to_exit = {v: k for k, v in tree_sizes.items()}
-                    max_size_exit = tree_size_to_exit[max_tree_size]
-                    if all(len(newnode_to_initial_exits[nn]) == 1 for nn in initial_exit_to_newnodes[max_size_exit]):
-                        refined_loop_nodes = (
-                            refined_loop_nodes - initial_exit_to_newnodes[max_size_exit] - {max_size_exit}
-                        )
-                        refined_exit_nodes.add(max_size_exit)
-
-        return refined_loop_nodes, refined_exit_nodes
-
-    def _remove_self_loop(self, graph: networkx.DiGraph):
-
-        r = False
-
-        while True:
-            for node in graph.nodes():
-                if node in graph[node]:
-                    # found a self loop
-                    self._remove_node(graph, node)
-                    r = True
-                    break
-            else:
-                break
-
-        return r
-
-    def _merge_single_entry_node(self, graph: networkx.DiGraph):
-
-        r = False
-
-        while True:
-            for node in networkx.dfs_postorder_nodes(graph):
-                # predecessors() returns an iterator; materialize it before len()/indexing
-                preds = list(graph.predecessors(node))
-                if len(preds) == 1:
-                    # merge the two nodes
-                    self._absorb_node(graph, preds[0], node)
-                    r = True
-                    break
-            else:
-                break
-
-        return r
-
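# Illustrative sketch (not from the original file): the list() added above is
# the fix -- DiGraph.predecessors() returns an iterator, so calling len() on it
# directly, as the pre-fix line did, raises TypeError. A quick self-contained
# demonstration:
import networkx as nx

g = nx.DiGraph([(1, 2)])
preds = list(g.predecessors(2))   # materialize before len()/indexing
assert len(preds) == 1 and preds[0] == 1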
-    def _make_regions(self, graph: networkx.DiGraph):
-
-        structured_loop_headers = set()
-        new_regions = []
-
-        # FIXME: _get_start_node() will fail if the graph is just a loop
-
-        # Find all loops
-        while True:
-            restart = False
-
-            self._start_node = self._get_start_node(graph)
-
-            # Start from loops
-            for node in list(reversed(self._loop_headers)):
-                if node in structured_loop_headers:
-                    continue
-                if node not in graph:
-                    continue
-                region = self._make_cyclic_region(node, graph)
-                if region is None:
-                    # failed to structure the loop region - remove the header node from loop headers
-                    l.debug(
-                        "Failed to structure a loop region starting at %#x. Remove it from loop headers.", node.addr
-                    )
-                    self._loop_headers.remove(node)
-                else:
-                    l.debug("Structured a loop region %r.", region)
-                    new_regions.append(region)
-                    structured_loop_headers.add(node)
-                    restart = True
-                    break
-
-            if restart:
-                continue
-
-            break
-
-        new_regions.append(GraphRegion(self._get_start_node(graph), graph, None, None, False, None))
-
-        l.debug("Identified %d loop regions.", len(structured_loop_headers))
-        l.debug("No more loops left. Start structuring acyclic regions.")
-        # No more loops left. Structure acyclic regions.
-        while new_regions:
-            region = new_regions.pop(0)
-            head = region.head
-            subgraph = region.graph
-
-            failed_region_attempts = set()
-            while self._make_acyclic_region(
-                head, subgraph, region.graph_with_successors, failed_region_attempts, region.cyclic
-            ):
-                if head not in subgraph:
-                    # update head
-                    head = next(iter(n for n in subgraph.nodes() if n.addr == head.addr))
-
-            head = next(iter(n for n in subgraph.nodes() if n.addr == head.addr))
-            region.head = head
-
-        if len(graph.nodes()) == 1 and isinstance(list(graph.nodes())[0], GraphRegion):
-            return list(graph.nodes())[0]
-        # create a large graph region
-        new_head = self._get_start_node(graph)
-        region = GraphRegion(new_head, graph, None, None, False, None)
-        return region
-
-    #
-    # Cyclic regions
-    #
-
-    def _make_cyclic_region(self, head, graph: networkx.DiGraph):
-
-        l.debug("Found cyclic region at %#08x", head.addr)
-        initial_loop_nodes = self._find_initial_loop_nodes(graph, head)
-        l.debug("Initial loop nodes %s", self._dbg_block_list(initial_loop_nodes))
-
-        # Make sure no other loops are contained in the current loop
-        if {n for n in initial_loop_nodes if n.addr != head.addr}.intersection(self._loop_headers):
-            return None
-
-        normal_entries = {n for n in graph.predecessors(head) if n not in initial_loop_nodes}
-        abnormal_entries = set()
-        for n in initial_loop_nodes:
-            if n == head:
-                continue
-            preds = set(graph.predecessors(n))
-            abnormal_entries |= preds - initial_loop_nodes
-        l.debug("Normal entries %s", self._dbg_block_list(normal_entries))
-        l.debug("Abnormal entries %s", self._dbg_block_list(abnormal_entries))
-
-        initial_exit_nodes = set()
-        for n in initial_loop_nodes:
-            succs = set(graph.successors(n))
-            initial_exit_nodes |= succs - initial_loop_nodes
-
-        l.debug("Initial exit nodes %s", self._dbg_block_list(initial_exit_nodes))
-
-        refined_loop_nodes, refined_exit_nodes = self._refine_loop(graph, head, initial_loop_nodes, initial_exit_nodes)
-        l.debug("Refined loop nodes %s", self._dbg_block_list(refined_loop_nodes))
-        l.debug("Refined exit nodes %s", self._dbg_block_list(refined_exit_nodes))
-
-        # make sure there is a jump statement to the outside at the end of each node going to exit nodes.
-        # this jump statement will be rewritten to a break statement during structuring.
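# Illustrative sketch (not from the original file): the entry classification
# earlier in this method, rerun on a toy graph with invented names --
# out-of-loop predecessors of the head are "normal" entries; out-of-loop edges
# into any other loop node are "abnormal" entries.
import networkx as nx

g = nx.DiGraph([("pre", "head"), ("head", "body"), ("body", "head"), ("side", "body")])
loop_nodes = {"head", "body"}
normal_entries = {p for p in g.predecessors("head") if p not in loop_nodes}
abnormal_entries = set()
for n in loop_nodes - {"head"}:
    abnormal_entries |= set(g.predecessors(n)) - loop_nodes
assert normal_entries == {"pre"} and abnormal_entries == {"side"}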
- #for exit_node in refined_exit_nodes: - # for pred in graph.predecessors(exit_node): - # if pred in refined_loop_nodes: - # self._ensure_jump_at_loop_exit_ends(pred) - - if len(refined_exit_nodes) > 1: - # self._get_start_node(graph) - node_post_order = list(networkx.dfs_postorder_nodes(graph, head)) - sorted_exit_nodes = sorted(list(refined_exit_nodes), key=node_post_order.index) - normal_exit_node = sorted_exit_nodes[0] - abnormal_exit_nodes = set(sorted_exit_nodes[1:]) - else: - normal_exit_node = next(iter(refined_exit_nodes)) if len(refined_exit_nodes) > 0 else None - abnormal_exit_nodes = set() - - return self._abstract_cyclic_region( - graph, refined_loop_nodes, head, normal_entries, abnormal_entries, normal_exit_node, abnormal_exit_nodes - ) - - # - # Acyclic regions - # - - def _make_acyclic_region(self, head, graph: networkx.DiGraph, secondary_graph, failed_region_attempts, cyclic): - # pre-processing - - # we need to create a copy of the original graph if - # - there are in edges to the head node, or - # - there are more than one end nodes - - head_inedges = list(graph.in_edges(head)) - if head_inedges: - # we need a copy of the graph to remove edges coming into the head - graph_copy = networkx.DiGraph(graph) - # remove any in-edge to the head node - for src, _ in head_inedges: - graph_copy.remove_edge(src, head) - else: - graph_copy = graph - - endnodes = [node for node in graph_copy.nodes() if graph_copy.out_degree(node) == 0] - if len(endnodes) == 0: - # sanity check: there should be at least one end node - #l.critical("No end node is found in a supposedly acyclic graph. Is it really acyclic?") - return False - - add_dummy_endnode = False - if len(endnodes) > 1: - # if this graph has multiple end nodes: create a single end node - add_dummy_endnode = True - elif head_inedges and len(endnodes) == 1 and endnodes[0] not in list(graph.predecessors(head)): - # special case: there are in-edges to head, but the only end node is not a predecessor to head. - # in this case, we will want to put the end node and a predecessor of the head into the same region. - add_dummy_endnode = True - - if add_dummy_endnode: - # we need a copy of the graph! 
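# Illustrative sketch (not from the original file): the region checks below
# lean on immediate dominators and dominance frontiers; a self-contained toy
# example of the two networkx calls used:
import networkx as nx

g = nx.DiGraph([(0, 1), (0, 2), (1, 3), (2, 3)])   # if/else diamond
idom = nx.immediate_dominators(g, 0)
df = nx.dominance_frontiers(g, 0)
assert idom[3] == 0                   # the join point is dominated only by the entry
assert df[1] == {3} and df[2] == {3}  # both branches' frontiers meet at the join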
- graph_copy = networkx.DiGraph(graph_copy) - dummy_endnode = "DUMMY_ENDNODE" - for endnode in endnodes: - graph_copy.add_edge(endnode, dummy_endnode, src=endnode, dst=dummy_endnode) - endnodes = [dummy_endnode] - else: - dummy_endnode = None - - # compute dominator tree - doms = networkx.immediate_dominators(graph_copy, head) - - # compute post-dominator tree - inverted_graph = shallow_reverse(graph_copy) - postdoms = networkx.immediate_dominators(inverted_graph, endnodes[0]) - - # dominance frontiers - df = networkx.algorithms.dominance_frontiers(graph_copy, head) - - # visit the nodes in post-order - for node in networkx.dfs_postorder_nodes(graph_copy, source=head): - if node is dummy_endnode: - # skip the dummy endnode - continue - if cyclic and node is head: - continue - - out_degree = graph_copy.out_degree[node] - if out_degree == 0: - # the root element of the region hierarchy should always be a GraphRegion, - # so we transform it into one, if necessary - if graph_copy.in_degree(node) == 0 and not isinstance(node, GraphRegion): - subgraph = networkx.DiGraph() - subgraph.add_node(node, node=node) - self._abstract_acyclic_region( - graph, GraphRegion(node, subgraph, None, None, False, None), [], secondary_graph=secondary_graph - ) - continue - - # test if this node is an entry to a single-entry, single-successor region - levels = 0 - postdom_node = postdoms.get(node, None) - while postdom_node is not None: - if (node, postdom_node) not in failed_region_attempts: - if self._check_region(graph_copy, node, postdom_node, doms, df): - frontier = [postdom_node] - region = self._compute_region(graph_copy, node, frontier, dummy_endnode=dummy_endnode) - if region is not None: - # update region.graph_with_successors - if secondary_graph is not None: - if self._complete_successors: - for nn in list(region.graph_with_successors.nodes): - original_successors = secondary_graph.successors(nn) - for succ in original_successors: - if not region.graph_with_successors.has_edge(nn, succ): - region.graph_with_successors.add_edge(nn, succ, src=nn, dst=succ) - region.successors.add(succ) - else: - for nn in list(region.graph_with_successors.nodes): - original_successors = secondary_graph.successors(nn) - for succ in original_successors: - if succ not in graph_copy: - # the successor wasn't added to the graph because it does not belong - # to the frontier. we backpatch the successor graph here. - region.graph_with_successors.add_edge(nn, succ, src=nn, dst=succ) - region.successors.add(succ) - - # l.debug("Walked back %d levels in postdom tree.", levels) - l.debug("Node %r, frontier %r.", node, frontier) - # l.debug("Identified an acyclic region %s.", self._dbg_block_list(region.graph.nodes())) - self._abstract_acyclic_region( - graph, region, frontier, dummy_endnode=dummy_endnode, secondary_graph=secondary_graph - ) - # assert dummy_endnode not in graph - return True - - failed_region_attempts.add((node, postdom_node)) - if not dominates(doms, node, postdom_node): - break - if postdom_node is postdoms.get(postdom_node, None): - break - postdom_node = postdoms.get(postdom_node, None) - levels += 1 - # l.debug("Walked back %d levels in postdom tree and did not find anything for %r. 
Next.", levels, node) - - return False - - @staticmethod - def _check_region(graph, start_node, end_node, doms, df): - """ - - :param graph: - :param start_node: - :param end_node: - :param doms: - :param df: - :return: - """ - - # if the exit node is the header of a loop that contains the start node, the dominance frontier should only - # contain the exit node. - if not dominates(doms, start_node, end_node): - frontier = df.get(start_node, set()) - for node in frontier: - if node is not start_node and node is not end_node: - return False - - # no edges should enter the region. - for node in df.get(end_node, set()): - if dominates(doms, start_node, node) and node is not end_node: - return False - - # no edges should leave the region. - for node in df.get(start_node, set()): - if node is start_node or node is end_node: - continue - if node not in df.get(end_node, set()): - return False - for pred in graph.predecessors(node): - if dominates(doms, start_node, pred) and not dominates(doms, end_node, pred): - return False - - return True - - @staticmethod - def _compute_region(graph, node, frontier, include_frontier=False, dummy_endnode=None): - - subgraph = networkx.DiGraph() - frontier_edges = [] - queue = [node] - traversed = set() - - while queue: - node_ = queue.pop() - if node_ in frontier: - continue - traversed.add(node_) - subgraph.add_node(node_, node=node_) - - for succ in graph.successors(node_): - edge_data = graph.get_edge_data(node_, succ) - - if node_ in frontier and succ in traversed: - if include_frontier: - # if frontier nodes are included, do not keep traversing their successors - # however, if it has an edge to an already traversed node, we should add that edge - subgraph.add_edge(node_, succ, src=node_, dst=succ) - else: - frontier_edges.append((node_, succ, edge_data)) - continue - - if succ is dummy_endnode: - continue - - if succ in frontier: - if not include_frontier: - # skip all frontier nodes - frontier_edges.append((node_, succ, edge_data)) - continue - subgraph.add_edge(node_, succ, src=node_, dst=succ) - if succ in traversed: - continue - queue.append(succ) - - if dummy_endnode is not None: - frontier = {n for n in frontier if n is not dummy_endnode} - - if subgraph.number_of_nodes() > 1: - subgraph_with_frontier = networkx.DiGraph(subgraph) - for src, dst, edge_data in frontier_edges: - if dst is not dummy_endnode: - subgraph_with_frontier.add_edge(src, dst, src=src, dst=dst) - # assert dummy_endnode not in frontier - # assert dummy_endnode not in subgraph_with_frontier - return GraphRegion(node, subgraph, frontier, subgraph_with_frontier, False, None) - else: - return None - - def _abstract_acyclic_region( - self, graph: networkx.DiGraph, region, frontier, dummy_endnode=None, secondary_graph=None - ): - - in_edges = self._region_in_edges(graph, region, data=True) - out_edges = self._region_out_edges(graph, region, data=True) - - nodes_set = set() - for node_ in list(region.graph.nodes()): - nodes_set.add(node_) - if node_ is not dummy_endnode: - graph.remove_node(node_) - - graph.add_node(region, node=region) - - for src, _, data in in_edges: - if src not in nodes_set: - graph.add_edge(src, region, src=src, dst=region) - - for _, dst, data in out_edges: - if dst not in nodes_set: - graph.add_edge(region, dst, src=region, dst=dst) - - if frontier: - for frontier_node in frontier: - if frontier_node is not dummy_endnode: - graph.add_edge(region, frontier_node, src=region, dst=frontier_node) - - if secondary_graph is not None: - 
self._abstract_acyclic_region(secondary_graph, region, {}) - - @staticmethod - def _abstract_cyclic_region( - graph: networkx.DiGraph, - loop_nodes, - head, - normal_entries, - abnormal_entries, - normal_exit_node, - abnormal_exit_nodes, - ): - region = GraphRegion(head, None, None, None, True, None) - - subgraph = networkx.DiGraph() - region_outedges = [] - - delayed_edges = [] - - full_graph = networkx.DiGraph() - - for node in loop_nodes: - subgraph.add_node(node, node=node) - in_edges = list(graph.in_edges(node, data=True)) - out_edges = list(graph.out_edges(node, data=True)) - - for src, dst, data in in_edges: - full_graph.add_edge(src, dst, src=src, dst=dst) - if src in loop_nodes: - subgraph.add_edge(src, dst, src=src, dst=dst) - elif src is region: - subgraph.add_edge(head, dst, src=head, dst=dst) - elif src in normal_entries: - # graph.add_edge(src, region, **data) - delayed_edges.append((src, region, data)) - elif src in abnormal_entries: - data["region_dst_node"] = dst - # graph.add_edge(src, region, **data) - delayed_edges.append((src, region, data)) - else: - assert 0 - - for src, dst, data in out_edges: - full_graph.add_edge(src, dst, src=src, dst=dst) - if dst in loop_nodes: - subgraph.add_edge(src, dst, src=src, dst=dst) - elif dst is region: - subgraph.add_edge(src, head, src=src, dst=head) - elif dst is normal_exit_node: - region_outedges.append((node, dst)) - # graph.add_edge(region, dst, **data) - delayed_edges.append((region, dst, data)) - elif dst in abnormal_exit_nodes: - region_outedges.append((node, dst)) - # data['region_src_node'] = src - # graph.add_edge(region, dst, **data) - delayed_edges.append((region, dst, data)) - else: - assert 0 - - subgraph_with_exits = networkx.DiGraph(subgraph) - for src, dst in region_outedges: - subgraph_with_exits.add_edge(src, dst, src=src, dst=dst) - region.graph = subgraph - region.graph_with_successors = subgraph_with_exits - if normal_exit_node is not None: - region.successors = [normal_exit_node] - else: - region.successors = [] - region.successors += list(abnormal_exit_nodes) - - for node in loop_nodes: - graph.remove_node(node) - - # add delayed edges - graph.add_node(region, node=region) - for src, dst, data in delayed_edges: - graph.add_edge(src, dst, src=src, dst=dst) - - region.full_graph = full_graph - - return region - - @staticmethod - def _region_in_edges(graph, region, data=False): - - return list(graph.in_edges(region.head, data=data)) - - @staticmethod - def _region_out_edges(graph, region, data=False): - - out_edges = [] - for node in region.graph.nodes(): - out_ = graph.out_edges(node, data=data) - for _, dst, data_ in out_: - if dst in region.graph: - continue - out_edges.append((region, dst, data_)) - return out_edges - - def _remove_node(self, graph: networkx.DiGraph, node): # pylint:disable=no-self-use - graph.remove_node(node) - - def _merge_nodes( - self, graph: networkx.DiGraph, node_a, node_b, force_multinode=False - ): # pylint:disable=no-self-use - - in_edges = list(graph.in_edges(node_a, data=True)) - out_edges = list(graph.out_edges(node_b, data=True)) - - if not force_multinode and len(in_edges) <= 1 and len(out_edges) <= 1: - # it forms a region by itself :-) - new_node = None - - else: - new_node = Block.merge_blocks(node_a, node_b) - - graph.remove_node(node_a) - graph.remove_node(node_b) - - if new_node is not None: - graph.add_node(new_node, node=new_node) - - for src, _, data in in_edges: - if src is node_b: - src = new_node - graph.add_edge(src, new_node, src=src, dst=new_node) - - for _, 
dst, data in out_edges: - if dst is node_a: - dst = new_node - graph.add_edge(new_node, dst, src=new_node, dst=dst) - - assert not node_a in graph - assert not node_b in graph - - def _absorb_node( - self, graph: networkx.DiGraph, node_mommy, node_kiddie, force_multinode=False - ): # pylint:disable=no-self-use - - in_edges_mommy = graph.in_edges(node_mommy, data=True) - out_edges_mommy = graph.out_edges(node_mommy, data=True) - out_edges_kiddie = graph.out_edges(node_kiddie, data=True) - - if not force_multinode and len(in_edges_mommy) <= 1 and len(out_edges_kiddie) <= 1: - # it forms a region by itself :-) - new_node = None - - else: - new_node = Block.merge_blocks(node_mommy, node_kiddie) - - graph.remove_node(node_mommy) - graph.remove_node(node_kiddie) - - if new_node is not None: - graph.add_node(new_node, node=new_node) - - for src, _, data in in_edges_mommy: - if src == node_kiddie: - src = new_node - graph.add_edge(src, new_node, src=src, dst=new_node) - - for _, dst, data in out_edges_mommy: - if dst == node_kiddie: - continue - if dst == node_mommy: - dst = new_node - graph.add_edge(new_node, dst, src=new_node, dst=dst) - - for _, dst, data in out_edges_kiddie: - if dst == node_mommy: - dst = new_node - graph.add_edge(new_node, dst, src=new_node, dst=dst) - - assert not node_mommy in graph - assert not node_kiddie in graph - - @staticmethod - def _dbg_block_list(blocks): - return [(hex(b.addr) if hasattr(b, "addr") else repr(b)) for b in blocks] diff --git a/sailreval/joern/cfg/utils.py b/sailreval/joern/cfg/utils.py deleted file mode 100644 index db4970f..0000000 --- a/sailreval/joern/cfg/utils.py +++ /dev/null @@ -1,233 +0,0 @@ -from subprocess import run -from pathlib import Path -import shutil -from tempfile import TemporaryDirectory -import logging -import os -from typing import Dict - -import networkx -from networkx import Graph, DiGraph - -from sailreval.utils import WorkDirContext, bcolors -from .. 
import JOERN_EXPORT_PATH, JOERN_PARSE_PATH
-from .jil.lifter import lift_graph
-from .jil.block import Block
-from .jil.statement import Nop, Statement
-from ...utils.binary_debug_info import read_line_maps
-
-import networkx as nx
-import pygraphviz as pg
-import graphviz
-
-l = logging.getLogger(__name__)
-
-
-def addr_to_node_map(graph):
-    return {
-        node.addr: node for node in graph.nodes
-    }
-
-
-def find_function_root_node(graph: nx.DiGraph):
-    for node in graph.nodes:
-        first_stmt = node.statements[0]
-        if isinstance(first_stmt, Nop) and first_stmt.type == Nop.FUNC_START:
-            return node
-
-    return None
-
-
-def correct_decompiler_mappings(dec_cfg, dec_line_to_addr_map):
-    node: Block = find_function_root_node(dec_cfg)
-    if node is None:
-        raise Exception("Could not find root node!")
-
-    base = node.addr - 1
-    corrected_line_nums = {k+base: v for k, v in dec_line_to_addr_map.items()}
-    line_nums = corrected_line_nums.keys()
-    first_ln = min(line_nums)
-    last_ln = max(line_nums)
-
-    for ln in range(first_ln, last_ln):
-        val = corrected_line_nums.get(ln, None)
-        if val is not None:
-            continue
-
-        # search for a value, expanding positive and negative offsets equally,
-        # until another line has a value that can be used
-        reach = 1
-        is_neg = True
-        while reach < last_ln:
-            curr_reach = 0 - reach if is_neg else reach
-            val = corrected_line_nums.get(ln + curr_reach, None)
-            # leave when we find a good val
-            if val is not None:
-                corrected_line_nums[ln] = val
-                break
-
-            reach += 1
-            is_neg ^= True
-
-    return corrected_line_nums
-
-
-def correct_source_cfg_addrs(cfgs: Dict[str, nx.DiGraph], line_map_file: Path):
-    line_map_file = Path(line_map_file).absolute()
-    func_correct_line_map = read_line_maps(line_map_file, value_is_set_type=False)
-
-    new_cfgs = {}
-    for cfg_name, cfg in cfgs.items():
-        # skip cfgs without mappings
-        line_to_correct_map = func_correct_line_map.get(cfg_name, None)
-        if line_to_correct_map is None:
-            continue
-
-        nodes_map = {}
-        new_graph = nx.DiGraph()
-        for node in cfg:
-            node_copy = node.copy()
-            new_addr = line_to_correct_map.get(node.addr, None)
-            if new_addr is not None:
-                node_copy.addr = new_addr
-            else:
-                l.warning(f"Parsed a block: {node} without a mapping for lineaddrs!")
-
-            for stmt in node_copy.statements:
-                new_addr = line_to_correct_map.get(stmt.source_line_number, None)
-                if new_addr is None:
-                    l.warning(f"Parsed a block statement: {stmt} without a mapping for lineaddrs!")
-                    continue
-
-                stmt.source_line_number = new_addr
-
-            nodes_map[node] = node_copy
-
-        new_graph.add_nodes_from(nodes_map.values())
-        for src, dst in cfg.edges:
-            new_graph.add_edge(nodes_map[src], nodes_map[dst])
-
-        new_cfgs[cfg_name] = new_graph
-
-    return new_cfgs
-
-
-def cfgs_from_source(filepath: Path, lift_cfgs=True):
-    filepath = Path(filepath).absolute()
-    cfgs = {}
-    with TemporaryDirectory() as tmpdir:
-        with WorkDirContext(tmpdir):
-            # run joern-parse, which will output a CPG into the same dir under
-            # the filename "cpg.bin"
-            ret = run(f"{JOERN_PARSE_PATH} {filepath}".split(), capture_output=True)
-            if ret.returncode != 0:
-                l.warning("Joern parse failed, stopping CFG extraction")
-                return None
-
-            # extract the CFGs into out_dir in the same directory
-            ret = run(f"{JOERN_EXPORT_PATH} --repr cfg --out out_dir".split(), capture_output=True)
-            if ret.returncode != 0:
-                l.warning("Joern Export failed, stopping CFG extraction")
-                return None
-
-            out_dir = Path("./out_dir")
-            cfg_files = list(out_dir.rglob("*.dot"))
-            for cfg_file in cfg_files:
-                cfg = cfg_from_dotfile(cfg_file.absolute())
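# Illustrative sketch (not from the original file): the .dot round-trip that
# cfg_from_dotfile performs can be exercised standalone with pygraphviz,
# roughly as follows:
#
#     import networkx as nx
#     import pygraphviz as pg
#     dot = 'digraph "main" { "1" -> "2"; }'
#     g = nx.nx_agraph.from_agraph(pg.AGraph(string=dot))
#     assert g.name == "main" and len(g.nodes) == 2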
if not cfg or not len(cfg.nodes): - continue - - if cfg.name in cfgs and len(cfg.nodes) < len(cfgs[cfg.name].nodes): - continue - - cfgs[cfg.name] = nx.DiGraph(cfg) #_unparsed_graph_to_supergraph(cfg) - - if lift_cfgs: - jil_cfgs = {} - for cfg_name, cfg in cfgs.items(): - jil_cfg = lift_graph(cfg) - jil_cfg.name = cfg_name - jil_cfgs[cfg_name] = to_jil_supergraph(jil_cfg) - - cfgs = jil_cfgs - - for _, cfg in cfgs.items(): - node_attrs = {} - edge_attrs = {} - for node in cfg.nodes: - node_attrs[node] = {'node': node} - for edgd in cfg.edges: - edge_attrs[edgd] = {'src': edgd[0], 'dst': edgd[1]} - - nx.set_node_attributes(cfg, node_attrs) - nx.set_edge_attributes(cfg, edge_attrs) - - return cfgs - - -def cfg_from_dotfile(filepath: Path): - filepath = Path(filepath).expanduser().absolute() - with open(filepath, "r") as fp: - data = fp.read() - - try: - graph = nx.nx_agraph.from_agraph(pg.AGraph(data)) - except Exception: - graph = None - - return graph - -# -# JIL Graph Helpers -# - - -def merge_jil_nodes(graph: DiGraph, node_a, node_b): - in_edges = list(graph.in_edges(node_a, data=True)) - out_edges = list(graph.out_edges(node_b, data=True)) - - new_node = node_a.copy() - old_node = node_b - new_node.statements += old_node.statements - - graph.remove_node(node_a) - graph.remove_node(node_b) - - if new_node is not None: - graph.add_node(new_node, node=new_node) - - for src, _, data in in_edges: - if src is node_b: - src = new_node - graph.add_edge(src, new_node, src=src, dst=new_node) - - for _, dst, data in out_edges: - if dst is node_a: - dst = new_node - graph.add_edge(new_node, dst, src=new_node, dst=dst) - - return new_node - - -def to_jil_supergraph(graph: DiGraph): - new_graph = DiGraph(graph) - while True: - for src, dst in new_graph.edges(): - if len(list(new_graph.successors(src))) == 1 and len(list(new_graph.predecessors(dst))) == 1: - if src is not dst: - merge_jil_nodes(new_graph, src, dst) - break - else: - break - - return new_graph - - -def save_as_png(cfg: nx.DiGraph, output_path: Path): - tmp_path = output_path.with_suffix(".dot") - nx.drawing.nx_agraph.write_dot(cfg, str(tmp_path)) - dot_src = graphviz.Source(open(tmp_path).read(), format="png") - dot_src.render(outfile=str(output_path.with_suffix(".png"))) - tmp_path.with_suffix(".gv").unlink() - tmp_path.unlink() - return output_path.with_suffix(".png") diff --git a/sailreval/joern/client.py b/sailreval/joern/client.py deleted file mode 100755 index eb672c7..0000000 --- a/sailreval/joern/client.py +++ /dev/null @@ -1,254 +0,0 @@ -import os -import logging -from typing import Set - -from cpgqls_client import CPGQLSClient, import_code_query -import networkx as nx -import pygraphviz as pg - -l = logging.getLogger(__name__) - - -class JoernClient: - def __init__(self, target_file, ip="localhost", port=9000, bin_name=None): - self.ip = ip - self.port = port - self.target_file = target_file - self.bin_name = bin_name or "" - - # connect and import target file - self.client = CPGQLSClient(f"{ip}:{port}") - self.client.execute(import_code_query(f"{self.target_file}", f"{os.path.basename(self.target_file)}")) - self.functions: Set[str] = self._get_functions_with_code() - - # cache - self.cfg_cache = {} - - # - # Public API - # - - def function_line_numbers(self): - out = self._exec_list_cmd("cpg.method.filter(node => node.lineNumber!=None&&node.lineNumberEnd!=None).map(node => (node.name, node.lineNumber.last, node.lineNumberEnd.last)).l") - if not out: - return {} - - if not isinstance(out, (list, tuple)): - return {} - - 
line_nums_by_func = {}
-        for data in out:
-            if not isinstance(data, (list, tuple)) or len(data) != 3:
-                continue
-
-            fn, ln_start, ln_end = data
-            if fn not in self.functions:
-                continue
-
-            line_nums_by_func[fn] = (ln_start, ln_end)
-
-        return line_nums_by_func
-
-    def function_headers(self):
-        headers = {}
-        for func in self.functions:
-            out = self._exec_list_cmd(f'cpg.method("{func}").code.l')
-            if not out:
-                continue
-
-            if not isinstance(out, (list, tuple)):
-                continue
-
-            headers[func] = out[0]
-
-        return headers
-
-    def functions_with_gotos(self):
-        out1 = self._exec_list_cmd(f'cpg.goto.method.name.l')
-        if not out1:
-            return []
-
-        if isinstance(out1, (list, tuple)) and len(out1) > 0:
-            return out1
-
-        return []
-
-    def functions_with_switches(self):
-        out1 = self._exec_list_cmd(f'cpg.switchBlock.method.name.l')
-        if not out1:
-            return []
-
-        if isinstance(out1, (list, tuple)) and len(out1) > 0:
-            return out1
-
-        return []
-
-    def get_func_cfg(self, func_name):
-        if func_name in self.cfg_cache:
-            return self.cfg_cache[func_name]
-
-        cfg = self._dump_func_cfg(func_name)
-        self.cfg_cache[func_name] = cfg
-
-        return cfg
-
-    def get_func_loc(self, func_name):
-        """
-        Get a function's lines-of-code (LOC) count.
-        """
-        if func_name not in self.functions:
-            return None
-
-        out1 = self._exec_list_cmd(f'cpg.method.name("{func_name}").lineNumber.l')
-        if out1 is None:
-            return None
-
-        out2 = self._exec_list_cmd(f'cpg.method.name("{func_name}").lineNumberEnd.l')
-        if out2 is None:
-            return None
-
-        start_line = out1[-1] if not isinstance(out1, int) else out1
-        end_line = out2[-1] if not isinstance(out2, int) else out2
-
-        try:
-            val = int(end_line) - int(start_line)
-        except Exception:
-            return None
-        return val
-
-    def function_ternary_counts(self):
-        count = {}
-        for func_name in self.functions:
-            out = self._exec_int_cmd(f'cpg.method("{func_name}").call.filter(_.name==".conditional").filter(_.code.contains("__builtin_unreachable")==false).size')
-            if out is None:
-                continue
-            count[func_name] = out
-
-        return count
-
-    def count_gotos(self, func_name):
-        if func_name not in self.functions:
-            return None
-
-        out = self._exec_int_cmd(f'cpg.method("{func_name}").goto.size')
-        if out is None:
-            return None
-
-        return out
-
-    def count_if_levels(self, func_name):
-        """
-        Return the maximum if-nesting level.
-        """
-        out = self._exec_list_cmd(
-            f'cpg.method("{func_name}").controlStructure.controlStructureType("IF").depth(_.isControlStructure).l'
-        )
-        if out is None or (isinstance(out, (list, tuple)) and len(out) == 0):
-            return None
-
-        # if_count = out if isinstance(out, int) else sum(out) // len(out)
-        if_count = out if isinstance(out, int) else max(out)
-        return if_count
-
-    def func_calls_in_func(self, func_name):
-        out = self._exec_list_cmd(
-            f'cpg.method("{func_name}").call.name.l'
-        )
-        if out is None:
-            return None
-
-        out = [out] if isinstance(out, str) else out
-        good_funcs = self._filter_blacklisted(out)
-        return good_funcs
-
-    def get_control_structure_conditions(self, func_name):
-        if func_name not in self.functions:
-            return None
-
-        out = self._exec_list_cmd(
-            f'cpg.method("{func_name}").controlStructure.condition.code.l'
-        )
-        if not isinstance(out, (tuple, list)):
-            return None
-
-        return list(out)
-
-    #
-    # Private Helpers
-    #
-
-    def _get_functions_with_code(self):
-        out = self._exec_list_cmd(
-            "cpg.method.filter(node => node.lineNumber!=None&&node.lineNumberEnd!=None&&node.lineNumber!=node.lineNumberEnd).name.l"
-        )
-        if not isinstance(out, (tuple, list)):
-            return
[] - - out = list(out) - return set(self._filter_blacklisted(out)) - - @staticmethod - def _filter_blacklisted(strings): - blacklist = [ - "<", "+", "*", "(", ">", "JUMPOUT", "__builtin_unreachable" - ] - good_strings = [] - for string in strings: - if not any(string.startswith(b) for b in blacklist): - good_strings.append(string) - - return good_strings - - def _exec_list_cmd(self, raw_command): - res = self.client.execute(f"show({raw_command})") - tuple_str = self._get_str_tuple(res, cmd=raw_command) - return tuple_str - - def _exec_int_cmd(self, raw_command): - res = self.client.execute(f"show({raw_command})") - int_val = self._get_str_int(res, cmd=raw_command) - if int_val is None or not isinstance(int_val, int): - return None - - return int_val - - def _get_str_int(self, req_res, cmd=None): - if 'stdout' not in req_res: - return None - - out = None - try: - raw_out = req_res['stdout'] - out = int(raw_out, 10) - except Exception as e: - l.warning(f"Error occurred doing JOERN eval for {self.bin_name} on {cmd} because {e}") - - return out - - def _get_str_tuple(self, req_res, cmd=None): - if 'stdout' not in req_res: - return None - - try: - raw_out = req_res['stdout'].split("List(")[-1] - str_tuple = eval("(" + raw_out) - except Exception as e: - l.warning(f"Error occurred doing JOERN eval for {self.bin_name} on {cmd} because {e}") - return None - - if type(str_tuple) == str: - str_tuple = [str_tuple] - - return str_tuple - - def _dump_func_cfg(self, func_name): - str_tuple = self._exec_list_cmd(f'cpg.method("{func_name}").dotCfg.l') - try: - graph = nx.nx_agraph.from_agraph(pg.AGraph(str_tuple[-1])) - except Exception as e: - l.warning(f"Error getting CFG from JOERN for {self.bin_name} on {func_name} as {e}") - graph = None - - return graph diff --git a/sailreval/joern/server.py b/sailreval/joern/server.py deleted file mode 100755 index e0eb719..0000000 --- a/sailreval/joern/server.py +++ /dev/null @@ -1,86 +0,0 @@ -import subprocess -from time import time, sleep -import os -import logging - -from cpgqls_client import CPGQLSClient -import psutil - -l = logging.getLogger(__name__) - - -class JoernServer: - def __init__(self, ip="localhost", port=9000): - self.ip = ip - self.port = port - - def __enter__(self): - self.start() - self.wait_for_server_start() - return self - - def __exit__(self, *args, **kwargs): - self.stop() - - def start(self): - # always kill proc of the same port before starting - joern_proc = self._find_joern_proc() - if joern_proc: - joern_proc.kill() - - from . 
import JOERN_SERVER_PATH
-        proc = subprocess.Popen([f'{JOERN_SERVER_PATH} --server --server-port {self.port} >> /tmp/t.tmp 2>&1 &'], shell=True)
-        return True
-
-    def wait_for_server_start(self, timeout=30):
-        start_time = time()
-        success = False
-        while time() - start_time < timeout:
-            if self._find_joern_proc():
-                try:
-                    CPGQLSClient(f"{self.ip}:{self.port}").execute("val a = 1")
-                except (ConnectionRefusedError, OSError):
-                    pass
-                else:
-                    success = True
-                    break
-
-            sleep(0.5)
-
-        if not success:
-            l.critical(f"Was unable to start the JOERN server on port {self.port} before timeout...")
-        else:
-            sleep(0.5)
-
-    def stop(self):
-        joern_proc = self._find_joern_proc()
-        if joern_proc:
-            joern_proc.kill()
-        else:
-            l.critical(f"Unable to kill the Joern server on port {self.port} because it is dead or its PID changed")
-
-        try:
-            os.unlink("t.tmp")
-        except Exception:
-            pass
-
-    def reboot(self):
-        self.stop()
-        self.start()
-        self.wait_for_server_start()
-
-    def _find_joern_proc(self):
-        for proc in psutil.process_iter():
-            try:
-                cmd = " ".join(proc.cmdline())
-            except Exception as e:
-                continue
-
-            if "java" in cmd and "joern" in cmd and f"--server-port {self.port}" in cmd:
-                break
-        else:
-            proc = None
-
-        return proc
-
diff --git a/sailreval/metrics/ged_to_source.py b/sailreval/metrics/ged_to_source.py
index 4fbc6f6..e000db8 100644
--- a/sailreval/metrics/ged_to_source.py
+++ b/sailreval/metrics/ged_to_source.py
@@ -1,21 +1,11 @@
 import logging
-import shutil
-from multiprocessing import Pool
-import re
 from pathlib import Path
-import random
-
-from ..joern import JoernClient, JoernServer
-from ..joern.cfg.cfged import cfg_edit_distance as _cfg_edit_distance
-from ..utils import timeout
-from ..joern.cfg.utils import find_function_root_node, correct_decompiler_mappings
-from ..joern.cfg.ged import ged_max, ged_upperbound, ged_exact
-
-from tqdm import tqdm
-import networkx as nx
 from typing import Dict
-from ..utils.binary_debug_info import gen_dwarf_addr_to_line_map, read_line_maps
+from pyjoern.mapping import correct_decompiler_mappings, read_line_maps
+from cfgutils.similarity import ged_max, ged_upperbound, ged_exact
+from cfgutils.similarity import cfg_edit_distance as _cfg_edit_distance
+import networkx as nx
 
 l = logging.getLogger(__name__)
 
 MAX_EXACT_NODES = 12
diff --git a/sailreval/utils/compile.py b/sailreval/utils/compile.py
index 21fc098..e4b98f6 100644
--- a/sailreval/utils/compile.py
+++ b/sailreval/utils/compile.py
@@ -10,9 +10,10 @@
 import traceback
 
 import toml
+from tqdm import tqdm
+from pyjoern import JoernClient, JoernServer
 
 from .sailr_target import SAILRTarget
-from ..joern import JoernClient, JoernServer
 from ..utils import timeout
 from . import (
     bcolors, WorkDirContext, SAILR_COMPILATION_RESULTS_DIR, SAILR_DECOMPILATION_RESULTS_DIR,
@@ -20,8 +21,6 @@
 )
 from ..
import SAILR_METRICS -from tqdm import tqdm - l = logging.getLogger(__name__) # diff --git a/scripts/run_cfged_on_file.py b/scripts/run_cfged_on_file.py index 5e38ff0..429b22c 100755 --- a/scripts/run_cfged_on_file.py +++ b/scripts/run_cfged_on_file.py @@ -3,9 +3,12 @@ import argparse from pathlib import Path +from cfgutils.file_formats import save_cfg_as_png +from pyjoern import fast_cfgs_from_source +from pyjoern.mapping import correct_source_cfg_addrs + from sailreval import SAILR_DECOMPILERS, ALL_DECOMPILERS from sailreval.metrics.ged_to_source import compute_cfg_edit_distance, ged_upperbound_score -from sailreval.joern.cfg.utils import cfgs_from_source, correct_source_cfg_addrs, save_as_png if __name__ == "__main__": @@ -40,23 +43,23 @@ source_filepath = dec_dir_path / source_filename binary_path = dec_dir_path / f"{basename}.o" linemaps_path = source_filepath.with_suffix(".linemaps") - extracted_cfgs = cfgs_from_source(source_filepath) + extracted_cfgs = fast_cfgs_from_source(source_filepath) source_cfgs = correct_source_cfg_addrs( extracted_cfgs, linemaps_path, ) if args.save_png: output_file = dec_dir_path / f"source_{basename}_{function}.png" - save_as_png(source_cfgs[function], output_file) + save_cfg_as_png(source_cfgs[function], output_file) for decompiler in args.decs: filename = f"{decompiler}_{basename}.c" filepath = dec_dir_path / filename - dec_cfgs = cfgs_from_source(filepath) + dec_cfgs = fast_cfgs_from_source(filepath) func_cfg = dec_cfgs[function] dist = compute_cfg_edit_distance(func_cfg, source_cfgs[function], function, binary_path, decompiler) upper_ged = ged_upperbound_score(function, None, source_cfgs=source_cfgs, dec_cfgs=dec_cfgs) print(f"Decompiler: {decompiler} Target: {basename} - {function} | CFGED: {dist} | O-GED: {upper_ged}") if args.save_png: - save_as_png(func_cfg, dec_dir_path / f"{decompiler}_{basename}_{function}.png") + save_cfg_as_png(func_cfg, dec_dir_path / f"{decompiler}_{basename}_{function}.png") diff --git a/setup.cfg b/setup.cfg index a28bd8f..256eb8f 100644 --- a/setup.cfg +++ b/setup.cfg @@ -20,7 +20,6 @@ install_requires = toml psutil tqdm - cpgqls-client docker matplotlib numpy @@ -28,11 +27,9 @@ install_requires = scipy seaborn pyelftools + cfgutils==1.3.0 + pyjoern==1.2.18.3 python_requires = >= 3.6 packages = find: include_package_data = True - -[options.package_data] -* = joern/bin/* - diff --git a/setup.py b/setup.py index ab4bdde..6068493 100755 --- a/setup.py +++ b/setup.py @@ -1,70 +1,3 @@ -# pylint: disable=missing-class-docstring -import platform -import urllib.request -from pathlib import Path -import sys -from distutils.util import get_platform -from distutils.command.build import build as st_build -from subprocess import run - from setuptools import setup -from setuptools.command.develop import develop as st_develop - - -def _download_joern_zipfile(save_location: Path) -> Path: - url = "https://github.com/joernio/joern/releases/download/v1.2.18/joern-cli.zip" - with urllib.request.urlopen(url) as response: - if response.status != 200: - raise Exception(f"HTTP error {response.status}: {response.reason}") - - with open(save_location, 'wb') as f: - while True: - chunk = response.read(8192) - if not chunk: - break - f.write(chunk) - - return save_location - - -def _download_joern(): - joern_bin_dir = Path("sailreval/joern/bin/").absolute() - joern_binary = joern_bin_dir / "joern" - if joern_binary.exists(): - return - - # download joern - joern_zip_file = _download_joern_zipfile(joern_bin_dir / "joern-cli.zip") - # unzip joern - 
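# Illustrative sketch (not from the original file): the same unzip step could
# be done portably with the standard library instead of shelling out to the
# external `unzip` binary:
#
#     import zipfile
#     with zipfile.ZipFile(joern_zip_file) as zf:
#         zf.extractall(joern_bin_dir)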
run(["unzip", str(joern_zip_file)], cwd=str(joern_bin_dir)) - # remove zip file - joern_zip_file.unlink() - - -class build(st_build): - def run(self, *args): - self.execute(_download_joern, (), msg="Downloading Joern from GitHub...") - super().run(*args) - - -class develop(st_develop): - def run(self, *args): - self.execute(_download_joern, (), msg="Downloading Joern from GitHub...") - super().run(*args) - - -cmdclass = { - "build": build, - "develop": develop, -} - -if 'bdist_wheel' in sys.argv and '--plat-name' not in sys.argv: - sys.argv.append('--plat-name') - name = get_platform() - if 'linux' in name: - sys.argv.append('manylinux2014_' + platform.machine()) - else: - # https://www.python.org/dev/peps/pep-0425/ - sys.argv.append(name.replace('.', '_').replace('-', '_')) -setup(cmdclass=cmdclass) +setup() diff --git a/setup.sh b/setup.sh index 0ca623b..a7537dd 100755 --- a/setup.sh +++ b/setup.sh @@ -20,20 +20,20 @@ else fi if [ -e /etc/debian_version ] then - $SUDO apt-get install -y graphviz-dev openjdk-17-jdk unzip + $SUDO apt-get install -y graphviz-dev openjdk-19-jdk unzip elif [ $IS_MACOS -eq 1 ] then if ! which brew > /dev/null; then error "You must have homebrew installed for MacOS installs." fi - brew install graphviz-dev openjdk@17 unzip + brew install graphviz-dev openjdk@19 unzip else error "System is unknown, please install graphviz-dev on your system!" fi echo "Installing the sailreval Python package locally..." -pip3 install -e . +pip3 install -e . && pyjoern --install # build docker image echo "Building docker image (~6gb)..." diff --git a/tests/test_cfged.py b/tests/test_cfged.py index e0a755c..468369a 100644 --- a/tests/test_cfged.py +++ b/tests/test_cfged.py @@ -3,16 +3,15 @@ import os from sailreval.utils.binary_debug_info import read_line_maps, gen_dwarf_addr_to_line_map, dump_dwarf_addr_to_line_map -from sailreval.joern.cfg.utils import cfgs_from_source, correct_source_cfg_addrs, correct_decompiler_mappings -from sailreval.joern.cfg.ged import graph_edit_distance_core_analysis, ged_upperbound -from sailreval.joern.cfg.cfged import cfg_edit_distance from sailreval.metrics.ged_to_source import compute_cfg_edit_distance from sailreval.decompilers.angr_dec import angr_decompile from sailreval.decompilers.ida_dec import ida_decompile from sailreval import SAILR_DECOMPILERS import unittest -import networkx as nx +from pyjoern import fast_cfgs_from_source +from pyjoern.mapping import correct_source_cfg_addrs +from cfgutils.similarity import graph_edit_distance_core_analysis, ged_upperbound FILES_PATH = Path(os.path.join(os.path.dirname(os.path.realpath(__file__)), "./cfged/")) DECOMPILERS = { @@ -24,7 +23,7 @@ def extract_cfgs_for_decompiler(source_dir: Path, binary_name, decompiler, function=None): source_dir = Path(source_dir).absolute() - out = cfgs_from_source(source_dir.joinpath(f"{decompiler}_{binary_name}.c")) + out = fast_cfgs_from_source(source_dir.joinpath(f"{decompiler}_{binary_name}.c")) if decompiler == SAILR_DECOMPILERS.SOURCE_CODE: linemaps = source_dir.joinpath(f"source_{binary_name}.linemaps") @@ -105,5 +104,4 @@ def test_cfged_on_large_functions(self): if __name__ == "__main__": - #unittest.main(argv=sys.argv) - TestCFGED().test_cfged_on_large_functions() + unittest.main(argv=sys.argv) diff --git a/tests/test_readability.py b/tests/test_readability.py index ddfb8ad..4e38d87 100644 --- a/tests/test_readability.py +++ b/tests/test_readability.py @@ -3,13 +3,15 @@ import os from sailreval.utils.binary_debug_info import read_line_maps, 
gen_dwarf_addr_to_line_map, dump_dwarf_addr_to_line_map -from sailreval.joern.cfg.utils import cfgs_from_source, correct_source_cfg_addrs, correct_decompiler_mappings from sailreval.metrics.ged_to_source import compute_cfg_edit_distance from sailreval.decompilers.angr_dec import angr_decompile from sailreval.decompilers.ida_dec import ida_decompile from sailreval import SAILR_DECOMPILERS import unittest +from pyjoern import fast_cfgs_from_source +from pyjoern.mapping import correct_source_cfg_addrs + FILES_PATH = Path(os.path.join(os.path.dirname(os.path.realpath(__file__)), "./cfged/")) DECOMPILERS = { @@ -25,7 +27,7 @@ def extract_cfgs_for_decompiler(source_dir: Path, binary_name, decompiler, function=None): source_dir = Path(source_dir).absolute() - out = cfgs_from_source(source_dir.joinpath(f"{decompiler}_{binary_name}.c")) + out = fast_cfgs_from_source(source_dir.joinpath(f"{decompiler}_{binary_name}.c")) if decompiler == SAILR_DECOMPILERS.SOURCE_CODE: linemaps = source_dir.joinpath(f"source_{binary_name}.linemaps")
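After this patch, CFG extraction and Joern interaction go through the
published PyJoern and cfgutils packages instead of the vendored
sailreval.joern tree. A minimal usage sketch assembled only from calls that
appear in the diffs above; the file paths are illustrative placeholders:

    from pyjoern import fast_cfgs_from_source
    from pyjoern.mapping import correct_source_cfg_addrs

    # extract per-function CFGs from a C file, then rebase block addresses
    # onto source line numbers using the matching .linemaps file
    cfgs = fast_cfgs_from_source("source_example.c")
    cfgs = correct_source_cfg_addrs(cfgs, "source_example.linemaps")
    print(sorted(cfgs))  # names of the functions with extracted CFGs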