Skip to content

Commit

Permalink
Add support for JVM based projects (#236)
Browse files Browse the repository at this point in the history
This PR attempts to add support in the base logic for JVM based
projects.

---------

Signed-off-by: Arthur Chan <[email protected]>
  • Loading branch information
arthurscchan authored May 17, 2024
1 parent 181a382 commit c78a3a6
Show file tree
Hide file tree
Showing 23 changed files with 838 additions and 86 deletions.
21 changes: 21 additions & 0 deletions benchmark-sets/jvm/fuzzywuzzy.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
"functions":
- "name": "[me.xdrop.fuzzywuzzy.algorithms.DefaultStringProcessor].process(java.lang.String)"
"params":
- "name": "var_String"
"type": "java.lang.String"
"return_type": "void"
"signature": "[me.xdrop.fuzzywuzzy.algorithms.DefaultStringProcessor].process(java.lang.String)"
- "name": "[me.xdrop.diffutils.DiffUtils].levEditDistance(java.lang.String,java.lang.String,int)"
"params":
- "name": "var_String"
"type": "java.lang.String"
- "name": "var_String"
"type": "java.lang.String"
- "name": "var_int"
"type": "int"
"return_type": "void"
"signature": "[me.xdrop.diffutils.DiffUtils].levEditDistance(java.lang.String,java.lang.String,int)"
"language": "jvm"
"project": "fuzzywuzzy"
"target_name": "DiffUtilsFuzzer"
"target_path": "/src/DiffUtilsFuzzer.java"
23 changes: 23 additions & 0 deletions benchmark-sets/jvm/jsemver.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
"functions":
- "name": "[com.github.zafarkhaja.semver.Version].tryParse(java.lang.String,boolean)"
"params":
- "name": "var_String"
"type": "java.lang.String"
- "name": "var_boolean"
"type": "boolean"
"return_type": "void"
"signature": "[com.github.zafarkhaja.semver.Version].tryParse(java.lang.String,boolean)"
- "name": "[com.github.zafarkhaja.semver.Version].of(long,long,long)"
"params":
- "name": "var_long"
"type": "long"
- "name": "var_long"
"type": "long"
- "name": "var_long"
"type": "long"
"return_type": "void"
"signature": "[com.github.zafarkhaja.semver.Version].of(long,long,long)"
"language": "jvm"
"project": "jsemver"
"target_name": "VersionFuzzer"
"target_path": "/src/VersionFuzzer.java"
17 changes: 17 additions & 0 deletions benchmark-sets/jvm/json-java.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
"functions":
- "name": "[org.json.JSONArray].putAll(java.lang.String[])"
"params":
- "name": "var_Iterable"
"type": "java.lang.String[]"
"return_type": "void"
"signature": "[org.json.JSONArray].putAll(java.lang.String[])"
- "name": "[org.json.JSONArray].putAll(int[])"
"params":
- "name": "var_Iterable"
"type": "int[]"
"return_type": "void"
"signature": "[org.json.JSONArray].putAll(int[])"
"language": "jvm"
"project": "json-java"
"target_name": "JsonJavaFuzzer"
"target_path": "/src/JsonJavaFuzzer.java"
23 changes: 23 additions & 0 deletions benchmark-sets/jvm/sqlite-jdbc.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
"functions":
- "name": "[org.sqlite.date.DateFormatUtils].format(long,java.lang.String)"
"params":
- "name": "var_long"
"type": "long"
- "name": "var_String"
"type": "java.lang.String"
"return_type": "void"
"signature": "[org.sqlite.date.DateFormatUtils].format(long,java.lang.String)"
- "name": "[org.sqlite.date.DateFormatUtils].formatUTC(long,java.lang.String,java.util.Locale)"
"params":
- "name": "var_long"
"type": "long"
- "name": "var_String"
"type": "java.lang.String"
- "name": "var_Locale"
"type": "java.util.Locale"
"return_type": "void"
"signature": "[org.sqlite.date.DateFormatUtils].formatUTC(long,java.lang.String,java.util.Locale)"
"language": "jvm"
"project": "sqlite-jdbc"
"target_name": "SqliteConnectionFuzzer"
"target_path": "/src/SqliteConnectionFuzzer.java"
17 changes: 17 additions & 0 deletions benchmark-sets/jvm/twitter4j.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
"functions":
- "name": "[twitter4j.OAuth2Token].equals(java.lang.Object)"
"params":
- "name": "var_Object"
"type": "java.lang.Object"
"return_type": "void"
"signature": "[twitter4j.OAuth2Token].equals(java.lang.Object)"
- "name": "[twitter4j.TimeSpanConverter].toTimeSpanString(long)"
"params":
- "name": "var_long"
"type": "long"
"return_type": "void"
"signature": "[twitter4j.TimeSpanConverter].toTimeSpanString(long)"
"language": "jvm"
"project": "twitter4j"
"target_name": "TwitterObjectFactoryFuzzer"
"target_path": "/src/TwitterObjectFactoryFuzzer.java"
55 changes: 46 additions & 9 deletions data_prep/introspector.py
Original file line number Diff line number Diff line change
Expand Up @@ -336,10 +336,28 @@ def _get_clean_arg_types(function: dict, project: str) -> list[str]:
return [clean_type(arg_type) for arg_type in raw_arg_types]


def _get_arg_names(function: dict, project: str) -> list[str]:
def _get_arg_count(function: dict) -> int:
"""Count the number of arguments for this function."""
raw_arg_types = (function.get('arg-types') or
function.get('function_arguments', []))
return len(raw_arg_types)


def _get_arg_names(function: dict, project: str, language: str) -> list[str]:
"""Returns the function argument names."""
arg_names = (function.get('arg-names') or
function.get('function_argument_names', []))
if language == 'jvm':
# The fuzz-introspector front end for JVM projects cannot recover the original
# argument names, so var_{argument_type} is used as a placeholder argument
# name instead. Some argument types are fully-qualified Java class names that
# contain [] and ., which are not allowed in Java variable names; the package
# prefix and [] are dropped to form the temporary argument name for reference.
jvm_args = _get_clean_arg_types(function, project)
arg_names = [
f'var_{name.split(".")[-1].replace("[]", "")}' for name in jvm_args
]
else:
arg_names = (function.get('arg-names') or
function.get('function_argument_names', []))
if not arg_names:
logging.error(
'Missing argument names in project: %s\n'
Expand All @@ -351,6 +369,9 @@ def _get_arg_names(function: dict, project: str) -> list[str]:
def get_function_signature(function: dict, project: str) -> str:
"""Returns the function signature."""
function_signature = function.get('function_signature', '')
if function_signature == "N/A":
# For JVM projects, the full function signature is the raw function name.
return get_raw_function_name(function, project)
if not function_signature:
logging.error(
'Missing function signature in project: %s\n'
Expand Down Expand Up @@ -392,10 +413,18 @@ def populate_benchmarks_using_introspector(project: str, language: str,
logging.error('No functions found using the oracles: %s', target_oracles)
return []

filenames = [
os.path.basename(function['function_filename']) for function in functions
]
result = project_src.search_source(project, filenames)
if language == 'jvm':
filenames = [
f'{function["function_filename"].split("$")[0].replace(".", "/")}.java'
for function in functions
]
else:
filenames = [
os.path.basename(function['function_filename'])
for function in functions
]

result = project_src.search_source(project, filenames, language)
if not result:
return []

Expand All @@ -405,17 +434,25 @@ def populate_benchmarks_using_introspector(project: str, language: str,
logging.error('No fuzz target found in project %s.', project)
return []
logging.info('Fuzz target file found for project %s: %s', project, harness)

target_name = get_target_name(project, harness)
logging.info('Fuzz target binary found for project %s: %s', project,
target_name)

potential_benchmarks = []
for function in functions:
if _get_arg_count(function) == 0:
# Skip functions / methods that do not take any arguments.
# Such functions / methods are not fuzz-worthy.
continue

filename = os.path.basename(function['function_filename'])
if filename not in [os.path.basename(i) for i in interesting]:
if filename not in [os.path.basename(i) for i in interesting.keys()]:
# TODO: Bazel messes up paths to include "/proc/self/cwd/..."
# Ignore jvm project for this checking.
logging.error('error: %s %s', filename, interesting.keys())
continue

function_signature = get_function_signature(function, project)
if not function_signature:
continue
Expand All @@ -429,7 +466,7 @@ def populate_benchmarks_using_introspector(project: str, language: str,
_get_clean_return_type(function, project),
_group_function_params(
_get_clean_arg_types(function, project),
_get_arg_names(function, project)),
_get_arg_names(function, project, language)),
harness,
target_name,
function_dict=function))
Expand Down
58 changes: 38 additions & 20 deletions data_prep/project_src.py
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,7 @@ def _format_source(src_file: str) -> str:
if os.path.isfile(src_file):
return _read_harness(src_file) or _read_harness(src_file, 'ignore') or ''
logging.warning('Failed to find file: %s', src_file)

return ''


Expand All @@ -119,11 +120,16 @@ def _get_interesting_file(src_file: str, out: str) -> tuple[str, str]:
return short_path, content


def _get_harness(src_file: str, out: str) -> tuple[str, str]:
def _get_harness(src_file: str, out: str, language: str) -> tuple[str, str]:
"""Returns the path name and content of harness."""

content = _format_source(src_file)
if 'int LLVMFuzzerTestOneInput' not in content:

if language == 'c++' and 'int LLVMFuzzerTestOneInput' not in content:
return '', ''
if language == 'jvm' and 'static void fuzzerTestOneInput' not in content:
return '', ''

short_path = src_file[len(out):]
return short_path, content

Expand Down Expand Up @@ -269,15 +275,16 @@ def _copy_project_src_from_local(project: str, out: str):
logging.info('Done copying %s /src to %s.', project, out)


def _identify_fuzz_targets(
out: str, interesting_filenames: list[str]) -> tuple[list[str], list[str]]:
def _identify_fuzz_targets(out: str, interesting_filenames: list[str],
language: str) -> tuple[list[str], list[str]]:
"""
Identifies fuzz target file contents and |interesting_filenames| in |out|.
"""
logging.debug('len(interesting_filenames): %d', len(interesting_filenames))

interesting_filepaths = []
potential_harnesses = []

for root, _, filenames in os.walk(out):
is_bad = False
for ignore_dir in SEARCH_IGNORE_DIRS:
Expand All @@ -291,22 +298,31 @@ def _identify_fuzz_targets(
if not benchmark.get_file_type(filename):
continue
path = os.path.join(root, filename)
short_path = path[len(out):]
if short_path in interesting_filenames:
interesting_filepaths.append(path)
# TODO(dongge): Figure out why the path does not match for Bazel projects.
if os.path.basename(short_path) in interesting_filenames:
interesting_filepaths.append(path)
# This should also include .cpp and .cc but exclude headers which
# usually don't contain fuzzer definitions.
if '.c' in path:
potential_harnesses.append(path)
if language == 'jvm':
# For JVM
if path.endswith(tuple(interesting_filenames)):
interesting_filepaths.append(path)
if path.endswith('.java'):
potential_harnesses.append(path)
else:
# For C/C++
short_path = path[len(out):]
if short_path in interesting_filenames:
interesting_filepaths.append(path)
# TODO(dongge): Figure out why the path does not match Bazel projects.
if os.path.basename(short_path) in interesting_filenames:
interesting_filepaths.append(path)
# This should also include .cpp and .cc but exclude headers which
# usually don't contain fuzzer definitions.
if '.c' in path:
potential_harnesses.append(path)

return potential_harnesses, interesting_filepaths


def _parse_fuzz_targets(
project: str, out: str, potential_harnesses: list[str],
interesting_filepaths: list[str]) -> tuple[dict[str, str], dict[str, str]]:
def _parse_fuzz_targets(project: str, out: str, potential_harnesses: list[str],
interesting_filepaths: list[str],
language: str) -> tuple[dict[str, str], dict[str, str]]:
"""
Parses fuzz target file contents and |interesting_filenames| in |out|.
"""
Expand All @@ -319,7 +335,7 @@ def _parse_fuzz_targets(

fuzz_targets = {}
for harness in potential_harnesses:
short_path, content = _get_harness(harness, out)
short_path, content = _get_harness(harness, out, language)
if short_path == content == '':
continue
fuzz_targets[short_path] = content
Expand Down Expand Up @@ -352,6 +368,7 @@ def _copy_fuzz_targets(harness_path: str, dest_dir: str, project: str):
def search_source(
project: str,
interesting_filenames: list,
language: str,
result_dir: str = '',
cloud_experiment_bucket: str = '',
) -> tuple[Dict[str, str], Dict[str, str]]:
Expand All @@ -363,10 +380,11 @@ def search_source(
os.makedirs(out)

_copy_project_src(project, out, cloud_experiment_bucket)

potential_harnesses, interesting_filepaths = _identify_fuzz_targets(
out, interesting_filenames)
out, interesting_filenames, language)
fuzz_targets, interesting_files = _parse_fuzz_targets(
project, out, potential_harnesses, interesting_filepaths)
project, out, potential_harnesses, interesting_filepaths, language)

for short_path in fuzz_targets.keys():
_copy_fuzz_targets(os.path.join(out, short_path[1:]), result_dir, project)
Expand Down
19 changes: 16 additions & 3 deletions data_prep/project_targets.py
Original file line number Diff line number Diff line change
Expand Up @@ -131,6 +131,7 @@ def _bucket_match_target_content_signatures(


def generate_data(project_name: str,
language: str,
sig_per_target: int = 1,
max_samples: int = 1,
cloud_experiment_bucket: str = ''):
Expand All @@ -148,7 +149,7 @@ def generate_data(project_name: str,
f'from Google Cloud Bucket: {OSS_FUZZ_EXP_BUCKET}.')
print('Will try to build from Google Cloud or local docker image.')
target_content_signature_dict = _match_target_content_signatures(
target_funcs, project_name, cloud_experiment_bucket)
target_funcs, project_name, language, cloud_experiment_bucket)
if not target_content_signature_dict:
return []

Expand Down Expand Up @@ -200,6 +201,7 @@ def filter_target_lines(target_content: str) -> str:
def _match_target_content_signatures(
target_funcs: Dict[str, List[Dict]],
project_name: str,
language: str,
cloud_experiment_bucket: str = '') -> Dict[str, List[str]]:
"""Returns a dictionary with function signatures as keys and their fuzz
target contents as values."""
Expand All @@ -208,7 +210,9 @@ def _match_target_content_signatures(
return {}

source_content = project_src.search_source(
project_name, [], cloud_experiment_bucket=cloud_experiment_bucket)
project_name, [],
language,
cloud_experiment_bucket=cloud_experiment_bucket)

if not source_content[0]:
print(f'Error: No fuzz target found for project {project_name}.')
Expand Down Expand Up @@ -296,6 +300,12 @@ def _parse_arguments():
default='',
help='A gcloud bucket to store experiment files.')

parser.add_argument('-l',
'--language',
type=str,
default='c++',
help='Language of projects.')

parsed_args = parser.parse_args()
if not parsed_args.result_path:
parsed_args.result_path = f'{parsed_args.project_name}.json'
Expand All @@ -305,9 +315,11 @@ def _parse_arguments():
def _generate_project_training_data(project_name: str,
sig_per_target,
max_samples,
language,
cloud_experiment_bucket: str = ''):
"""Generate project training data."""
try:
return generate_data(project_name, sig_per_target, max_samples,
return generate_data(project_name, language, sig_per_target, max_samples,
cloud_experiment_bucket)
except Exception as e:
print(f'Project {project_name} failed:\n{e}')
Expand All @@ -333,6 +345,7 @@ def main():
project,
sig_per_target,
max_samples,
args.language,
args.cloud_experiment_bucket,
] for project in all_projects]
with ThreadPool(num_threads) as p:
Expand Down
Loading

0 comments on commit c78a3a6

Please sign in to comment.