From 5abcf3943e168277d7480e5fb64bb36d8786c3de Mon Sep 17 00:00:00 2001 From: Jiradet Ounjai Date: Tue, 3 May 2022 11:56:26 +0700 Subject: [PATCH 1/3] implement target fuzzing --- common/experiment_utils.py | 6 + common/target_fuzzing_utils.py | 151 ++++++++++++++++++ database/models.py | 13 ++ experiment/dispatcher.py | 8 +- experiment/measurer/coverage_utils.py | 4 +- experiment/measurer/measure_manager.py | 110 +++++++++---- .../runner-startup-script-template.sh | 2 + experiment/run_experiment.py | 16 +- experiment/runner.py | 19 ++- experiment/scheduler.py | 11 +- 10 files changed, 306 insertions(+), 34 deletions(-) create mode 100644 common/target_fuzzing_utils.py diff --git a/common/experiment_utils.py b/common/experiment_utils.py index 2bbd64837..ab6684e43 100644 --- a/common/experiment_utils.py +++ b/common/experiment_utils.py @@ -78,6 +78,12 @@ def get_custom_seed_corpora_filestore_path(): 'custom_seed_corpora') +def get_target_fuzzing_corpora_filestore_path(): + """Returns path containing seed corpora for the target fuzzing experiment.""" + return posixpath.join(get_experiment_filestore_path(), + 'target-fuzzing-corpora') + + def get_dispatcher_instance_name(experiment: str) -> str: """Returns a dispatcher instance name for an experiment.""" return 'd-%s' % experiment diff --git a/common/target_fuzzing_utils.py b/common/target_fuzzing_utils.py new file mode 100644 index 000000000..15d5330cb --- /dev/null +++ b/common/target_fuzzing_utils.py @@ -0,0 +1,151 @@ +import random +import os +import zipfile +import tempfile +import tarfile +import multiprocessing +import itertools +from typing import List + +from common import experiment_utils +from common import filesystem +from experiment.measurer import coverage_utils +from experiment.measurer import run_coverage +from database import utils as db_utils +from database import models +from common import logs +from common import benchmark_utils +from experiment.build import build_utils +from common import 
experiment_path as exp_path + +MAX_CORPUS_FILES = 5 + + +def get_covered_branches_per_function(coverage_info): + function_coverage_info = coverage_info["data"][0]["functions"] + covered_branches = set([]) + for function in function_coverage_info: + function_name = function["name"] + for branch in function["branches"]: + if branch[4]: + coverage_key = "{} {}:{}-{}:{} T".format( + function_name, branch[0], branch[1], branch[2], branch[3]) + covered_branches.add(coverage_key) + if branch[5]: + coverage_key = "{} {}:{}-{}:{} F".format( + function_name, branch[0], branch[1], branch[2], branch[3]) + covered_branches.add(coverage_key) + return covered_branches + + +def get_covered_branches(coverage_binary, corpus_dir): + with tempfile.TemporaryDirectory() as tmp_dir: + profdata_file = os.path.join(tmp_dir, 'data.profdata') + merged_profdata_file = os.path.join(tmp_dir, 'merged.profdata') + merged_summary_json_file = os.path.join(tmp_dir, 'merged.json') + crashes_dir = os.path.join(tmp_dir, 'crashes') + filesystem.create_directory(crashes_dir) + + run_coverage.do_coverage_run(coverage_binary, corpus_dir, profdata_file, + crashes_dir) + coverage_utils.merge_profdata_files([profdata_file], + merged_profdata_file) + coverage_utils.generate_json_summary(coverage_binary, + merged_profdata_file, + merged_summary_json_file, + summary_only=False) + coverage_info = coverage_utils.get_coverage_infomation( + merged_summary_json_file) + return get_covered_branches_per_function(coverage_info) + + +def main_loop(benchmarks: List[str], num_trials: int): + pool_args = () + with multiprocessing.Pool(*pool_args) as pool: + target_coverage_list = pool.starmap( + setup_fuzzing_target, + [(benchmark, num_trials) for benchmark in benchmarks]) + target_coverage = list(itertools.chain(*target_coverage_list)) + logs.info('Done Preparing target fuzzing (total %d target)', + len(target_coverage)) + db_utils.bulk_save(target_coverage) + + +def setup_fuzzing_target(benchmark: str, num_trials: int): + 
with tempfile.TemporaryDirectory() as tmp_dir: + coverage_binaries_dir = build_utils.get_coverage_binaries_dir() + archive_name = 'coverage-build-%s.tar.gz' % benchmark + archive_filestore_path = exp_path.filestore(coverage_binaries_dir / + archive_name) + filesystem.copy(archive_filestore_path, tmp_dir) + archive_path = os.path.join(tmp_dir, archive_name) + tar = tarfile.open(archive_path, 'r:gz') + tar.extractall(tmp_dir) + os.remove(archive_path) + coverage_binary = os.path.join( + tmp_dir, benchmark_utils.get_fuzz_target(benchmark)) + return prepare_target_fuzzing_corpus(benchmark, num_trials, + coverage_binary) + + +def prepare_target_fuzzing_corpus(benchmark: str, num_trials: int, + coverage_binary: str): + """Prepare corpus for target fuzzing.""" + + target_coverage = [] + + # path used to store and feed seed corpus for benchmark runner + # each trial group will have the same seed input(s) + target_fuzzing_benchmark = os.path.join( + experiment_utils.get_target_fuzzing_corpora_filestore_path(), benchmark) + filesystem.create_directory(target_fuzzing_benchmark) + + # randomly pick from custom seed corpus + corpus_archive_filename = os.path.join( + experiment_utils.get_custom_seed_corpora_filestore_path(), + f'{benchmark}.zip') + with tempfile.TemporaryDirectory() as tmp_dir: + with zipfile.ZipFile(corpus_archive_filename) as zip_file: + # only consider file not directory + corpus_files = [ + f for f in zip_file.infolist() if not f.filename.endswith('/') + ] + for trial_group_num in range(num_trials): + logs.info('Preparing target fuzzing: %s, trial_group: %d', + benchmark, trial_group_num) + + trial_group_subdir = 'trial-group-%d' % trial_group_num + target_fuzzing_trial_dir = os.path.join( + target_fuzzing_benchmark, trial_group_subdir) + src_dir = os.path.join(tmp_dir, "source") + dest_dir = os.path.join(tmp_dir, "dest") + filesystem.recreate_directory(src_dir) + filesystem.recreate_directory(dest_dir) + + source_files = random.sample(corpus_files, 
MAX_CORPUS_FILES) + for file in source_files: + zip_file.extract(file, src_dir) + + dest_files = random.sample(corpus_files, MAX_CORPUS_FILES) + for file in dest_files: + zip_file.extract(file, dest_dir) + + src_branches = get_covered_branches(coverage_binary, src_dir) + dest_branches = get_covered_branches(coverage_binary, dest_dir) + target_branches = dest_branches - src_branches + + if not target_branches: + raise RuntimeError( + 'Unable to find target branches for %s.' % benchmark) + + for branch in target_branches: + target_cov = models.TargetCoverage() + target_cov.trial_group_num = int(trial_group_num) + target_cov.benchmark = benchmark + target_cov.target_location = branch + target_coverage.append(target_cov) + + # copy only the src directory + filesystem.copytree(src_dir, target_fuzzing_trial_dir) + + return target_coverage diff --git a/database/models.py b/database/models.py index 7cf902397..848849df9 100644 --- a/database/models.py +++ b/database/models.py @@ -50,6 +50,7 @@ class Trial(Base): benchmark = Column(String, nullable=False) time_started = Column(DateTime(), nullable=True) time_ended = Column(DateTime(), nullable=True) + trial_group_num = Column(Integer, nullable=True) # Columns used for preemptible experiments. 
preemptible = Column(Boolean, default=False, nullable=False) @@ -71,6 +72,8 @@ class Snapshot(Base): trial_id = Column(Integer, ForeignKey('trial.id'), primary_key=True) trial = sqlalchemy.orm.relationship('Trial', back_populates='snapshots') edges_covered = Column(Integer, nullable=False) + targets_covered = Column(Integer, nullable=False) + trial_group_num = Column(Integer, nullable=False) fuzzer_stats = Column(JSON, nullable=True) crashes = sqlalchemy.orm.relationship( 'Crash', @@ -94,3 +97,13 @@ class Crash(Base): __table_args__ = (ForeignKeyConstraint( [time, trial_id], ['snapshot.time', 'snapshot.trial_id']),) + + +class TargetCoverage(Base): + """Represents target branches for the target fuzzing mode.""" + __tablename__ = 'target_coverage' + + id = Column(Integer, primary_key=True) + benchmark = Column(String, nullable=False) + trial_group_num = Column(Integer, nullable=False) + target_location = Column(String, nullable=False) diff --git a/experiment/dispatcher.py b/experiment/dispatcher.py index c14d50c16..d801d5d25 100755 --- a/experiment/dispatcher.py +++ b/experiment/dispatcher.py @@ -24,6 +24,7 @@ import time from typing import List +from common import target_fuzzing_utils from common import experiment_path as exp_path from common import experiment_utils from common import logs @@ -131,7 +132,8 @@ def build_images_for_trials(fuzzers: List[str], models.Trial(fuzzer=fuzzer, experiment=experiment_name, benchmark=benchmark, - preemptible=preemptible) for _ in range(num_trials) + preemptible=preemptible, + trial_group_num=trial) for trial in range(num_trials) ] trials.extend(fuzzer_benchmark_trials) return trials @@ -159,6 +161,10 @@ def dispatcher_main(): experiment.config['concurrent_builds']) _initialize_trials_in_db(trials) + if experiment.config['target_fuzzing']: + target_fuzzing_utils.main_loop(experiment.benchmarks, + experiment.num_trials) + create_work_subdirs(['experiment-folders', 'measurement-folders']) # Start measurer and scheduler in seperate 
threads/processes. diff --git a/experiment/measurer/coverage_utils.py b/experiment/measurer/coverage_utils.py index 0122b8454..935e77402 100644 --- a/experiment/measurer/coverage_utils.py +++ b/experiment/measurer/coverage_utils.py @@ -233,10 +233,12 @@ def get_coverage_infomation(coverage_summary_file): class TrialCoverage: # pylint: disable=too-many-instance-attributes """Base class for storing and getting coverage data for a trial.""" - def __init__(self, fuzzer: str, benchmark: str, trial_num: int): + def __init__(self, fuzzer: str, benchmark: str, trial_num: int, + trial_group_num: int): self.fuzzer = fuzzer self.benchmark = benchmark self.trial_num = trial_num + self.trial_group_num = trial_group_num self.benchmark_fuzzer_trial_dir = exp_utils.get_trial_dir( fuzzer, benchmark, trial_num) self.work_dir = exp_utils.get_work_dir() diff --git a/experiment/measurer/measure_manager.py b/experiment/measurer/measure_manager.py index 07e48dda3..05338321f 100644 --- a/experiment/measurer/measure_manager.py +++ b/experiment/measurer/measure_manager.py @@ -32,6 +32,7 @@ from sqlalchemy import func from sqlalchemy import orm +from common import target_fuzzing_utils from common import benchmark_config from common import experiment_utils from common import experiment_path as exp_path @@ -51,7 +52,8 @@ logger = logs.Logger('measurer') # pylint: disable=invalid-name SnapshotMeasureRequest = collections.namedtuple( - 'SnapshotMeasureRequest', ['fuzzer', 'benchmark', 'trial_id', 'cycle']) + 'SnapshotMeasureRequest', + ['fuzzer', 'benchmark', 'trial_id', 'cycle', 'trial_group_num']) NUM_RETRIES = 3 RETRY_DELAY = 3 @@ -76,7 +78,10 @@ def measure_main(experiment_config): max_total_time = experiment_config['max_total_time'] measurers_cpus = experiment_config['measurers_cpus'] runners_cpus = experiment_config['runners_cpus'] - measure_loop(experiment, max_total_time, measurers_cpus, runners_cpus) + trials = experiment_config['trials'] + target_fuzzing = 
experiment_config['target_fuzzing'] + measure_loop(experiment, trials, max_total_time, measurers_cpus, + runners_cpus, target_fuzzing) # Clean up resources. gc.collect() @@ -95,9 +100,11 @@ def _process_init(cores_queue): def measure_loop(experiment: str, + trials: int, max_total_time: int, measurers_cpus=None, - runners_cpus=None): + runners_cpus=None, + target_fuzzing=False): """Continuously measure trials for |experiment|.""" logger.info('Start measure_loop.') @@ -116,7 +123,7 @@ def measure_loop(experiment: str, with multiprocessing.Pool( *pool_args) as pool, multiprocessing.Manager() as manager: - set_up_coverage_binaries(pool, experiment) + set_up_coverage_binaries(pool, experiment, trials) # Using Multiprocessing.Queue will fail with a complaint about # inheriting queue. q = manager.Queue() # pytype: disable=attribute-error @@ -126,7 +133,8 @@ def measure_loop(experiment: str, # races. all_trials_ended = scheduler.all_trials_ended(experiment) - if not measure_all_trials(experiment, max_total_time, pool, q): + if not measure_all_trials(experiment, max_total_time, pool, q, + target_fuzzing): # We didn't measure any trials. if all_trials_ended: # There are no trials producing snapshots to measure. @@ -141,7 +149,11 @@ def measure_loop(experiment: str, logger.info('Finished measure loop.') -def measure_all_trials(experiment: str, max_total_time: int, pool, q) -> bool: # pylint: disable=invalid-name +def measure_all_trials(experiment: str, + max_total_time: int, + pool, + q, + target_fuzzing=False) -> bool: # pylint: disable=invalid-name """Get coverage data (with coverage runs) for all active trials. Note that this should not be called unless multiprocessing.set_start_method('spawn') was called first. 
Otherwise it will use fork which breaks logging.""" @@ -158,7 +170,7 @@ def measure_all_trials(experiment: str, max_total_time: int, pool, q) -> bool: return False measure_trial_coverage_args = [ - (unmeasured_snapshot, max_cycle, q) + (unmeasured_snapshot, max_cycle, q, target_fuzzing) for unmeasured_snapshot in unmeasured_snapshots ] @@ -253,13 +265,15 @@ def _get_unmeasured_first_snapshots( snapshot for their trial. The trials are trials in |experiment|.""" trials_without_snapshots = _query_unmeasured_trials(experiment) return [ - SnapshotMeasureRequest(trial.fuzzer, trial.benchmark, trial.id, 1) + SnapshotMeasureRequest(trial.fuzzer, trial.benchmark, trial.id, 1, + trial.trial_group_num) for trial in trials_without_snapshots ] SnapshotWithTime = collections.namedtuple( - 'SnapshotWithTime', ['fuzzer', 'benchmark', 'trial_id', 'time']) + 'SnapshotWithTime', + ['fuzzer', 'benchmark', 'trial_id', 'time', 'trial_group_num']) def _query_measured_latest_snapshots(experiment: str): @@ -270,7 +284,8 @@ def _query_measured_latest_snapshots(experiment: str): # The order of these columns must correspond to the fields in # SnapshotWithTime. 
columns = (models.Trial.fuzzer, models.Trial.benchmark, - models.Snapshot.trial_id, latest_time_column) + models.Snapshot.trial_id, latest_time_column, + models.Trial.trial_group_num) experiment_filter = models.Snapshot.trial.has(experiment=experiment) group_by_columns = (models.Snapshot.trial_id, models.Trial.benchmark, models.Trial.fuzzer) @@ -300,7 +315,8 @@ def _get_unmeasured_next_snapshots( snapshot_with_cycle = SnapshotMeasureRequest(snapshot.fuzzer, snapshot.benchmark, snapshot.trial_id, - next_cycle) + next_cycle, + snapshot.trial_group_num) next_snapshots.append(snapshot_with_cycle) return next_snapshots @@ -357,8 +373,8 @@ class SnapshotMeasurer(coverage_utils.TrialCoverage): # pylint: disable=too-man UNIT_BLACKLIST = collections.defaultdict(set) def __init__(self, fuzzer: str, benchmark: str, trial_num: int, - trial_logger: logs.Logger): - super().__init__(fuzzer, benchmark, trial_num) + trial_logger: logs.Logger, trial_group_num: int): + super().__init__(fuzzer, benchmark, trial_num, trial_group_num) self.logger = trial_logger self.corpus_dir = os.path.join(self.measurement_dir, 'corpus') @@ -428,6 +444,31 @@ def generate_summary(self, cycle: int, summary_only=False): self.logger.error( 'Coverage summary json file generation failed in the end.') + def get_current_target_coverage(self) -> int: + """Get the current number of target branches covered.""" + if not os.path.exists(self.cov_summary_file): + self.logger.warning('No coverage summary json file found.') + return 0 + try: + total_target_covered = 0 + coverage_info = coverage_utils.get_coverage_infomation( + self.cov_summary_file) + covered_branches = target_fuzzing_utils.get_covered_branches_per_function( + coverage_info) + # measure target coverage + with db_utils.session_scope() as session: + target_branches = session.query(models.TargetCoverage).filter( + models.TargetCoverage.trial_group_num == + self.trial_group_num).all() + for target_branch in target_branches: + if target_branch.target_location in 
covered_branches: + total_target_covered += 1 + return total_target_covered + except Exception: # pylint: disable=broad-except + self.logger.error( + 'Coverage summary json file defective or missing.') + return 0 + def get_current_coverage(self) -> int: """Get the current number of lines covered.""" if not os.path.exists(self.cov_summary_file): @@ -612,8 +653,8 @@ def get_fuzzer_stats(stats_filestore_path): def measure_trial_coverage( # pylint: disable=invalid-name - measure_req, max_cycle: int, - q: multiprocessing.Queue) -> models.Snapshot: + measure_req, max_cycle: int, q: multiprocessing.Queue, + target_fuzzing) -> models.Snapshot: """Measure the coverage obtained by |trial_num| on |benchmark| using |fuzzer|.""" initialize_logs() @@ -624,24 +665,28 @@ def measure_trial_coverage( # pylint: disable=invalid-name try: snapshot = measure_snapshot_coverage(measure_req.fuzzer, measure_req.benchmark, - measure_req.trial_id, cycle) + measure_req.trial_id, cycle, + measure_req.trial_group_num, + target_fuzzing) if not snapshot: break q.put(snapshot) except Exception: # pylint: disable=broad-except - logger.error('Error measuring cycle.', - extras={ - 'fuzzer': measure_req.fuzzer, - 'benchmark': measure_req.benchmark, - 'trial_id': str(measure_req.trial_id), - 'cycle': str(cycle), - }) + logger.error( + 'Error measuring cycle.', + extras={ + 'fuzzer': measure_req.fuzzer, + 'benchmark': measure_req.benchmark, + 'trial_id': str(measure_req.trial_id), + 'trial_group_num': str(measure_req.trial_group_num), + 'cycle': str(cycle), + }) logger.debug('Done measuring trial: %d.', measure_req.trial_id) def measure_snapshot_coverage( # pylint: disable=too-many-locals - fuzzer: str, benchmark: str, trial_num: int, - cycle: int) -> models.Snapshot: + fuzzer: str, benchmark: str, trial_num: int, cycle: int, + trial_group_num: int, target_fuzzing: bool) -> models.Snapshot: """Measure coverage of the snapshot for |cycle| for |trial_num| of |fuzzer| and |benchmark|.""" snapshot_logger = 
logs.Logger('measurer', @@ -650,9 +695,10 @@ def measure_snapshot_coverage( # pylint: disable=too-many-locals 'benchmark': benchmark, 'trial_id': str(trial_num), 'cycle': str(cycle), + 'trial_group_num': str(trial_group_num) }) snapshot_measurer = SnapshotMeasurer(fuzzer, benchmark, trial_num, - snapshot_logger) + snapshot_logger, trial_group_num) measuring_start_time = time.time() snapshot_logger.info('Measuring cycle: %d.', cycle) @@ -660,9 +706,14 @@ def measure_snapshot_coverage( # pylint: disable=too-many-locals if snapshot_measurer.is_cycle_unchanged(cycle): snapshot_logger.info('Cycle: %d is unchanged.', cycle) regions_covered = snapshot_measurer.get_current_coverage() + targets_covered = 0 + if target_fuzzing: + targets_covered = snapshot_measurer.get_current_target_coverage() fuzzer_stats_data = snapshot_measurer.get_fuzzer_stats(cycle) return models.Snapshot(time=this_time, trial_id=trial_num, + trial_group_num=trial_group_num, + targets_covered=targets_covered, edges_covered=regions_covered, fuzzer_stats=fuzzer_stats_data, crashes=[]) @@ -698,8 +749,13 @@ def measure_snapshot_coverage( # pylint: disable=too-many-locals # Get the coverage of the new corpus units. regions_covered = snapshot_measurer.get_current_coverage() + targets_covered = 0 + if target_fuzzing: + targets_covered = snapshot_measurer.get_current_target_coverage() fuzzer_stats_data = snapshot_measurer.get_fuzzer_stats(cycle) snapshot = models.Snapshot(time=this_time, + trial_group_num=trial_group_num, + targets_covered=targets_covered, trial_id=trial_num, edges_covered=regions_covered, fuzzer_stats=fuzzer_stats_data, @@ -714,7 +770,7 @@ def measure_snapshot_coverage( # pylint: disable=too-many-locals return snapshot -def set_up_coverage_binaries(pool, experiment): +def set_up_coverage_binaries(pool, experiment, trials): """Set up coverage binaries for all benchmarks in |experiment|.""" # Use set comprehension to select distinct benchmarks. 
with db_utils.session_scope() as session: diff --git a/experiment/resources/runner-startup-script-template.sh b/experiment/resources/runner-startup-script-template.sh index e0ff19f2d..653d939ca 100644 --- a/experiment/resources/runner-startup-script-template.sh +++ b/experiment/resources/runner-startup-script-template.sh @@ -42,8 +42,10 @@ docker run \ -e BENCHMARK={{benchmark}} \ -e EXPERIMENT={{experiment}} \ -e TRIAL_ID={{trial_id}} \ +-e TRIAL_GROUP_NUM={{trial_group_num}} \ -e MAX_TOTAL_TIME={{max_total_time}} \ -e NO_SEEDS={{no_seeds}} \ +-e TARGET_FUZZING={{target_fuzzing}} \ -e NO_DICTIONARIES={{no_dictionaries}} \ -e OSS_FUZZ_CORPUS={{oss_fuzz_corpus}} \ -e CUSTOM_SEED_CORPUS_DIR={{custom_seed_corpus_dir}} \ diff --git a/experiment/run_experiment.py b/experiment/run_experiment.py index 04d85a656..ee43871ed 100644 --- a/experiment/run_experiment.py +++ b/experiment/run_experiment.py @@ -261,7 +261,8 @@ def start_experiment( # pylint: disable=too-many-arguments concurrent_builds=None, measurers_cpus=None, runners_cpus=None, - custom_seed_corpus_dir=None): + custom_seed_corpus_dir=None, + target_fuzzing=False): """Start a fuzzer benchmarking experiment.""" if not allow_uncommitted_changes: check_no_uncommitted_changes() @@ -295,6 +296,7 @@ def start_experiment( # pylint: disable=too-many-arguments if config['custom_seed_corpus_dir']: validate_and_pack_custom_seed_corpus(config['custom_seed_corpus_dir'], benchmarks) + config['target_fuzzing'] = target_fuzzing return start_experiment_from_full_config(config) @@ -611,6 +613,12 @@ def main(): required=False, default=False, action='store_true') + parser.add_argument('-tf', + '--target-fuzzing', + help='Target fuzzing mode.', + required=False, + default=False, + action='store_true') parser.add_argument( '-o', '--oss-fuzz-corpus', @@ -655,6 +663,9 @@ def main(): parser.error('Cannot enable options "custom_seed_corpus_dir" and ' '"oss_fuzz_corpus" at the same time') + if args.target_fuzzing and not 
args.custom_seed_corpus_dir: + parser.error('Target fuzzing can only be used with custom seed corpus') + start_experiment(args.experiment_name, args.experiment_config, args.benchmarks, @@ -667,7 +678,8 @@ def main(): concurrent_builds=concurrent_builds, measurers_cpus=measurers_cpus, runners_cpus=runners_cpus, - custom_seed_corpus_dir=args.custom_seed_corpus_dir) + custom_seed_corpus_dir=args.custom_seed_corpus_dir, + target_fuzzing=args.target_fuzzing) return 0 diff --git a/experiment/runner.py b/experiment/runner.py index 6dcb2c49c..9dd45cb54 100644 --- a/experiment/runner.py +++ b/experiment/runner.py @@ -115,6 +115,20 @@ def get_clusterfuzz_seed_corpus_path(fuzz_target_path): return seed_corpus_path if os.path.exists(seed_corpus_path) else None +def _unpack_target_fuzzing_corpus(corpus_directory): + # remove initial seed corpus + shutil.rmtree(corpus_directory) + + benchmark = environment.get('BENCHMARK') + trial_group_num = environment.get('TRIAL_GROUP_NUM') + target_fuzzing_corpora_dir = experiment_utils.get_target_fuzzing_corpora_filestore_path( + ) + target_fuzzing_sub_dir = 'trial-group-%s' % int(trial_group_num) + target_fuzzing_dir = posixpath.join(target_fuzzing_corpora_dir, benchmark, + target_fuzzing_sub_dir) + shutil.copytree(target_fuzzing_dir, corpus_directory) + + def _unpack_custom_seed_corpus(corpus_directory): "Unpack seed corpus provided by user" # remove initial seed corpus @@ -200,7 +214,10 @@ def run_fuzzer(max_total_time, log_filename): return if environment.get('CUSTOM_SEED_CORPUS_DIR'): - _unpack_custom_seed_corpus(input_corpus) + if environment.get('TARGET_FUZZING'): + _unpack_target_fuzzing_corpus(input_corpus) + else: + _unpack_custom_seed_corpus(input_corpus) else: _unpack_clusterfuzz_seed_corpus(target_binary, input_corpus) _clean_seed_corpus(input_corpus) diff --git a/experiment/scheduler.py b/experiment/scheduler.py index effb9f319..a7a732d36 100644 --- a/experiment/scheduler.py +++ b/experiment/scheduler.py @@ -651,6 +651,7 @@ 
def __init__(self, trial): self.time_started = trial.time_started self.time_ended = trial.time_ended self.preemptible = trial.preemptible + self.trial_group_num = trial.trial_group_num def _initialize_logs(experiment): @@ -678,7 +679,8 @@ def _start_trial(trial: TrialProxy, experiment_config: dict): _initialize_logs(experiment_config['experiment']) logger.info('Start trial %d.', trial.id) started = create_trial_instance(trial.fuzzer, trial.benchmark, trial.id, - experiment_config, trial.preemptible) + trial.trial_group_num, experiment_config, + trial.preemptible) if started: trial.time_started = datetime_now() return trial @@ -688,6 +690,7 @@ def _start_trial(trial: TrialProxy, experiment_config: dict): def render_startup_script_template(instance_name: str, fuzzer: str, benchmark: str, trial_id: int, + trial_group_num: int, experiment_config: dict): """Render the startup script using the template and the parameters provided and return the result.""" @@ -705,6 +708,7 @@ def render_startup_script_template(instance_name: str, fuzzer: str, 'experiment': experiment, 'fuzzer': fuzzer, 'trial_id': trial_id, + 'trial_group_num': trial_group_num, 'max_total_time': experiment_config['max_total_time'], 'experiment_filestore': experiment_config['experiment_filestore'], 'report_filestore': experiment_config['report_filestore'], @@ -713,6 +717,7 @@ def render_startup_script_template(instance_name: str, fuzzer: str, 'docker_registry': experiment_config['docker_registry'], 'local_experiment': local_experiment, 'no_seeds': experiment_config['no_seeds'], + 'target_fuzzing': experiment_config['target_fuzzing'], 'no_dictionaries': experiment_config['no_dictionaries'], 'oss_fuzz_corpus': experiment_config['oss_fuzz_corpus'], 'num_cpu_cores': experiment_config['runner_num_cpu_cores'], @@ -728,13 +733,15 @@ def render_startup_script_template(instance_name: str, fuzzer: str, def create_trial_instance(fuzzer: str, benchmark: str, trial_id: int, - experiment_config: dict, preemptible: bool) 
-> bool: + trial_group_num: int, experiment_config: dict, + preemptible: bool) -> bool: """Create or start a trial instance for a specific trial_id,fuzzer,benchmark.""" instance_name = experiment_utils.get_trial_instance_name( experiment_config['experiment'], trial_id) startup_script = render_startup_script_template(instance_name, fuzzer, benchmark, trial_id, + trial_group_num, experiment_config) startup_script_path = '/tmp/%s-start-docker.sh' % instance_name with open(startup_script_path, 'w') as file_handle: From 14d5152487aab9f96a1e2f95d8e88f43607b0348 Mon Sep 17 00:00:00 2001 From: Jiradet Ounjai Date: Tue, 10 May 2022 20:23:27 +0700 Subject: [PATCH 2/3] add option to run random seed selection --- common/experiment_utils.py | 4 +- ...tils.py => random_corpus_fuzzing_utils.py} | 126 ++++++++++-------- experiment/dispatcher.py | 9 +- experiment/measurer/measure_manager.py | 4 +- .../runner-startup-script-template.sh | 1 + experiment/run_experiment.py | 14 +- experiment/runner.py | 16 +-- experiment/scheduler.py | 1 + 8 files changed, 102 insertions(+), 73 deletions(-) rename common/{target_fuzzing_utils.py => random_corpus_fuzzing_utils.py} (51%) diff --git a/common/experiment_utils.py b/common/experiment_utils.py index ab6684e43..58a7a1c3d 100644 --- a/common/experiment_utils.py +++ b/common/experiment_utils.py @@ -78,10 +78,10 @@ def get_custom_seed_corpora_filestore_path(): 'custom_seed_corpora') -def get_target_fuzzing_corpora_filestore_path(): +def get_random_corpora_filestore_path(): """Returns path containing seed corpora for the target fuzzing experiment.""" return posixpath.join(get_experiment_filestore_path(), - 'target-fuzzing-corpora') + 'random_corpora') def get_dispatcher_instance_name(experiment: str) -> str: diff --git a/common/target_fuzzing_utils.py b/common/random_corpus_fuzzing_utils.py similarity index 51% rename from common/target_fuzzing_utils.py rename to common/random_corpus_fuzzing_utils.py index 15d5330cb..e219997ff 100644 --- 
a/common/target_fuzzing_utils.py +++ b/common/random_corpus_fuzzing_utils.py @@ -18,8 +18,7 @@ from experiment.build import build_utils from common import experiment_path as exp_path -MAX_CORPUS_FILES = 5 - +MAX_RANDOM_CORPUS_FILES = 5 def get_covered_branches_per_function(coverage_info): function_coverage_info = coverage_info["data"][0]["functions"] @@ -59,93 +58,108 @@ def get_covered_branches(coverage_binary, corpus_dir): return get_covered_branches_per_function(coverage_info) -def main_loop(benchmarks: List[str], num_trials: int): +def initialize_random_corpus_fuzzing(benchmarks: List[str], + num_trials: int, + target_fuzzing: bool = False): + """Get targeting coverage from the given corpus.""" pool_args = () with multiprocessing.Pool(*pool_args) as pool: - target_coverage_list = pool.starmap( - setup_fuzzing_target, - [(benchmark, num_trials) for benchmark in benchmarks]) + target_coverage_list = pool.starmap(prepare_benchmark_random_corpus, [ + (benchmark, num_trials, target_fuzzing) for benchmark in benchmarks + ]) target_coverage = list(itertools.chain(*target_coverage_list)) logs.info('Done Preparing target fuzzing (total %d target)', len(target_coverage)) db_utils.bulk_save(target_coverage) -def setup_fuzzing_target(benchmark: str, num_trials: int): - with tempfile.TemporaryDirectory() as tmp_dir: - coverage_binaries_dir = build_utils.get_coverage_binaries_dir() - archive_name = 'coverage-build-%s.tar.gz' % benchmark - archive_filestore_path = exp_path.filestore(coverage_binaries_dir / - archive_name) - filesystem.copy(archive_filestore_path, tmp_dir) - archive_path = os.path.join(tmp_dir, archive_name) - tar = tarfile.open(archive_path, 'r:gz') - tar.extractall(tmp_dir) - os.remove(archive_path) - coverage_binary = os.path.join( - tmp_dir, benchmark_utils.get_fuzz_target(benchmark)) - return prepare_target_fuzzing_corpus(benchmark, num_trials, - coverage_binary) - - -def prepare_target_fuzzing_corpus(benchmark: str, num_trials: int, - coverage_binary: 
str): +def get_coverage_binary(benchmark, tmp_dir): + """Copy coverage binary to temp directory for temporary usage.""" + coverage_binaries_dir = build_utils.get_coverage_binaries_dir() + archive_name = 'coverage-build-%s.tar.gz' % benchmark + archive_filestore_path = exp_path.filestore(coverage_binaries_dir / + archive_name) + filesystem.copy(archive_filestore_path, tmp_dir) + archive_path = os.path.join(tmp_dir, archive_name) + tar = tarfile.open(archive_path, 'r:gz') + tar.extractall(tmp_dir) + os.remove(archive_path) + coverage_binary = os.path.join(tmp_dir, + benchmark_utils.get_fuzz_target(benchmark)) + return coverage_binary + + +def prepare_benchmark_random_corpus(benchmark: str, + num_trials: int, + target_fuzzing: bool = False): """Prepare corpus for target fuzzing.""" - + coverage_binary = None target_coverage = [] - # path used to store and feed seed corpus for benchmark runner # each trial group will have the same seed input(s) - target_fuzzing_benchmark = os.path.join( - experiment_utils.get_target_fuzzing_corpora_filestore_path(), benchmark) - filesystem.create_directory(target_fuzzing_benchmark) + benchmark_random_corpora = os.path.join( + experiment_utils.get_random_corpora_filestore_path(), benchmark) + filesystem.create_directory(benchmark_random_corpora) - # randomly pick from custom seed corpus + # get inputs from the custom seed corpus directory corpus_archive_filename = os.path.join( experiment_utils.get_custom_seed_corpora_filestore_path(), f'{benchmark}.zip') + with tempfile.TemporaryDirectory() as tmp_dir: + if target_fuzzing: + coverage_binary = get_coverage_binary(benchmark, tmp_dir) + with zipfile.ZipFile(corpus_archive_filename) as zip_file: # only consider file not directory corpus_files = [ f for f in zip_file.infolist() if not f.filename.endswith('/') ] for trial_group_num in range(num_trials): - logs.info('Preparing target fuzzing: %s, trial_group: %d', + logs.info('Preparing random corpus: %s, trial_group: %d', benchmark, 
trial_group_num) trial_group_subdir = 'trial-group-%d' % trial_group_num - target_fuzzing_trial_dir = os.path.join( - target_fuzzing_benchmark, trial_group_subdir) + custom_corpus_trial_dir = os.path.join(benchmark_random_corpora, + trial_group_subdir) src_dir = os.path.join(tmp_dir, "source") - dest_dir = os.path.join(tmp_dir, "dest") filesystem.recreate_directory(src_dir) - filesystem.recreate_directory(dest_dir) - source_files = random.sample(corpus_files, MAX_CORPUS_FILES) + source_files = random.sample(corpus_files, + MAX_RANDOM_CORPUS_FILES) for file in source_files: zip_file.extract(file, src_dir) - dest_files = random.sample(corpus_files, MAX_CORPUS_FILES) - for file in dest_files: - zip_file.extract(file, dest_dir) - - src_branches = get_covered_branches(coverage_binary, src_dir) - dest_branches = get_covered_branches(coverage_binary, dest_dir) - target_branches = dest_branches - src_branches - - if not target_branches: - raise RuntimeError( - 'Unable to find target branches for %s.' % benchmark) - - for branch in target_branches: - target_cov = models.TargetCoverage() - target_cov.trial_group_num = int(trial_group_num) - target_cov.benchmark = benchmark - target_cov.target_location = branch - target_coverage.append(target_cov) + if target_fuzzing: + dest_dir = os.path.join(tmp_dir, "dest") + filesystem.recreate_directory(dest_dir) + + dest_files = random.sample(corpus_files, + MAX_RANDOM_CORPUS_FILES) + for file in dest_files: + zip_file.extract(file, dest_dir) + + # extract covered branches of source and destination inputs + # then subtract to get targeting branches + src_branches = get_covered_branches(coverage_binary, + src_dir) + dest_branches = get_covered_branches( + coverage_binary, dest_dir) + target_branches = dest_branches - src_branches + + if not target_branches: + raise RuntimeError( + 'Unable to find target branches for %s.' 
% + benchmark) + + for branch in target_branches: + target_cov = models.TargetCoverage() + target_cov.trial_group_num = int(trial_group_num) + target_cov.benchmark = benchmark + target_cov.target_location = branch + target_coverage.append(target_cov) # copy only the src directory - filesystem.copytree(src_dir, target_fuzzing_trial_dir) + filesystem.copytree(src_dir, custom_corpus_trial_dir) return target_coverage diff --git a/experiment/dispatcher.py b/experiment/dispatcher.py index d801d5d25..71aae29b0 100755 --- a/experiment/dispatcher.py +++ b/experiment/dispatcher.py @@ -24,7 +24,7 @@ import time from typing import List -from common import target_fuzzing_utils +from common import random_corpus_fuzzing_utils from common import experiment_path as exp_path from common import experiment_utils from common import logs @@ -161,9 +161,10 @@ def dispatcher_main(): experiment.config['concurrent_builds']) _initialize_trials_in_db(trials) - if experiment.config['target_fuzzing']: - target_fuzzing_utils.main_loop(experiment.benchmarks, - experiment.num_trials) + if experiment.config['random_corpus'] or experiment.config['target_fuzzing']: + random_corpus_fuzzing_utils.initialize_random_corpus_fuzzing( + experiment.benchmarks, experiment.num_trials, + experiment.config['target_fuzzing']) create_work_subdirs(['experiment-folders', 'measurement-folders']) diff --git a/experiment/measurer/measure_manager.py b/experiment/measurer/measure_manager.py index 05338321f..5779e376e 100644 --- a/experiment/measurer/measure_manager.py +++ b/experiment/measurer/measure_manager.py @@ -32,7 +32,7 @@ from sqlalchemy import func from sqlalchemy import orm -from common import target_fuzzing_utils +from common import random_corpus_fuzzing_utils from common import benchmark_config from common import experiment_utils from common import experiment_path as exp_path @@ -453,7 +453,7 @@ def get_current_target_coverage(self) -> int: total_target_covered = 0 coverage_info = 
coverage_utils.get_coverage_infomation( self.cov_summary_file) - covered_branches = target_fuzzing_utils.get_covered_branches_per_function( + covered_branches = random_corpus_fuzzing_utils.get_covered_branches_per_function( coverage_info) # measure target coverage with db_utils.session_scope() as session: diff --git a/experiment/resources/runner-startup-script-template.sh b/experiment/resources/runner-startup-script-template.sh index 653d939ca..aed81d188 100644 --- a/experiment/resources/runner-startup-script-template.sh +++ b/experiment/resources/runner-startup-script-template.sh @@ -45,6 +45,7 @@ docker run \ -e TRIAL_GROUP_NUM={{trial_group_num}} \ -e MAX_TOTAL_TIME={{max_total_time}} \ -e NO_SEEDS={{no_seeds}} \ +-e RANDOM_CORPUS={{random_corpus}} \ -e TARGET_FUZZING={{target_fuzzing}} \ -e NO_DICTIONARIES={{no_dictionaries}} \ -e OSS_FUZZ_CORPUS={{oss_fuzz_corpus}} \ diff --git a/experiment/run_experiment.py b/experiment/run_experiment.py index ee43871ed..9889476ff 100644 --- a/experiment/run_experiment.py +++ b/experiment/run_experiment.py @@ -262,6 +262,7 @@ def start_experiment( # pylint: disable=too-many-arguments measurers_cpus=None, runners_cpus=None, custom_seed_corpus_dir=None, + random_corpus=None, target_fuzzing=False): """Start a fuzzer benchmarking experiment.""" if not allow_uncommitted_changes: @@ -296,6 +297,7 @@ def start_experiment( # pylint: disable=too-many-arguments if config['custom_seed_corpus_dir']: validate_and_pack_custom_seed_corpus(config['custom_seed_corpus_dir'], benchmarks) + config['random_corpus'] = random_corpus config['target_fuzzing'] = target_fuzzing return start_experiment_from_full_config(config) @@ -613,6 +615,12 @@ def main(): required=False, default=False, action='store_true') + parser.add_argument('-rs', + '--random-corpus', + help='Randomly pick seed corpus.', + required=False, + default=False, + action='store_true') parser.add_argument('-tf', '--target-fuzzing', help='Target fuzzing mode.', @@ -664,7 +672,10 @@ def 
main(): '"oss_fuzz_corpus" at the same time') if args.target_fuzzing and not args.custom_seed_corpus_dir: - parser.error('Target fuzzing can only be used with custom seed corpus') + parser.error('Target fuzzing can only be run with custom seed corpus') + + if args.random_corpus and not args.custom_seed_corpus_dir: + parser.error('Random corpus experiment can only be run with custom seed corpus') start_experiment(args.experiment_name, args.experiment_config, @@ -679,6 +690,7 @@ def main(): measurers_cpus=measurers_cpus, runners_cpus=runners_cpus, custom_seed_corpus_dir=args.custom_seed_corpus_dir, + random_corpus=args.random_corpus, target_fuzzing=args.target_fuzzing) return 0 diff --git a/experiment/runner.py b/experiment/runner.py index 9dd45cb54..9f526e1a1 100644 --- a/experiment/runner.py +++ b/experiment/runner.py @@ -115,18 +115,18 @@ def get_clusterfuzz_seed_corpus_path(fuzz_target_path): return seed_corpus_path if os.path.exists(seed_corpus_path) else None -def _unpack_target_fuzzing_corpus(corpus_directory): +def _unpack_random_corpus(corpus_directory): # remove initial seed corpus shutil.rmtree(corpus_directory) benchmark = environment.get('BENCHMARK') trial_group_num = environment.get('TRIAL_GROUP_NUM') - target_fuzzing_corpora_dir = experiment_utils.get_target_fuzzing_corpora_filestore_path( + random_corpora_dir = experiment_utils.get_random_corpora_filestore_path( ) - target_fuzzing_sub_dir = 'trial-group-%s' % int(trial_group_num) - target_fuzzing_dir = posixpath.join(target_fuzzing_corpora_dir, benchmark, - target_fuzzing_sub_dir) - shutil.copytree(target_fuzzing_dir, corpus_directory) + random_corpora_sub_dir = 'trial-group-%s' % int(trial_group_num) + random_corpus_dir = posixpath.join(random_corpora_dir, benchmark, + random_corpora_sub_dir) + shutil.copytree(random_corpus_dir, corpus_directory) def _unpack_custom_seed_corpus(corpus_directory): @@ -214,8 +214,8 @@ def run_fuzzer(max_total_time, log_filename): return if 
environment.get('CUSTOM_SEED_CORPUS_DIR'): - if environment.get('TARGET_FUZZING'): - _unpack_target_fuzzing_corpus(input_corpus) + if environment.get('RANDOM_CORPUS') or environment.get('TARGET_FUZZING'): + _unpack_random_corpus(input_corpus) else: _unpack_custom_seed_corpus(input_corpus) else: diff --git a/experiment/scheduler.py b/experiment/scheduler.py index a7a732d36..e04e368cb 100644 --- a/experiment/scheduler.py +++ b/experiment/scheduler.py @@ -717,6 +717,7 @@ def render_startup_script_template(instance_name: str, fuzzer: str, 'docker_registry': experiment_config['docker_registry'], 'local_experiment': local_experiment, 'no_seeds': experiment_config['no_seeds'], + 'random_corpus': experiment_config['random_corpus'], 'target_fuzzing': experiment_config['target_fuzzing'], 'no_dictionaries': experiment_config['no_dictionaries'], 'oss_fuzz_corpus': experiment_config['oss_fuzz_corpus'], From f4e531426d516a35ec423d3286e0ac04becbf344 Mon Sep 17 00:00:00 2001 From: Jiradet Ounjai Date: Tue, 10 May 2022 20:29:14 +0700 Subject: [PATCH 3/3] format code --- common/experiment_utils.py | 3 +-- common/random_corpus_fuzzing_utils.py | 3 ++- experiment/run_experiment.py | 3 ++- experiment/runner.py | 8 ++++---- 4 files changed, 9 insertions(+), 8 deletions(-) diff --git a/common/experiment_utils.py b/common/experiment_utils.py index 58a7a1c3d..97adde82f 100644 --- a/common/experiment_utils.py +++ b/common/experiment_utils.py @@ -80,8 +80,7 @@ def get_custom_seed_corpora_filestore_path(): def get_random_corpora_filestore_path(): """Returns path containing seed corpora for the target fuzzing experiment.""" - return posixpath.join(get_experiment_filestore_path(), - 'random_corpora') + return posixpath.join(get_experiment_filestore_path(), 'random_corpora') def get_dispatcher_instance_name(experiment: str) -> str: diff --git a/common/random_corpus_fuzzing_utils.py b/common/random_corpus_fuzzing_utils.py index e219997ff..444e0f323 100644 --- 
a/common/random_corpus_fuzzing_utils.py +++ b/common/random_corpus_fuzzing_utils.py @@ -20,6 +20,7 @@ MAX_RANDOM_CORPUS_FILES = 5 + def get_covered_branches_per_function(coverage_info): function_coverage_info = coverage_info["data"][0]["functions"] covered_branches = set([]) @@ -138,7 +139,7 @@ def prepare_benchmark_random_corpus(benchmark: str, MAX_RANDOM_CORPUS_FILES) for file in dest_files: zip_file.extract(file, dest_dir) - + # extract covered branches of source and destination inputs # then subtract to get targeting branches src_branches = get_covered_branches(coverage_binary, diff --git a/experiment/run_experiment.py b/experiment/run_experiment.py index 9889476ff..19927fe53 100644 --- a/experiment/run_experiment.py +++ b/experiment/run_experiment.py @@ -675,7 +675,8 @@ def main(): parser.error('Target fuzzing can only be run with custom seed corpus') if args.random_corpus and not args.custom_seed_corpus_dir: - parser.error('Random corpus experiment can only be run with custom seed corpus') + parser.error( + 'Random corpus option can only be run with custom seed corpus') start_experiment(args.experiment_name, args.experiment_config, diff --git a/experiment/runner.py b/experiment/runner.py index 9f526e1a1..ba99787ea 100644 --- a/experiment/runner.py +++ b/experiment/runner.py @@ -121,11 +121,10 @@ def _unpack_random_corpus(corpus_directory): benchmark = environment.get('BENCHMARK') trial_group_num = environment.get('TRIAL_GROUP_NUM') - random_corpora_dir = experiment_utils.get_random_corpora_filestore_path( - ) + random_corpora_dir = experiment_utils.get_random_corpora_filestore_path() random_corpora_sub_dir = 'trial-group-%s' % int(trial_group_num) random_corpus_dir = posixpath.join(random_corpora_dir, benchmark, - random_corpora_sub_dir) + random_corpora_sub_dir) shutil.copytree(random_corpus_dir, corpus_directory) @@ -214,7 +213,8 @@ def run_fuzzer(max_total_time, log_filename): return if environment.get('CUSTOM_SEED_CORPUS_DIR'): - if 
environment.get('RANDOM_CORPUS') or environment.get('TARGET_FUZZING'): + if environment.get('RANDOM_CORPUS') or environment.get( + 'TARGET_FUZZING'): _unpack_random_corpus(input_corpus) else: _unpack_custom_seed_corpus(input_corpus)