diff --git a/.gitignore b/.gitignore index 2f78806..b211824 100644 --- a/.gitignore +++ b/.gitignore @@ -4,8 +4,9 @@ .envrc .pytest_cache __pycache__ -benchmark*.csv benchmark-head.py +benchmark-versions.csv +benchmark-versions.ods build dataiter.egg-info dist diff --git a/benchmark-versions.sh b/benchmark-versions.sh index 57e3538..bcc1908 100755 --- a/benchmark-versions.sh +++ b/benchmark-versions.sh @@ -2,7 +2,7 @@ SCRIPT=benchmark-head.py SCRIPT_ARGS="$@" -OUTPUT_FILE=benchmark-versions.csv +OUT_FILE=benchmark-versions.csv TMP_FILE=tmp.csv benchmark() { @@ -10,15 +10,14 @@ benchmark() { printf "\n$VERSION:\n" git checkout -q $VERSION ./$SCRIPT -o $TMP_FILE --version=$VERSION $SCRIPT_ARGS || true - tail -n+2 $TMP_FILE >> $OUTPUT_FILE - sed -i 's/"//g' $OUTPUT_FILE + tail -n+2 $TMP_FILE >> $OUT_FILE + sed -i 's/"//g' $OUT_FILE } set -e -rm -f $OUTPUT_FILE -echo "name,version,elapsed" > $OUTPUT_FILE +rm -f $OUT_FILE +echo "name,version,elapsed" > $OUT_FILE cp -fv benchmark.py $SCRIPT -for VERSION in 0.99 master; do - benchmark $VERSION -done +benchmark 0.99 +benchmark master rm -f $SCRIPT $TMP_FILE diff --git a/benchmark.py b/benchmark.py index b2db6fd..3f3d16f 100755 --- a/benchmark.py +++ b/benchmark.py @@ -21,21 +21,15 @@ def _data_frame(path, nrow): def data_frame(path, nrow=1_000_000): return _data_frame(path, nrow).deepcopy() -def data_frame_random(nrows, ngroups): +@functools.cache +def _data_frame_random(nrows, ngroups): return di.DataFrame(g=np.random.choice(ngroups, nrows, replace=True), a=np.random.normal(10, 2, nrows)) -def data_frame_full_join(): - data = data_frame("vehicles.csv") - meta = data.select("make", "model").unique() - meta = meta.rbind(meta.modify(model="X")) - meta.random = np.random.random(meta.nrow) - assert meta.anti_join(data, "make", "model").nrow > 0 - start = time.time() - data.full_join(meta, "make", "model") - return time.time() - start +def data_frame_random(nrows, ngroups): + return _data_frame_random(nrows, ngroups).deepcopy() -def data_frame_group_by_aggregate_128(): +def data_frame_aggregate_128(): data = data_frame("vehicles.csv") start = time.time() (data @@ -46,7 +40,7 @@ def data_frame_group_by_aggregate_128(): cty=di.mean("cty"))) return time.time() - start -def data_frame_group_by_aggregate_3264(): +def data_frame_aggregate_3264(): data = data_frame("vehicles.csv") start = time.time() (data @@ -57,7 +51,7 @@ def data_frame_group_by_aggregate_3264(): cty=di.mean("cty"))) return time.time() - start -def data_frame_group_by_aggregate_14668(): +def data_frame_aggregate_14668(): data = data_frame("vehicles.csv") start = time.time() (data @@ -68,7 +62,7 @@ def data_frame_group_by_aggregate_14668(): cty=di.mean("cty"))) return time.time() - start -def data_frame_group_by_aggregate_100000_lambda(): +def data_frame_aggregate_100000_lambda(): data = data_frame_random(1_000_000, 100_000) start = time.time() (data @@ -78,7 +72,7 @@ def data_frame_group_by_aggregate_100000_lambda(): a_std=lambda x: np.std(x.a))) return time.time() - start -def data_frame_group_by_aggregate_100000_short(): +def data_frame_aggregate_100000_short(): with patch("dataiter.USE_NUMBA", False): data = data_frame_random(1_000_000, 100_000) start = time.time() @@ -89,7 +83,7 @@ def data_frame_group_by_aggregate_100000_short(): a_std=di.std("a"))) return time.time() - start -def data_frame_group_by_aggregate_100000_short_numba(): +def data_frame_aggregate_100000_short_numba(): with patch("dataiter.USE_NUMBA", True): data = data_frame_random(1_000_000, 100_000) start = time.time() @@ -100,6 +94,16 @@ def data_frame_group_by_aggregate_100000_short_numba(): a_std=di.std("a"))) return time.time() - start +def data_frame_full_join(): + data = data_frame("vehicles.csv") + meta = data.select("make", "model").unique() + meta = meta.rbind(meta.modify(model="X")) + meta.random = np.random.random(meta.nrow) + assert meta.anti_join(data, "make", "model").nrow > 0 + start = time.time() + data.full_join(meta, "make", "model") + return time.time() - start + def data_frame_left_join(): data = data_frame("vehicles.csv") meta = data.select("make", "model").unique() @@ -151,23 +155,17 @@ def data_frame_unique(): data.unique("make", "model", "year") return time.time() - start -def list_of_dicts(path, length=100_000): +def _list_of_dicts(path, length): data = test.list_of_dicts(path) n = length // len(data) + 1 data = data * n return data.head(length) -def list_of_dicts_full_join(): - data = list_of_dicts("vehicles.json") - meta = data.deepcopy().select("make", "model").unique() - meta = meta + meta.deepcopy().modify(model=lambda x: "X") - meta = meta.modify(random=lambda x: random.random()) - assert len(meta.anti_join(data, "make", "model")) > 0 - start = time.time() - data.full_join(meta, "make", "model") - return time.time() - start +@functools.cache +def list_of_dicts(path, length=100_000): + return _list_of_dicts(path, length).deepcopy() -def list_of_dicts_group_by_aggregate_128(): +def list_of_dicts_aggregate_128(): data = list_of_dicts("vehicles.json") start = time.time() (data @@ -178,7 +176,7 @@ def list_of_dicts_group_by_aggregate_128(): cty=lambda x: mean(x.pluck("cty")))) return time.time() - start -def list_of_dicts_group_by_aggregate_3264(): +def list_of_dicts_aggregate_3264(): data = list_of_dicts("vehicles.json") start = time.time() (data @@ -189,7 +187,7 @@ def list_of_dicts_group_by_aggregate_3264(): cty=lambda x: mean(x.pluck("cty")))) return time.time() - start -def list_of_dicts_group_by_aggregate_14668(): +def list_of_dicts_aggregate_14668(): data = list_of_dicts("vehicles.json") start = time.time() (data @@ -200,10 +198,20 @@ def list_of_dicts_group_by_aggregate_14668(): cty=lambda x: mean(x.pluck("cty")))) return time.time() - start -def list_of_dicts_left_join(): +def list_of_dicts_full_join(): data = list_of_dicts("vehicles.json") meta = data.deepcopy().select("make", "model").unique() + meta = meta + meta.deepcopy().modify(model=lambda x: "X") meta = meta.modify(random=lambda x: random.random()) + assert len(meta.anti_join(data, "make", "model")) > 0 + start = time.time() + data.full_join(meta, "make", "model") + return time.time() - start + +def list_of_dicts_left_join(): + data = list_of_dicts("vehicles.json") + meta = data.deepcopy().select("make", "model").unique() + meta = meta.deepcopy().modify(random=lambda x: random.random()) start = time.time() data.left_join(meta, "make", "model") return time.time() - start @@ -221,7 +229,7 @@ def list_of_dicts_read_json(): def list_of_dicts_sort(): data = list_of_dicts("vehicles.csv") start = time.time() - data.sort(make=1, model=1, year=-1) + data.sort(make=1, model=1, year=1) return time.time() - start def vector_fast_list(): @@ -230,7 +238,7 @@ def vector_fast_list(): di.Vector.fast(seq, int) return time.time() - start -def vector_fast_np(): +def vector_fast_np_array(): seq = list(range(1_000_000)) seq = np.array(seq) start = time.time() @@ -243,7 +251,7 @@ def vector_new_list(): di.Vector(seq) return time.time() - start -def vector_new_np(): +def vector_new_np_array(): seq = list(range(1_000_000)) seq = np.array(seq) start = time.time() @@ -280,43 +288,39 @@ def vector_unique(): data.model.unique() return time.time() - start -BENCHMARKS = sorted([ - x for x in dir() if - x.startswith(("data_frame_", "list_of_dicts_", "vector_")) and - x not in ["data_frame_random"] -], key=lambda x: ( - [x.zfill(9) if x.isdigit() else x for x in x.split("_")] -)) +def is_benchmark(name): + prefixes = ("data_frame_", "list_of_dicts_", "vector_") + return name.startswith(prefixes) and name != "data_frame_random" -@click.command() -@click.option("-o", "--output", help="Filename for optional CSV output") -@click.option("-r", "--rounds", default=5, help="Number of rounds per benchmark") -@click.option("--version", default=di.__version__, help="Version number for CSV output") -@click.argument("pattern", nargs=-1) -def main(output, rounds, version, pattern): - """Benchmark dataiter functions.""" - benchmarks = BENCHMARKS.copy() - if pattern: - f = lambda x: any(y in x for y in pattern) - benchmarks = list(filter(f, benchmarks)) - results = di.ListOfDicts() +BENCHMARKS = sorted(filter(is_benchmark, dir()), key=lambda x: ( + [x.zfill(9) if x.isdigit() else x for x in x.split("_")])) + +def run_benchmarks(benchmarks, output, rounds): + width = max(map(len, benchmarks)) + 2 for i, benchmark in enumerate(benchmarks): - width = max(map(len, benchmarks)) - padding = "." * (width + 1 - len(benchmark)) - print(f"{i+1:2d}/{len(benchmarks)}. {benchmark} {padding} ", end="", flush=True) + print(f"{i+1:2d}/{len(benchmarks)}. ", end="", flush=True) + print(f"{benchmark+' ':.<{width}} ", end="", flush=True) try: f = globals()[benchmark] elapsed = 1000 * min(f() for i in range(rounds)) print("{:5.0f} ms".format(elapsed), flush=True) - except Exception as e: + except Exception as error: elapsed = -1 - print(e.__class__.__name__) + print(error.__class__.__name__) if not output: raise - list.append(results, { - "name": benchmark, - "version": version, - "elapsed": round(elapsed), - }) + yield {"name": benchmark, "elapsed": round(elapsed)} + +@click.command() +@click.option("-o", "--output", help="Filename for optional CSV output") +@click.option("-r", "--rounds", default=5, help="Number of rounds per benchmark") +@click.option("--version", default=di.__version__, help="Version number for CSV output") +@click.argument("pattern", nargs=-1) +def main(output, rounds, version, pattern): + pattern = pattern or "_" + f = lambda x: any(y in x for y in pattern) + benchmarks = list(filter(f, BENCHMARKS)) + results = di.ListOfDicts(run_benchmarks(benchmarks, output, rounds)) + results = results.modify(version=lambda x: version) if output: assert output.endswith(".csv") print(f"Writing {output}...")