Commit: Refactor benchmark (f8f758b)

otsaloma committed Dec 14, 2024
1 parent: b5d84db

Showing 3 changed files with 75 additions and 71 deletions.

.gitignore: 3 changes (2 additions, 1 deletion)

@@ -4,8 +4,9 @@
 .envrc
 .pytest_cache
 __pycache__
-benchmark*.csv
 benchmark-head.py
+benchmark-versions.csv
+benchmark-versions.ods
 build
 dataiter.egg-info
 dist

benchmark-versions.sh: 15 changes (7 additions, 8 deletions)

@@ -2,23 +2,22 @@

 SCRIPT=benchmark-head.py
 SCRIPT_ARGS="$@"
-OUTPUT_FILE=benchmark-versions.csv
+OUT_FILE=benchmark-versions.csv
 TMP_FILE=tmp.csv

 benchmark() {
     VERSION=$1
     printf "\n$VERSION:\n"
     git checkout -q $VERSION
     ./$SCRIPT -o $TMP_FILE --version=$VERSION $SCRIPT_ARGS || true
-    tail -n+2 $TMP_FILE >> $OUTPUT_FILE
-    sed -i 's/"//g' $OUTPUT_FILE
+    tail -n+2 $TMP_FILE >> $OUT_FILE
+    sed -i 's/"//g' $OUT_FILE
 }

 set -e
-rm -f $OUTPUT_FILE
-echo "name,version,elapsed" > $OUTPUT_FILE
+rm -f $OUT_FILE
+echo "name,version,elapsed" > $OUT_FILE
 cp -fv benchmark.py $SCRIPT
-for VERSION in 0.99 master; do
-    benchmark $VERSION
-done
+benchmark 0.99
+benchmark master
 rm -f $SCRIPT $TMP_FILE
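
Each run appends one row per benchmark to benchmark-versions.csv under the header name,version,elapsed written above. A minimal sketch of consuming that file for a version-by-version comparison (stdlib only; the filename and column names come from the script, the grouping is illustrative):

    import csv
    from collections import defaultdict

    times = defaultdict(dict)  # benchmark name -> {version: elapsed ms}
    with open("benchmark-versions.csv", newline="") as f:
        for row in csv.DictReader(f):
            times[row["name"]][row["version"]] = int(row["elapsed"])

    for name, by_version in sorted(times.items()):
        print(name, " ".join(f"{v}={ms}ms" for v, ms in by_version.items()))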

benchmark.py: 128 changes (66 additions, 62 deletions)

@@ -21,21 +21,15 @@ def _data_frame(path, nrow):
 def data_frame(path, nrow=1_000_000):
     return _data_frame(path, nrow).deepcopy()

-def data_frame_random(nrows, ngroups):
+@functools.cache
+def _data_frame_random(nrows, ngroups):
     return di.DataFrame(g=np.random.choice(ngroups, nrows, replace=True),
                         a=np.random.normal(10, 2, nrows))

-def data_frame_full_join():
-    data = data_frame("vehicles.csv")
-    meta = data.select("make", "model").unique()
-    meta = meta.rbind(meta.modify(model="X"))
-    meta.random = np.random.random(meta.nrow)
-    assert meta.anti_join(data, "make", "model").nrow > 0
-    start = time.time()
-    data.full_join(meta, "make", "model")
-    return time.time() - start
+def data_frame_random(nrows, ngroups):
+    return _data_frame_random(nrows, ngroups).deepcopy()

-def data_frame_group_by_aggregate_128():
+def data_frame_aggregate_128():
     data = data_frame("vehicles.csv")
     start = time.time()
     (data
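
The caching pattern introduced above, in miniature: functools.cache memoizes the expensive constructor, and the public wrapper hands each caller a deep copy, so benchmarks that mutate their data cannot corrupt the shared cached object. A self-contained sketch (the hypothetical _load/load names stand in for _data_frame_random/data_frame_random):

    import copy
    import functools

    @functools.cache
    def _load(n):
        print("building...")  # printed once per distinct n
        return {"values": list(range(n))}

    def load(n):
        # Fresh copy per call: safe to mutate without touching the cache.
        return copy.deepcopy(_load(n))

    a = load(3)               # prints "building..."
    b = load(3)               # cache hit, no print
    a["values"].append(99)
    assert b["values"] == [0, 1, 2]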
@@ -46,7 +40,7 @@ def data_frame_group_by_aggregate_128():
                 cty=di.mean("cty")))
     return time.time() - start

-def data_frame_group_by_aggregate_3264():
+def data_frame_aggregate_3264():
     data = data_frame("vehicles.csv")
     start = time.time()
     (data
@@ -57,7 +51,7 @@ def data_frame_group_by_aggregate_3264():
                 cty=di.mean("cty")))
     return time.time() - start

-def data_frame_group_by_aggregate_14668():
+def data_frame_aggregate_14668():
     data = data_frame("vehicles.csv")
     start = time.time()
     (data
@@ -68,7 +62,7 @@ def data_frame_group_by_aggregate_14668():
                 cty=di.mean("cty")))
     return time.time() - start

-def data_frame_group_by_aggregate_100000_lambda():
+def data_frame_aggregate_100000_lambda():
     data = data_frame_random(1_000_000, 100_000)
     start = time.time()
     (data
@@ -78,7 +72,7 @@ def data_frame_group_by_aggregate_100000_lambda():
                a_std=lambda x: np.std(x.a)))
     return time.time() - start

-def data_frame_group_by_aggregate_100000_short():
+def data_frame_aggregate_100000_short():
     with patch("dataiter.USE_NUMBA", False):
         data = data_frame_random(1_000_000, 100_000)
         start = time.time()
@@ -89,7 +83,7 @@ def data_frame_group_by_aggregate_100000_short():
                    a_std=di.std("a")))
         return time.time() - start

-def data_frame_group_by_aggregate_100000_short_numba():
+def data_frame_aggregate_100000_short_numba():
     with patch("dataiter.USE_NUMBA", True):
         data = data_frame_random(1_000_000, 100_000)
         start = time.time()
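
The with patch(...) context used by the short/numba variants temporarily replaces a module-level attribute and restores it on exit, which is what keeps the two variants comparable. A small sketch of just that mechanism (unittest.mock.patch on dataiter's USE_NUMBA flag, as in the code above):

    from unittest.mock import patch

    import dataiter as di

    before = di.USE_NUMBA
    with patch("dataiter.USE_NUMBA", not before):
        assert di.USE_NUMBA == (not before)  # flipped inside the block
    assert di.USE_NUMBA == before            # restored afterwards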
@@ -100,6 +94,16 @@ def data_frame_group_by_aggregate_100000_short_numba():
                    a_std=di.std("a")))
         return time.time() - start

+def data_frame_full_join():
+    data = data_frame("vehicles.csv")
+    meta = data.select("make", "model").unique()
+    meta = meta.rbind(meta.modify(model="X"))
+    meta.random = np.random.random(meta.nrow)
+    assert meta.anti_join(data, "make", "model").nrow > 0
+    start = time.time()
+    data.full_join(meta, "make", "model")
+    return time.time() - start
+
 def data_frame_left_join():
     data = data_frame("vehicles.csv")
     meta = data.select("make", "model").unique()
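
For context on what the join benchmarks measure: left_join keeps all rows of the left table, while full_join also keeps right-side rows that match nothing, which is why data_frame_full_join asserts that meta contains unmatched rows. A toy illustration using the same dataiter calls (invented two-row tables, not the benchmark's vehicles data):

    import dataiter as di

    data = di.DataFrame(make=["audi", "bmw"], hwy=[30, 28])
    meta = di.DataFrame(make=["bmw", "kia"], random=[0.1, 0.2])
    print(data.left_join(meta, "make").nrow)  # 2: audi, bmw
    print(data.full_join(meta, "make").nrow)  # 3: audi, bmw, kia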
@@ -151,23 +155,17 @@ def data_frame_unique():
     data.unique("make", "model", "year")
     return time.time() - start

-def list_of_dicts(path, length=100_000):
+def _list_of_dicts(path, length):
     data = test.list_of_dicts(path)
     n = length // len(data) + 1
     data = data * n
     return data.head(length)

-def list_of_dicts_full_join():
-    data = list_of_dicts("vehicles.json")
-    meta = data.deepcopy().select("make", "model").unique()
-    meta = meta + meta.deepcopy().modify(model=lambda x: "X")
-    meta = meta.modify(random=lambda x: random.random())
-    assert len(meta.anti_join(data, "make", "model")) > 0
-    start = time.time()
-    data.full_join(meta, "make", "model")
-    return time.time() - start
+@functools.cache
+def list_of_dicts(path, length=100_000):
+    return _list_of_dicts(path, length).deepcopy()

-def list_of_dicts_group_by_aggregate_128():
+def list_of_dicts_aggregate_128():
     data = list_of_dicts("vehicles.json")
     start = time.time()
     (data
@@ -178,7 +176,7 @@ def list_of_dicts_group_by_aggregate_128():
                 cty=lambda x: mean(x.pluck("cty"))))
     return time.time() - start

-def list_of_dicts_group_by_aggregate_3264():
+def list_of_dicts_aggregate_3264():
     data = list_of_dicts("vehicles.json")
     start = time.time()
     (data
(data
Expand All @@ -189,7 +187,7 @@ def list_of_dicts_group_by_aggregate_3264():
cty=lambda x: mean(x.pluck("cty"))))
return time.time() - start

def list_of_dicts_group_by_aggregate_14668():
def list_of_dicts_aggregate_14668():
data = list_of_dicts("vehicles.json")
start = time.time()
(data
Expand All @@ -200,10 +198,20 @@ def list_of_dicts_group_by_aggregate_14668():
cty=lambda x: mean(x.pluck("cty"))))
return time.time() - start

def list_of_dicts_left_join():
def list_of_dicts_full_join():
data = list_of_dicts("vehicles.json")
meta = data.deepcopy().select("make", "model").unique()
meta = meta + meta.deepcopy().modify(model=lambda x: "X")
meta = meta.modify(random=lambda x: random.random())
assert len(meta.anti_join(data, "make", "model")) > 0
start = time.time()
data.full_join(meta, "make", "model")
return time.time() - start

def list_of_dicts_left_join():
data = list_of_dicts("vehicles.json")
meta = data.deepcopy().select("make", "model").unique()
meta = meta.deepcopy().modify(random=lambda x: random.random())
start = time.time()
data.left_join(meta, "make", "model")
return time.time() - start
@@ -221,7 +229,7 @@ def list_of_dicts_read_json():
 def list_of_dicts_sort():
     data = list_of_dicts("vehicles.csv")
     start = time.time()
-    data.sort(make=1, model=1, year=-1)
+    data.sort(make=1, model=1, year=1)
     return time.time() - start

 def vector_fast_list():
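
The one-character fix above touches dataiter's sort convention: in ListOfDicts.sort, 1 sorts a key ascending and -1 descending, so the old year=-1 was timing a descending sort on year. A quick illustration (three invented rows):

    import dataiter as di

    data = di.ListOfDicts([{"year": 2020}, {"year": 2024}, {"year": 2022}])
    print(data.sort(year=1).pluck("year"))   # [2020, 2022, 2024]
    print(data.sort(year=-1).pluck("year"))  # [2024, 2022, 2020]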
@@ -230,7 +238,7 @@ def vector_fast_list():
     di.Vector.fast(seq, int)
     return time.time() - start

-def vector_fast_np():
+def vector_fast_np_array():
     seq = list(range(1_000_000))
     seq = np.array(seq)
     start = time.time()
@@ -243,7 +251,7 @@ def vector_new_list():
     di.Vector(seq)
     return time.time() - start

-def vector_new_np():
+def vector_new_np_array():
     seq = list(range(1_000_000))
     seq = np.array(seq)
     start = time.time()
@@ -280,43 +288,39 @@ def vector_unique():
     data.model.unique()
     return time.time() - start

-BENCHMARKS = sorted([
-    x for x in dir() if
-    x.startswith(("data_frame_", "list_of_dicts_", "vector_")) and
-    x not in ["data_frame_random"]
-], key=lambda x: (
-    [x.zfill(9) if x.isdigit() else x for x in x.split("_")]
-))
+def is_benchmark(name):
+    prefixes = ("data_frame_", "list_of_dicts_", "vector_")
+    return name.startswith(prefixes) and name != "data_frame_random"

-@click.command()
-@click.option("-o", "--output", help="Filename for optional CSV output")
-@click.option("-r", "--rounds", default=5, help="Number of rounds per benchmark")
-@click.option("--version", default=di.__version__, help="Version number for CSV output")
-@click.argument("pattern", nargs=-1)
-def main(output, rounds, version, pattern):
-    """Benchmark dataiter functions."""
-    benchmarks = BENCHMARKS.copy()
-    if pattern:
-        f = lambda x: any(y in x for y in pattern)
-        benchmarks = list(filter(f, benchmarks))
-    results = di.ListOfDicts()
+BENCHMARKS = sorted(filter(is_benchmark, dir()), key=lambda x: (
+    [x.zfill(9) if x.isdigit() else x for x in x.split("_")]))
+
+def run_benchmarks(benchmarks, output, rounds):
+    width = max(map(len, benchmarks)) + 2
     for i, benchmark in enumerate(benchmarks):
-        width = max(map(len, benchmarks))
-        padding = "." * (width + 1 - len(benchmark))
-        print(f"{i+1:2d}/{len(benchmarks)}. {benchmark} {padding} ", end="", flush=True)
+        print(f"{i+1:2d}/{len(benchmarks)}. ", end="", flush=True)
+        print(f"{benchmark+' ':.<{width}} ", end="", flush=True)
         try:
             f = globals()[benchmark]
             elapsed = 1000 * min(f() for i in range(rounds))
             print("{:5.0f} ms".format(elapsed), flush=True)
-        except Exception as e:
+        except Exception as error:
             elapsed = -1
-            print(e.__class__.__name__)
+            print(error.__class__.__name__)
+            if not output: raise
-        list.append(results, {
-            "name": benchmark,
-            "version": version,
-            "elapsed": round(elapsed),
-        })
+        yield {"name": benchmark, "elapsed": round(elapsed)}

+@click.command()
+@click.option("-o", "--output", help="Filename for optional CSV output")
+@click.option("-r", "--rounds", default=5, help="Number of rounds per benchmark")
+@click.option("--version", default=di.__version__, help="Version number for CSV output")
+@click.argument("pattern", nargs=-1)
+def main(output, rounds, version, pattern):
+    pattern = pattern or "_"
+    f = lambda x: any(y in x for y in pattern)
+    benchmarks = list(filter(f, BENCHMARKS))
+    results = di.ListOfDicts(run_benchmarks(benchmarks, output, rounds))
+    results = results.modify(version=lambda x: version)
     if output:
         assert output.endswith(".csv")
         print(f"Writing {output}...")