Skip to content

Commit

Permalink
Improve: Extend benchmarks
Browse files Browse the repository at this point in the history
  • Loading branch information
ashvardanian committed Jan 22, 2024
1 parent b6face0 commit 1a4c4dc
Show file tree
Hide file tree
Showing 4 changed files with 132 additions and 27 deletions.
2 changes: 1 addition & 1 deletion scripts/bench.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
#include <stringzilla/stringzilla.h>
#include <stringzilla/stringzilla.hpp>

#ifdef SZ_DEBUG // Make debugging faster
#if SZ_DEBUG // Make debugging faster
#define default_seconds_m 10
#else
#define default_seconds_m 30
Expand Down
91 changes: 67 additions & 24 deletions scripts/bench_search.py
Original file line number Diff line number Diff line change
@@ -1,61 +1,104 @@
import time
import re
import random
from typing import List

import fire

from stringzilla import Str, File
from stringzilla import Str


def log(name: str, bytes_length: int, operator: callable):
def log(name: str, haystack, patterns, operator: callable):
a = time.time_ns()
operator()
for pattern in patterns:
operator(haystack, pattern)
b = time.time_ns()
bytes_length = len(haystack) * len(patterns)
secs = (b - a) / 1e9
gb_per_sec = bytes_length / (1e9 * secs)
print(f"{name}: took {secs:} seconds ~ {gb_per_sec:.3f} GB/s")


def find_all(haystack, pattern) -> int:
    """Count the occurrences of ``pattern`` in ``haystack``, scanning forward.

    Every search resumes one position after the previous match, so
    overlapping occurrences are counted as well. ``haystack`` only needs a
    ``find(pattern, start)`` method that returns ``-1`` when nothing is found.
    """
    matches = 0
    cursor = haystack.find(pattern, 0)
    while cursor != -1:
        matches += 1
        # Step a single position past the hit to keep overlapping matches.
        cursor = haystack.find(pattern, cursor + 1)
    return matches


def rfind_all(haystack, pattern) -> int:
    """Count the occurrences of ``pattern`` in ``haystack``, scanning backward.

    Mirrors ``find_all``: overlapping occurrences are counted, so the forward
    and the reverse benchmarks perform the same number of searches.

    Args:
        haystack: Object exposing ``__len__`` and ``rfind(pattern, start, end)``
            that returns ``-1`` when nothing is found.
        pattern: The needle to look for; may be empty.

    Returns:
        The number of (possibly overlapping) matches.
    """
    count = 0
    end = len(haystack)
    # The next match must start strictly before the current one, hence it may
    # end at most `len(pattern) - 1` positions past the current match start.
    overlap = len(pattern) - 1
    while True:
        index = haystack.rfind(pattern, 0, end)
        if index == -1:
            break
        count += 1
        if index == 0:
            # Nothing can start earlier. Stopping here also keeps `end` from
            # going negative, which would be treated as an offset from the
            # end of the haystack and loop forever on an empty pattern.
            break
        end = index + overlap
    return count


def find_all_regex(haystack: str, characters: str) -> int:
    """Count the positions in ``haystack`` matching any of ``characters``.

    ``characters`` is placed verbatim between ``[`` and ``]``, so it is read
    as the body of a regular-expression character class: escapes such as
    ``\\t`` and class metacharacters are interpreted by ``re``.
    """
    character_class = re.compile(f"[{characters}]")
    return sum(1 for _ in character_class.finditer(haystack))


def find_all_sets(haystack: Str, characters: str) -> int:
    """Count the positions in ``haystack`` reported by ``find_first_of``.

    The search is restarted one position after every hit and stops as soon
    as ``find_first_of(characters, start)`` returns ``-1``.
    """
    hits = 0
    cursor = haystack.find_first_of(characters, 0)
    while cursor != -1:
        hits += 1
        cursor = haystack.find_first_of(characters, cursor + 1)
    return hits


def log_functionality(
pattern: str,
bytes_length: int,
tokens: List[str],
pythonic_str: str,
stringzilla_str: Str,
stringzilla_file: File,
):
log("str.count", bytes_length, lambda: pythonic_str.count(pattern))
log("Str.count", bytes_length, lambda: stringzilla_str.count(pattern))
if stringzilla_file:
log("File.count", bytes_length, lambda: stringzilla_file.count(pattern))

log("str.split", bytes_length, lambda: pythonic_str.split(pattern))
log("Str.split", bytes_length, lambda: stringzilla_str.split(pattern))
if stringzilla_file:
log("File.split", bytes_length, lambda: stringzilla_file.split(pattern))

log("str.split.sort", bytes_length, lambda: pythonic_str.split(pattern).sort())
log("Str.split.sort", bytes_length, lambda: stringzilla_str.split(pattern).sort())
if stringzilla_file:
log("File.split", bytes_length, lambda: stringzilla_file.split(pattern).sort())
log("str.find", pythonic_str, tokens, find_all)
log("Str.find", stringzilla_str, tokens, find_all)
log("str.rfind", pythonic_str, tokens, rfind_all)
log("Str.rfind", stringzilla_str, tokens, rfind_all)
log("re.finditer", pythonic_str, [r" \t\n\r"], find_all_regex)
log("Str.find_first_of", stringzilla_str, [r" \t\n\r"], find_all_sets)


def bench(
needle: str = None,
haystack_path: str = None,
haystack_pattern: str = None,
haystack_length: int = None,
):
if haystack_path:
pythonic_str: str = open(haystack_path, "r").read()
stringzilla_file = File(haystack_path)
else:
haystack_length = int(haystack_length)
repetitions = haystack_length // len(haystack_pattern)
pythonic_str: str = haystack_pattern * repetitions
stringzilla_file = None

stringzilla_str = Str(pythonic_str)
tokens = pythonic_str.split()
total_tokens = len(tokens)
mean_token_length = sum(len(t) for t in tokens) / total_tokens

print(
f"Parsed the file with {total_tokens:,} words of {mean_token_length:.2f} mean length!"
)

tokens = random.sample(tokens, 100)
log_functionality(
needle, len(stringzilla_str), pythonic_str, stringzilla_str, stringzilla_file
tokens,
pythonic_str,
stringzilla_str,
)


Expand Down
8 changes: 6 additions & 2 deletions scripts/bench_similarity.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -72,8 +72,12 @@ void bench_similarity_on_bio_data() {
std::size_t length_upper_bound;
char const *name;
} bio_cases[] = {
{60, 60, "60 aminoacids"}, {100, 100, "100 aminoacids"}, {300, 300, "300 aminoacids"},
{1000, 1000, "1000 aminoacids"}, {100, 1000, "100-1000 aminoacids"}, {1000, 10000, "1000-10000 aminoacids"},
{60, 60, "60 aminoacids"}, //
{100, 100, "100 aminoacids"}, //
{300, 300, "300 aminoacids"}, //
{1000, 1000, "1000 aminoacids"}, //
{100, 1000, "100-1000 aminoacids"}, //
{1000, 10000, "1000-10000 aminoacids"}, //
};
std::random_device random_device;
std::mt19937 generator(random_device());
Expand Down
58 changes: 58 additions & 0 deletions scripts/bench_sort.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
import time

import fire

from stringzilla import Str, File


def log(name: str, bytes_length: int, operator: callable):
    """Run ``operator`` once and print how long it took and its throughput.

    Args:
        name: Label printed in front of the measurement.
        bytes_length: Number of bytes attributed to the operation; used only
            to derive the GB/s figure.
        operator: Zero-argument callable to time; its result is discarded.
    """
    # Use the monotonic high-resolution counter: the wall clock can be
    # adjusted mid-run and may be too coarse for short operations.
    started = time.perf_counter_ns()
    operator()
    finished = time.perf_counter_ns()
    secs = (finished - started) / 1e9
    # A zero reading would otherwise abort the benchmark with a division error.
    gb_per_sec = bytes_length / (1e9 * secs) if secs > 0 else float("inf")
    print(f"{name}: took {secs:} seconds ~ {gb_per_sec:.3f} GB/s")


def log_functionality(
    pattern: str,
    bytes_length: int,
    pythonic_str: str,
    stringzilla_str: Str,
    stringzilla_file: File,
):
    """Benchmark splitting, and splitting followed by sorting, on each input.

    Args:
        pattern: Separator handed to every ``split`` call.
        bytes_length: Size reported to ``log`` for the throughput estimate.
        pythonic_str: The haystack as a built-in ``str``.
        stringzilla_str: The same haystack wrapped into ``Str``.
        stringzilla_file: The haystack as a ``File``, or a falsy value to
            skip the file-based measurements.
    """
    log("str.split", bytes_length, lambda: pythonic_str.split(pattern))
    log("Str.split", bytes_length, lambda: stringzilla_str.split(pattern))
    if stringzilla_file:
        log("File.split", bytes_length, lambda: stringzilla_file.split(pattern))

    log("str.split.sort", bytes_length, lambda: pythonic_str.split(pattern).sort())
    log("Str.split.sort", bytes_length, lambda: stringzilla_str.split(pattern).sort())
    if stringzilla_file:
        # Labelled distinctly from the plain "File.split" run above, so the
        # two measurements can be told apart in the output.
        log(
            "File.split.sort",
            bytes_length,
            lambda: stringzilla_file.split(pattern).sort(),
        )


def bench(
    haystack_path: str = None,
    haystack_pattern: str = None,
    haystack_length: int = None,
    needle: str = None,
):
    """Build the haystack and run the split/sort benchmarks on it.

    The haystack is either read from ``haystack_path`` or synthesized by
    repeating ``haystack_pattern`` as many whole times as fit into
    ``haystack_length`` characters.

    Args:
        haystack_path: Path of a text file to use as the haystack.
        haystack_pattern: Text to repeat when no path is given.
        haystack_length: Target length of the synthesized haystack.
        needle: Separator passed to the ``split`` calls.

    Raises:
        ValueError: If neither a path nor a non-empty pattern with a length
            is provided.
    """
    pythonic_str: str
    if haystack_path:
        # Close the handle deterministically instead of leaking it.
        with open(haystack_path, "r") as haystack_file:
            pythonic_str = haystack_file.read()
        stringzilla_file = File(haystack_path)
    else:
        if not haystack_pattern or haystack_length is None:
            raise ValueError(
                "Provide either `haystack_path` or both "
                "`haystack_pattern` and `haystack_length`"
            )
        haystack_length = int(haystack_length)
        repetitions = haystack_length // len(haystack_pattern)
        pythonic_str = haystack_pattern * repetitions
        stringzilla_file = None

    stringzilla_str = Str(pythonic_str)

    log_functionality(
        needle, len(stringzilla_str), pythonic_str, stringzilla_str, stringzilla_file
    )


# Script entry point: hand `bench` to Fire when the file is run directly.
if __name__ == "__main__":
    fire.Fire(bench)

0 comments on commit 1a4c4dc

Please sign in to comment.