Skip to content

Commit

Permalink
Merge branch 'main-dev' of https://github.com/ashvardanian/StringZilla
Browse files Browse the repository at this point in the history
…into main-dev
  • Loading branch information
ashvardanian committed Jan 20, 2024
2 parents 5da45bc + 41b8eda commit b6face0
Show file tree
Hide file tree
Showing 3 changed files with 54 additions and 33 deletions.
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
build/
build_debug/
.DS_Store
tmp/
build_debug/
Expand All @@ -16,9 +17,11 @@ substr_search_cpp
*.egg-info
*.whl
*.pyd
.venv/*
node_modules/

# Recommended datasets
leipzig1M.txt
enwik9.txt
xlsum.csv
enwik9
57 changes: 25 additions & 32 deletions python/lib.c
Original file line number Diff line number Diff line change
Expand Up @@ -711,44 +711,37 @@ static PyObject *Strs_subscript(Strs *self, PyObject *key) {
// Depending on the layout, the procedure will be different.
self_slice->type = self->type;
switch (self->type) {
case STRS_CONSECUTIVE_32: {
struct consecutive_slices_32bit_t *from = &self->data.consecutive_32bit;
struct consecutive_slices_32bit_t *to = &self_slice->data.consecutive_32bit;
to->count = stop - start;
to->separator_length = from->separator_length;
to->parent = from->parent;

size_t first_length;
str_at_offset_consecutive_32bit(self, start, count, &to->parent, &to->start, &first_length);
uint32_t first_offset = to->start - from->start;
to->end_offsets = malloc(sizeof(uint32_t) * to->count);
if (to->end_offsets == NULL && PyErr_NoMemory()) {
Py_XDECREF(self_slice);
return NULL;
}
for (size_t i = 0; i != to->count; ++i) to->end_offsets[i] = from->end_offsets[i] - first_offset;
Py_INCREF(to->parent);
/* Usable as consecutive_logic(64bit), e.g. */
#define consecutive_logic(type) \
typedef uint64_t index_64bit_t; \
typedef uint32_t index_32bit_t; \
typedef index_##type##_t index_t; \
typedef struct consecutive_slices_##type##_t slice_t; \
slice_t *from = &self->data.consecutive_##type; \
slice_t *to = &self_slice->data.consecutive_##type; \
to->count = stop - start; \
to->separator_length = from->separator_length; \
to->parent = from->parent; \
size_t first_length; \
str_at_offset_consecutive_##type(self, start, count, &to->parent, &to->start, &first_length); \
index_t first_offset = to->start - from->start; \
to->end_offsets = malloc(sizeof(index_t) * to->count); \
if (to->end_offsets == NULL && PyErr_NoMemory()) { \
Py_XDECREF(self_slice); \
return NULL; \
} \
for (size_t i = 0; i != to->count; ++i) to->end_offsets[i] = from->end_offsets[i + start] - first_offset; \
Py_INCREF(to->parent);
case STRS_CONSECUTIVE_32: {
consecutive_logic(32bit);
break;
}
case STRS_CONSECUTIVE_64: {
struct consecutive_slices_64bit_t *from = &self->data.consecutive_64bit;
struct consecutive_slices_64bit_t *to = &self_slice->data.consecutive_64bit;
to->count = stop - start;
to->separator_length = from->separator_length;
to->parent = from->parent;

size_t first_length;
str_at_offset_consecutive_64bit(self, start, count, &to->parent, &to->start, &first_length);
uint64_t first_offset = to->start - from->start;
to->end_offsets = malloc(sizeof(uint64_t) * to->count);
if (to->end_offsets == NULL && PyErr_NoMemory()) {
Py_XDECREF(self_slice);
return NULL;
}
for (size_t i = 0; i != to->count; ++i) to->end_offsets[i] = from->end_offsets[i] - first_offset;
Py_INCREF(to->parent);
consecutive_logic(64bit);
break;
}
#undef consecutive_logic
case STRS_REORDERED: {
struct reordered_slices_t *from = &self->data.reordered;
struct reordered_slices_t *to = &self_slice->data.reordered;
Expand Down
27 changes: 26 additions & 1 deletion scripts/test.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,10 @@ def test_unit_contains():
def test_unit_rich_comparisons():
assert Str("aa") == "aa"
assert Str("aa") < "b"
assert Str("abb")[1:] == "bb"
s2 = Str("abb")
assert s2[1:] == "bb"
assert s2[:-1] == "ab"
assert s2[-1:] == "b"


def test_unit_buffer_protocol():
Expand Down Expand Up @@ -108,6 +111,28 @@ def test_unit_globals():
assert sz.edit_distance("abababab", "aaaaaaaa", 2) == 2
assert sz.edit_distance("abababab", "aaaaaaaa", bound=2) == 2

def test_unit_len():
w = sz.Str("abcd")
assert 4 == len(w)

def test_slice_of_split():
def impl(native_str):
native_split = native_str.split()
text = sz.Str(native_str)
split = text.split()
for split_idx in range(len(native_split)):
native_slice = native_split[split_idx:]
idx = split_idx
for word in split[split_idx:]:
assert str(word) == native_split[idx]
idx += 1
native_str = 'Weebles wobble before they fall down, don\'t they?'
impl(native_str)
# ~5GB to overflow 32-bit sizes
copies = int(len(native_str) / 5e9)
# Eek. Cover 64-bit indices
impl(native_str * copies)


def get_random_string(
length: Optional[int] = None,
Expand Down

0 comments on commit b6face0

Please sign in to comment.