From a1ea3caa5102a72ffdf40a94113e48b3690f1769 Mon Sep 17 00:00:00 2001 From: Keith Adams Date: Fri, 12 Jan 2024 19:39:50 -0800 Subject: [PATCH] Fix: python slices of splits used incorrect offsets. This was the cause of some SEGVs and negative-length Python string constructions. --- .gitignore | 2 ++ python/lib.c | 57 ++++++++++++++++++++++--------------------------- scripts/test.py | 27 ++++++++++++++++++++++- 3 files changed, 53 insertions(+), 33 deletions(-) diff --git a/.gitignore b/.gitignore index f8dc7bdb..a8eae29c 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,5 @@ build/ +build_debug/ .DS_Store tmp/ build_debug/ @@ -22,3 +23,4 @@ node_modules/ leipzig1M.txt xlsum.csv enwik9 +.venv/* diff --git a/python/lib.c b/python/lib.c index c2b81fa5..a431b9e1 100644 --- a/python/lib.c +++ b/python/lib.c @@ -711,44 +711,37 @@ static PyObject *Strs_subscript(Strs *self, PyObject *key) { // Depending on the layout, the procedure will be different. self_slice->type = self->type; switch (self->type) { - case STRS_CONSECUTIVE_32: { - struct consecutive_slices_32bit_t *from = &self->data.consecutive_32bit; - struct consecutive_slices_32bit_t *to = &self_slice->data.consecutive_32bit; - to->count = stop - start; - to->separator_length = from->separator_length; - to->parent = from->parent; - size_t first_length; - str_at_offset_consecutive_32bit(self, start, count, &to->parent, &to->start, &first_length); - uint32_t first_offset = to->start - from->start; - to->end_offsets = malloc(sizeof(uint32_t) * to->count); - if (to->end_offsets == NULL && PyErr_NoMemory()) { - Py_XDECREF(self_slice); - return NULL; - } - for (size_t i = 0; i != to->count; ++i) to->end_offsets[i] = from->end_offsets[i] - first_offset; - Py_INCREF(to->parent); +/* Usable as consecutive_logic(64bit), e.g. */ +#define consecutive_logic(type) \ + typedef uint64_t index_64bit_t; \ + typedef uint32_t index_32bit_t; \ + typedef index_##type##_t index_t; \ + typedef struct consecutive_slices_##type##_t slice_t; \ + slice_t *from = &self->data.consecutive_##type; \ + slice_t *to = &self_slice->data.consecutive_##type; \ + to->count = stop - start; \ + to->separator_length = from->separator_length; \ + to->parent = from->parent; \ + size_t first_length; \ + str_at_offset_consecutive_##type(self, start, count, &to->parent, &to->start, &first_length); \ + index_t first_offset = to->start - from->start; \ + to->end_offsets = malloc(sizeof(index_t) * to->count); \ + if (to->end_offsets == NULL && PyErr_NoMemory()) { \ + Py_XDECREF(self_slice); \ + return NULL; \ + } \ + for (size_t i = 0; i != to->count; ++i) to->end_offsets[i] = from->end_offsets[i + start] - first_offset; \ + Py_INCREF(to->parent); + case STRS_CONSECUTIVE_32: { + consecutive_logic(32bit); break; } case STRS_CONSECUTIVE_64: { - struct consecutive_slices_64bit_t *from = &self->data.consecutive_64bit; - struct consecutive_slices_64bit_t *to = &self_slice->data.consecutive_64bit; - to->count = stop - start; - to->separator_length = from->separator_length; - to->parent = from->parent; - - size_t first_length; - str_at_offset_consecutive_64bit(self, start, count, &to->parent, &to->start, &first_length); - uint64_t first_offset = to->start - from->start; - to->end_offsets = malloc(sizeof(uint64_t) * to->count); - if (to->end_offsets == NULL && PyErr_NoMemory()) { - Py_XDECREF(self_slice); - return NULL; - } - for (size_t i = 0; i != to->count; ++i) to->end_offsets[i] = from->end_offsets[i] - first_offset; - Py_INCREF(to->parent); + consecutive_logic(64bit); break; } +#undef consecutive_logic case STRS_REORDERED: { struct reordered_slices_t *from = &self->data.reordered; struct reordered_slices_t *to = &self_slice->data.reordered; diff --git a/scripts/test.py b/scripts/test.py index fa98a069..41cd17a2 100644 --- a/scripts/test.py +++ b/scripts/test.py @@ -43,7 +43,10 @@ def test_unit_contains(): def test_unit_rich_comparisons(): assert Str("aa") == "aa" assert Str("aa") < "b" - assert Str("abb")[1:] == "bb" + s2 = Str("abb") + assert s2[1:] == "bb" + assert s2[:-1] == "ab" + assert s2[-1:] == "b" def test_unit_buffer_protocol(): @@ -108,6 +111,28 @@ def test_unit_globals(): assert sz.edit_distance("abababab", "aaaaaaaa", 2) == 2 assert sz.edit_distance("abababab", "aaaaaaaa", bound=2) == 2 +def test_unit_len(): + w = sz.Str("abcd") + assert 4 == len(w) + +def test_slice_of_split(): + def impl(native_str): + native_split = native_str.split() + text = sz.Str(native_str) + split = text.split() + for split_idx in range(len(native_split)): + native_slice = native_split[split_idx:] + idx = split_idx + for word in split[split_idx:]: + assert str(word) == native_split[idx] + idx += 1 + native_str = 'Weebles wobble before they fall down, don\'t they?' + impl(native_str) + # ~5GB to overflow 32-bit sizes + copies = int(len(native_str) / 5e9) + # Eek. Cover 64-bit indices + impl(native_str * copies) + def get_random_string( length: Optional[int] = None,