diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 16ac4d8..0e34a8e 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -42,4 +42,5 @@ jobs: go-version: '1.20' - run: go env - - run: go test ./mdbx ./exp/mdbxpool + - run: go test -p 1 ./mdbx + - run: go test -p 1 ./exp/mdbxpool diff --git a/mdbx/cursor_test.go b/mdbx/cursor_test.go index 08676b4..5f59f91 100644 --- a/mdbx/cursor_test.go +++ b/mdbx/cursor_test.go @@ -730,7 +730,299 @@ func TestCursor_Del(t *testing.T) { } } +func TestDupCursor_EmptyKeyValues1(t *testing.T) { + env, _ := setup(t) + + var db DBI + err := env.Update(func(txn *Txn) (err error) { + db, err = txn.OpenDBI("testingdup", Create|DupSort, nil, nil) + if err != nil { + return err + } + cur, err := txn.OpenCursor(db) + if err != nil { + return err + } + defer cur.Close() + + // empty value - must function as valid dupsort value + if err = txn.Put(db, []byte{1}, []byte{}, 0); err != nil { + panic(err) + } + if err = txn.Put(db, []byte{1}, []byte{8}, 0); err != nil { + panic(err) + } + + _, v, err := cur.Get([]byte{1}, []byte{}, GetBothRange) + if err != nil { + panic(err) + } + if !bytes.Equal(v, []byte{}) { + panic(v) + } + _, v, err = cur.Get([]byte{1}, []byte{0}, GetBothRange) + if err != nil { + panic(err) + } + if !bytes.Equal(v, []byte{8}) { + panic(v) + } + _, v, err = cur.Get([]byte{}, []byte{0}, GetBoth) + if err == nil { + panic("expecting 'not found' error") + } + if v != nil { + panic(v) + } + + // can use empty key as valid key in non-dupsort operations + k, v, err := cur.Get([]byte{}, nil, SetRange) + if err != nil { + panic(err) + } + if k == nil { + panic("nil") + } + if !bytes.Equal(k, []byte{1}) { + panic(fmt.Sprintf("%x", k)) + } + if !bytes.Equal(v, []byte{}) { + panic(fmt.Sprintf("%x", v)) + } + k, v, err = cur.Get([]byte{}, nil, Set) + if err == nil { + panic("expected 'not found' error") + } + if k != nil { + panic("nil") + } + + return nil + }) + if err != nil { + 
t.Error(err) + } +} + +func TestDupCursor_EmptyKeyValues2(t *testing.T) { + t.Skip() + env, _ := setup(t) + + var db DBI + err := env.Update(func(txn *Txn) (err error) { + db, err = txn.OpenDBI("testingdup", Create|DupSort, nil, nil) + if err != nil { + return err + } + cur, err := txn.OpenCursor(db) + if err != nil { + return err + } + defer cur.Close() + + // empty value - must function as valid dupsort value + if err = txn.Put(db, []byte{1}, []byte{}, 0); err != nil { + panic(err) + } + if err = txn.Put(db, []byte{1}, []byte{8}, 0); err != nil { + panic(err) + } + + _, v, err := cur.Get([]byte{1}, []byte{}, GetBothRange) + if err != nil { + panic(err) + } + if !bytes.Equal(v, []byte{}) { + panic(v) + } + _, v, err = cur.Get([]byte{1}, []byte{0}, GetBothRange) + if err != nil { + panic(err) + } + if !bytes.Equal(v, []byte{8}) { + panic(v) + } + _, v, err = cur.Get([]byte{}, []byte{0}, GetBoth) + if err == nil { + panic("expecting 'not found' error") + } + if v != nil { + panic(v) + } + + // can use empty key as valid key in non-dupsort operations + k, v, err := cur.Get([]byte{}, nil, SetRange) + if err != nil { + panic(err) + } + if k == nil { + panic("nil") + } + if !bytes.Equal(k, []byte{1}) { + panic(fmt.Sprintf("%x", k)) + } + if !bytes.Equal(v, []byte{}) { + panic(fmt.Sprintf("%x", v)) + } + k, v, err = cur.Get([]byte{}, nil, Set) + if err == nil { + panic("expected 'not found' error") + } + if k != nil { + panic("nil") + } + + // empty key - must function as valid dupsort key + if err = txn.Put(db, []byte{}, []byte{}, 0); err != nil { + panic(err) + } + if err = txn.Put(db, []byte{}, []byte{2}, 0); err != nil { + panic(err) + } + _, v, err = cur.Get([]byte{}, []byte{}, GetBothRange) + if err != nil { + panic(err) + } + if !bytes.Equal(v, []byte{}) { + panic(v) + } + _, v, err = cur.Get([]byte{}, []byte{0}, GetBothRange) + if err != nil { + panic(err) + } + if !bytes.Equal(v, []byte{2}) { + panic(v) + } + _, v, err = cur.Get([]byte{}, []byte{0}, GetBoth) + 
if err == nil { + panic("expecting 'not found' error ") + } + if v != nil { + panic(v) + } + + return nil + }) + if err != nil { + t.Error(err) + } +} + +func TestDupCursor_EmptyKeyValues3(t *testing.T) { + t.Skip() + env, _ := setup(t) + + var db DBI + err := env.Update(func(txn *Txn) (err error) { + db, err = txn.OpenDBI("testingdup", Create|DupSort, nil, nil) + if err != nil { + return err + } + cur, err := txn.OpenCursor(db) + if err != nil { + return err + } + defer cur.Close() + + // empty value - must function as valid dupsort value + if err = txn.Put(db, []byte{1}, []byte{}, 0); err != nil { + panic(err) + } + if err = txn.Put(db, []byte{1}, []byte{8}, 0); err != nil { + panic(err) + } + + _, v, err := cur.Get([]byte{1}, []byte{}, GetBothRange) + if err != nil { + panic(err) + } + if !bytes.Equal(v, []byte{}) { + panic(v) + } + _, v, err = cur.Get([]byte{1}, []byte{0}, GetBothRange) + if err != nil { + panic(err) + } + if !bytes.Equal(v, []byte{8}) { + panic(v) + } + _, v, err = cur.Get([]byte{}, []byte{0}, GetBoth) + if err == nil { + panic("expecting 'not found' error") + } + if v != nil { + panic(v) + } + + // can use empty key as valid key in non-dupsort operations + k, v, err := cur.Get([]byte{}, nil, SetRange) + if err != nil { + panic(err) + } + if k == nil { + panic("nil") + } + if !bytes.Equal(k, []byte{1}) { + panic(fmt.Sprintf("%x", k)) + } + if !bytes.Equal(v, []byte{}) { + panic(fmt.Sprintf("%x", v)) + } + k, v, err = cur.Get([]byte{}, nil, Set) + if err == nil { + panic("expected 'not found' error") + } + if k != nil { + panic("nil") + } + + // empty key - must function as valid dupsort key + if err = txn.Put(db, []byte{}, []byte{}, 0); err != nil { + panic(err) + } + if err = txn.Put(db, []byte{}, []byte{2}, 0); err != nil { + panic(err) + } + _, v, err = cur.Get([]byte{}, []byte{}, GetBothRange) + if err != nil { + panic(err) + } + if !bytes.Equal(v, []byte{}) { + panic(v) + } + _, v, err = cur.Get([]byte{}, []byte{0}, GetBothRange) + if err 
!= nil { + panic(err) + } + if !bytes.Equal(v, []byte{2}) { + panic(v) + } + _, v, err = cur.Get([]byte{}, []byte{0}, GetBoth) + if err == nil { + panic("expecting 'not found' error ") + } + if v != nil { + panic(v) + } + + // non-existing key + _, v, err = cur.Get([]byte{7}, []byte{}, GetBoth) + if err == nil { + panic("expecting 'not found' error") + } + if v != nil { + panic(v) + } + + return nil + }) + if err != nil { + t.Error(err) + } +} + func TestDupCursor_EmptyKeyValues(t *testing.T) { + t.Skip() env, _ := setup(t) var db DBI diff --git a/mdbx/mdbx.c b/mdbx/mdbx.c index 2317ba8..4a6ea35 100644 --- a/mdbx/mdbx.c +++ b/mdbx/mdbx.c @@ -12,7 +12,7 @@ * . */ #define xMDBX_ALLOY 1 -#define MDBX_BUILD_SOURCERY a3bc805b6c34de756a896bd408af909f74b858903ba60515d7c218c0181e020b_v0_12_9_0_g185e43f3 +#define MDBX_BUILD_SOURCERY 0c86daff919dc09ab5e99c8e5bfb1d51e1367babca9c630c97af92a2aa988ea9_v0_12_9_16_gfff3fbd8 #ifdef MDBX_CONFIG_H #include MDBX_CONFIG_H #endif @@ -3628,8 +3628,12 @@ struct MDBX_env { struct MDBX_lockinfo *me_lck; unsigned me_psize; /* DB page size, initialized from me_os_psize */ - unsigned me_leaf_nodemax; /* max size of a leaf-node */ - unsigned me_branch_nodemax; /* max size of a branch-node */ + uint16_t me_leaf_nodemax; /* max size of a leaf-node */ + uint16_t me_branch_nodemax; /* max size of a branch-node */ + uint16_t me_subpage_limit; + uint16_t me_subpage_room_threshold; + uint16_t me_subpage_reserve_prereq; + uint16_t me_subpage_reserve_limit; atomic_pgno_t me_mlocked_pgno; uint8_t me_psize2log; /* log2 of DB page size */ int8_t me_stuck_meta; /* recovery-only: target meta page or less that zero */ @@ -7371,7 +7375,7 @@ static int __must_check_result node_add_leaf2(MDBX_cursor *mc, size_t indx, const MDBX_val *key); static void node_del(MDBX_cursor *mc, size_t ksize); -static void node_shrink(MDBX_page *mp, size_t indx); +static MDBX_node *node_shrink(MDBX_page *mp, size_t indx, MDBX_node *node); static int __must_check_result 
node_move(MDBX_cursor *csrc, MDBX_cursor *cdst, bool fromleft); static int __must_check_result node_read(MDBX_cursor *mc, const MDBX_node *leaf, @@ -10232,6 +10236,7 @@ __cold static int dxb_resize(MDBX_env *const env, const pgno_t used_pgno, #endif /* MDBX_ENABLE_MADVISE */ rc = osal_mresize(mresize_flags, &env->me_dxb_mmap, size_bytes, limit_bytes); + eASSERT(env, env->me_dxb_mmap.limit >= env->me_dxb_mmap.current); #if MDBX_ENABLE_MADVISE if (rc == MDBX_SUCCESS) { @@ -10257,6 +10262,7 @@ __cold static int dxb_resize(MDBX_env *const env, const pgno_t used_pgno, bailout: if (rc == MDBX_SUCCESS) { + eASSERT(env, env->me_dxb_mmap.limit >= env->me_dxb_mmap.current); eASSERT(env, limit_bytes == env->me_dxb_mmap.limit); eASSERT(env, size_bytes <= env->me_dxb_mmap.filesize); if (mode == explicit_resize) @@ -10287,6 +10293,7 @@ __cold static int dxb_resize(MDBX_env *const env, const pgno_t used_pgno, "present %" PRIuPTR " -> %" PRIuPTR ", " "limit %" PRIuPTR " -> %" PRIuPTR ", errcode %d", prev_size, size_bytes, prev_limit, limit_bytes, rc); + eASSERT(env, env->me_dxb_mmap.limit >= env->me_dxb_mmap.current); } if (!env->me_dxb_mmap.base) { env->me_flags |= MDBX_FATAL_ERROR; @@ -11324,7 +11331,7 @@ static pgr_t page_alloc_slowpath(const MDBX_cursor *const mc, const size_t num, //--------------------------------------------------------------------------- if (unlikely(!is_gc_usable(txn, mc, flags))) { - eASSERT(env, txn->mt_flags & MDBX_TXN_DRAINED_GC); + eASSERT(env, (txn->mt_flags & MDBX_TXN_DRAINED_GC) || num > 1); goto no_gc; } @@ -11986,7 +11993,7 @@ __hot static int page_touch(MDBX_cursor *mc) { np->mp_txnid = txn->mt_front; return MDBX_SUCCESS; } - tASSERT(txn, !IS_OVERFLOW(mp)); + tASSERT(txn, !IS_OVERFLOW(mp) && !IS_SUBP(mp)); if (IS_FROZEN(txn, mp)) { /* CoW the page */ @@ -13098,6 +13105,7 @@ static int txn_renew(MDBX_txn *txn, const unsigned flags) { const size_t used_bytes = pgno2bytes(env, txn->mt_next_pgno); const size_t required_bytes = (txn->mt_flags & 
MDBX_TXN_RDONLY) ? used_bytes : size_bytes; + eASSERT(env, env->me_dxb_mmap.limit >= env->me_dxb_mmap.current); if (unlikely(required_bytes > env->me_dxb_mmap.current)) { /* Размер БД (для пишущих транзакций) или используемых данных (для * читающих транзакций) больше предыдущего/текущего размера внутри @@ -13115,6 +13123,7 @@ static int txn_renew(MDBX_txn *txn, const unsigned flags) { txn->mt_geo.upper, implicit_grow); if (unlikely(rc != MDBX_SUCCESS)) goto bailout; + eASSERT(env, env->me_dxb_mmap.limit >= env->me_dxb_mmap.current); } else if (unlikely(size_bytes < env->me_dxb_mmap.current)) { /* Размер БД меньше предыдущего/текущего размера внутри процесса, можно * уменьшить, но всё сложнее: @@ -13140,11 +13149,15 @@ static int txn_renew(MDBX_txn *txn, const unsigned flags) { rc = osal_fastmutex_acquire(&env->me_remap_guard); #endif if (likely(rc == MDBX_SUCCESS)) { + eASSERT(env, env->me_dxb_mmap.limit >= env->me_dxb_mmap.current); rc = osal_filesize(env->me_dxb_mmap.fd, &env->me_dxb_mmap.filesize); if (likely(rc == MDBX_SUCCESS)) { eASSERT(env, env->me_dxb_mmap.filesize >= required_bytes); if (env->me_dxb_mmap.current > env->me_dxb_mmap.filesize) - env->me_dxb_mmap.current = (size_t)env->me_dxb_mmap.filesize; + env->me_dxb_mmap.current = + (env->me_dxb_mmap.limit < env->me_dxb_mmap.filesize) + ? env->me_dxb_mmap.limit + : (size_t)env->me_dxb_mmap.filesize; } #if defined(_WIN32) || defined(_WIN64) osal_srwlock_ReleaseShared(&env->me_remap_guard); @@ -13209,10 +13222,11 @@ static __always_inline int check_txn(const MDBX_txn *txn, int bad_bits) { if (unlikely(txn->mt_flags & bad_bits)) return MDBX_BAD_TXN; - tASSERT(txn, (txn->mt_flags & MDBX_NOTLS) == - ((txn->mt_flags & MDBX_TXN_RDONLY) - ? txn->mt_env->me_flags & MDBX_NOTLS - : 0)); + tASSERT(txn, (txn->mt_flags & MDBX_TXN_FINISHED) || + (txn->mt_flags & MDBX_NOTLS) == + ((txn->mt_flags & MDBX_TXN_RDONLY) + ? 
txn->mt_env->me_flags & MDBX_NOTLS + : 0)); #if MDBX_TXN_CHECKOWNER STATIC_ASSERT(MDBX_NOTLS > MDBX_TXN_FINISHED + MDBX_TXN_RDONLY); if (unlikely(txn->mt_owner != osal_thread_self()) && @@ -14271,7 +14285,9 @@ static int gcu_prepare_backlog(MDBX_txn *txn, gcu_context_t *ctx) { const size_t for_all_before_touch = for_relist + for_tree_before_touch; const size_t for_all_after_touch = for_relist + for_tree_after_touch; - if (likely(for_relist < 2 && gcu_backlog_size(txn) > for_all_before_touch)) + if (likely(for_relist < 2 && gcu_backlog_size(txn) > for_all_before_touch) && + (ctx->cursor.mc_snum == 0 || + IS_MODIFIABLE(txn, ctx->cursor.mc_pg[ctx->cursor.mc_top]))) return MDBX_SUCCESS; TRACE(">> retired-stored %zu, left %zi, backlog %zu, need %zu (4list %zu, " @@ -17115,13 +17131,23 @@ __cold static void setup_pagesize(MDBX_env *env, const size_t pagesize) { leaf_nodemax > (intptr_t)(sizeof(MDBX_db) + NODESIZE + 42) && leaf_nodemax >= branch_nodemax && leaf_nodemax < (int)UINT16_MAX && leaf_nodemax % 2 == 0); - env->me_leaf_nodemax = (unsigned)leaf_nodemax; - env->me_branch_nodemax = (unsigned)branch_nodemax; + env->me_leaf_nodemax = (uint16_t)leaf_nodemax; + env->me_branch_nodemax = (uint16_t)branch_nodemax; env->me_psize2log = (uint8_t)log2n_powerof2(pagesize); eASSERT(env, pgno2bytes(env, 1) == pagesize); eASSERT(env, bytes2pgno(env, pagesize + pagesize) == 2); recalculate_merge_threshold(env); + /* TODO: recalculate me_subpage_xyz values from MDBX_opt_subpage_xyz. 
*/ + env->me_subpage_limit = env->me_leaf_nodemax - NODESIZE; + env->me_subpage_room_threshold = 0; + env->me_subpage_reserve_prereq = env->me_leaf_nodemax; + env->me_subpage_reserve_limit = env->me_subpage_limit / 42; + eASSERT(env, + env->me_subpage_reserve_prereq > + env->me_subpage_room_threshold + env->me_subpage_reserve_limit); + eASSERT(env, env->me_leaf_nodemax >= env->me_subpage_limit + NODESIZE); + const pgno_t max_pgno = bytes2pgno(env, MAX_MAPSIZE); if (!env->me_options.flags.non_auto.dp_limit) { /* auto-setup dp_limit by "The42" ;-) */ @@ -19728,8 +19754,12 @@ __hot static __always_inline int page_get_checker_lite(const uint16_t ILL, if (((ILL & P_OVERFLOW) || !IS_OVERFLOW(page)) && (ILL & (P_BRANCH | P_LEAF | P_LEAF2)) == 0) { - if (unlikely(page->mp_upper < page->mp_lower || - ((page->mp_lower | page->mp_upper) & 1) || + /* Контроль четности page->mp_upper тут либо приводит к ложным ошибкам, + * либо слишком дорог по количеству операций. Заковырка в том, что mp_upper + * может быть нечетным на LEAF2-страницах, при нечетном количестве элементов + * нечетной длины. Поэтому четность page->mp_upper здесь не проверяется, но + * соответствующие полные проверки есть в page_check(). 
*/ + if (unlikely(page->mp_upper < page->mp_lower || (page->mp_lower & 1) || PAGEHDRSZ + page->mp_upper > txn->mt_env->me_psize)) return bad_page(page, "invalid page' lower(%u)/upper(%u) with limit %zu\n", @@ -21309,6 +21339,26 @@ static __hot int cursor_touch(MDBX_cursor *const mc, const MDBX_val *key, return rc; } +static size_t leaf2_reserve(const MDBX_env *const env, size_t host_page_room, + size_t subpage_len, size_t item_len) { + eASSERT(env, (subpage_len & 1) == 0); + eASSERT(env, + env->me_subpage_reserve_prereq > env->me_subpage_room_threshold + + env->me_subpage_reserve_limit && + env->me_leaf_nodemax >= env->me_subpage_limit + NODESIZE); + size_t reserve = 0; + for (size_t n = 0; + n < 5 && reserve + item_len <= env->me_subpage_reserve_limit && + EVEN(subpage_len + item_len) <= env->me_subpage_limit && + host_page_room >= + env->me_subpage_reserve_prereq + EVEN(subpage_len + item_len); + ++n) { + subpage_len += item_len; + reserve += item_len; + } + return reserve + (subpage_len & 1); +} + static __hot int cursor_put_nochecklen(MDBX_cursor *mc, const MDBX_val *key, MDBX_val *data, unsigned flags) { int err; @@ -21376,11 +21426,11 @@ static __hot int cursor_put_nochecklen(MDBX_cursor *mc, const MDBX_val *key, rc = MDBX_NO_ROOT; } else if ((flags & MDBX_CURRENT) == 0) { bool exact = false; - MDBX_val lastkey, olddata; + MDBX_val last_key, old_data; if ((flags & MDBX_APPEND) && mc->mc_db->md_entries > 0) { - rc = cursor_last(mc, &lastkey, &olddata); + rc = cursor_last(mc, &last_key, &old_data); if (likely(rc == MDBX_SUCCESS)) { - const int cmp = mc->mc_dbx->md_cmp(key, &lastkey); + const int cmp = mc->mc_dbx->md_cmp(key, &last_key); if (likely(cmp > 0)) { mc->mc_ki[mc->mc_top]++; /* step forward for appending */ rc = MDBX_NOTFOUND; @@ -21395,7 +21445,7 @@ static __hot int cursor_put_nochecklen(MDBX_cursor *mc, const MDBX_val *key, } else { struct cursor_set_result csr = /* olddata may not be updated in case LEAF2-page of dupfixed-subDB */ - cursor_set(mc, 
(MDBX_val *)key, &olddata, MDBX_SET); + cursor_set(mc, (MDBX_val *)key, &old_data, MDBX_SET); rc = csr.err; exact = csr.exact; } @@ -21403,14 +21453,14 @@ static __hot int cursor_put_nochecklen(MDBX_cursor *mc, const MDBX_val *key, if (exact) { if (unlikely(flags & MDBX_NOOVERWRITE)) { DEBUG("duplicate key [%s]", DKEY_DEBUG(key)); - *data = olddata; + *data = old_data; return MDBX_KEYEXIST; } if (unlikely(mc->mc_flags & C_SUB)) { /* nested subtree of DUPSORT-database with the same key, * nothing to update */ eASSERT(env, data->iov_len == 0 && - (olddata.iov_len == 0 || + (old_data.iov_len == 0 || /* olddata may not be updated in case LEAF2-page of dupfixed-subDB */ (mc->mc_db->md_flags & MDBX_DUPFIXED))); @@ -21426,8 +21476,8 @@ static __hot int cursor_put_nochecklen(MDBX_cursor *mc, const MDBX_val *key, exact = false; } else if (!(flags & (MDBX_RESERVE | MDBX_MULTIPLE))) { /* checking for early exit without dirtying pages */ - if (unlikely(eq_fast(data, &olddata))) { - cASSERT(mc, mc->mc_dbx->md_dcmp(data, &olddata) == 0); + if (unlikely(eq_fast(data, &old_data))) { + cASSERT(mc, mc->mc_dbx->md_dcmp(data, &old_data) == 0); if (mc->mc_xcursor) { if (flags & MDBX_NODUPDATA) return MDBX_KEYEXIST; @@ -21437,7 +21487,7 @@ static __hot int cursor_put_nochecklen(MDBX_cursor *mc, const MDBX_val *key, /* the same data, nothing to update */ return MDBX_SUCCESS; } - cASSERT(mc, mc->mc_dbx->md_dcmp(data, &olddata) != 0); + cASSERT(mc, mc->mc_dbx->md_dcmp(data, &old_data) != 0); } } } else if (unlikely(rc != MDBX_NOTFOUND)) @@ -21445,17 +21495,16 @@ static __hot int cursor_put_nochecklen(MDBX_cursor *mc, const MDBX_val *key, } mc->mc_flags &= ~C_DEL; - MDBX_val xdata, *rdata = data; - size_t mcount = 0, dcount = 0; + MDBX_val xdata, *ref_data = data; + size_t *batch_dupfixed_done = nullptr, batch_dupfixed_given = 0; if (unlikely(flags & MDBX_MULTIPLE)) { - dcount = data[1].iov_len; - data[1].iov_len = 0 /* reset done item counter */; - rdata = &xdata; - xdata.iov_len = 
data->iov_len * dcount; + batch_dupfixed_given = data[1].iov_len; + batch_dupfixed_done = &data[1].iov_len; + *batch_dupfixed_done = 0; } /* Cursor is positioned, check for room in the dirty list */ - err = cursor_touch(mc, key, rdata); + err = cursor_touch(mc, key, ref_data); if (unlikely(err)) return err; @@ -21484,13 +21533,13 @@ static __hot int cursor_put_nochecklen(MDBX_cursor *mc, const MDBX_val *key, mc->mc_xcursor->mx_dbx.md_klen_min = mc->mc_xcursor->mx_dbx.md_klen_max = data->iov_len); + if (mc->mc_flags & C_SUB) + npr.page->mp_flags |= P_LEAF2; } - if ((mc->mc_db->md_flags & (MDBX_DUPSORT | MDBX_DUPFIXED)) == MDBX_DUPFIXED) - npr.page->mp_flags |= P_LEAF2; mc->mc_flags |= C_INITIALIZED; } - MDBX_val dkey, olddata; + MDBX_val old_singledup, old_data; MDBX_db nested_dupdb; MDBX_page *sub_root = nullptr; bool insert_key, insert_data; @@ -21498,19 +21547,19 @@ static __hot int cursor_put_nochecklen(MDBX_cursor *mc, const MDBX_val *key, MDBX_page *fp = env->me_pbuf; fp->mp_txnid = mc->mc_txn->mt_front; insert_key = insert_data = (rc != MDBX_SUCCESS); - dkey.iov_base = nullptr; + old_singledup.iov_base = nullptr; if (insert_key) { /* The key does not exist */ DEBUG("inserting key at index %i", mc->mc_ki[mc->mc_top]); if ((mc->mc_db->md_flags & MDBX_DUPSORT) && node_size(key, data) > env->me_leaf_nodemax) { /* Too big for a node, insert in sub-DB. Set up an empty - * "old sub-page" for prep_subDB to expand to a full page. */ + * "old sub-page" for convert_to_subtree to expand to a full page. */ fp->mp_leaf2_ksize = (mc->mc_db->md_flags & MDBX_DUPFIXED) ? 
(uint16_t)data->iov_len : 0; fp->mp_lower = fp->mp_upper = 0; - olddata.iov_len = PAGEHDRSZ; - goto prep_subDB; + old_data.iov_len = PAGEHDRSZ; + goto convert_to_subtree; } } else { /* there's only a key anyway, so this is a no-op */ @@ -21555,7 +21604,8 @@ static __hot int cursor_put_nochecklen(MDBX_cursor *mc, const MDBX_val *key, if (unlikely(err != MDBX_SUCCESS)) return err; } - MDBX_node *node = page_node(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); + MDBX_node *const node = + page_node(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); /* Large/Overflow page overwrites need special handling */ if (unlikely(node_flags(node) & F_BIGDATA)) { @@ -21629,19 +21679,18 @@ static __hot int cursor_put_nochecklen(MDBX_cursor *mc, const MDBX_val *key, if ((err = page_retire(mc, lp.page)) != MDBX_SUCCESS) return err; } else { - olddata.iov_len = node_ds(node); - olddata.iov_base = node_data(node); - cASSERT(mc, ptr_disp(olddata.iov_base, olddata.iov_len) <= + old_data.iov_len = node_ds(node); + old_data.iov_base = node_data(node); + cASSERT(mc, ptr_disp(old_data.iov_base, old_data.iov_len) <= ptr_disp(mc->mc_pg[mc->mc_top], env->me_psize)); /* DB has dups? */ if (mc->mc_db->md_flags & MDBX_DUPSORT) { /* Prepare (sub-)page/sub-DB to accept the new item, if needed. * fp: old sub-page or a header faking it. - * mp: new (sub-)page. offset: growth in page size. - * xdata: node data with new page or DB. */ - size_t i; - size_t offset = 0; + * mp: new (sub-)page. + * xdata: node data with new sub-page or sub-DB. */ + size_t growth = 0; /* growth in page size.*/ MDBX_page *mp = fp = xdata.iov_base = env->me_pbuf; mp->mp_pgno = mc->mc_pg[mc->mc_top]->mp_pgno; @@ -21649,19 +21698,19 @@ static __hot int cursor_put_nochecklen(MDBX_cursor *mc, const MDBX_val *key, if (!(node_flags(node) & F_DUPDATA)) { /* does data match? 
*/ if (flags & MDBX_APPENDDUP) { - const int cmp = mc->mc_dbx->md_dcmp(data, &olddata); - cASSERT(mc, cmp != 0 || eq_fast(data, &olddata)); + const int cmp = mc->mc_dbx->md_dcmp(data, &old_data); + cASSERT(mc, cmp != 0 || eq_fast(data, &old_data)); if (unlikely(cmp <= 0)) return MDBX_EKEYMISMATCH; - } else if (eq_fast(data, &olddata)) { - cASSERT(mc, mc->mc_dbx->md_dcmp(data, &olddata) == 0); + } else if (eq_fast(data, &old_data)) { + cASSERT(mc, mc->mc_dbx->md_dcmp(data, &old_data) == 0); if (flags & MDBX_NODUPDATA) return MDBX_KEYEXIST; /* data is match exactly byte-to-byte, nothing to update */ rc = MDBX_SUCCESS; - if (likely((flags & MDBX_MULTIPLE) == 0)) - return rc; - goto continue_multiple; + if (unlikely(batch_dupfixed_done)) + goto batch_dupfixed_continue; + return rc; } /* Just overwrite the current item */ @@ -21671,62 +21720,143 @@ static __hot int cursor_put_nochecklen(MDBX_cursor *mc, const MDBX_val *key, } /* Back up original data item */ - memcpy(dkey.iov_base = fp + 1, olddata.iov_base, - dkey.iov_len = olddata.iov_len); + memcpy(old_singledup.iov_base = fp + 1, old_data.iov_base, + old_singledup.iov_len = old_data.iov_len); /* Make sub-page header for the dup items, with dummy body */ fp->mp_flags = P_LEAF | P_SUBP; fp->mp_lower = 0; - xdata.iov_len = PAGEHDRSZ + dkey.iov_len + data->iov_len; + xdata.iov_len = PAGEHDRSZ + old_data.iov_len + data->iov_len; if (mc->mc_db->md_flags & MDBX_DUPFIXED) { fp->mp_flags |= P_LEAF2; fp->mp_leaf2_ksize = (uint16_t)data->iov_len; - xdata.iov_len += 2 * data->iov_len; /* leave space for 2 more */ - cASSERT(mc, xdata.iov_len <= env->me_psize); + /* Будем создавать LEAF2-страницу, как минимум с двумя элементами. + * При коротких значениях и наличии свободного места можно сделать + * некоторое резервирование места, чтобы при последующих добавлениях + * не сразу расширять созданную под-страницу. 
+ * Резервирование в целом сомнительно (см ниже), но может сработать + * в плюс (а если в минус то несущественный) при коротких ключах. */ + xdata.iov_len += leaf2_reserve( + env, page_room(mc->mc_pg[mc->mc_top]) + old_data.iov_len, + xdata.iov_len, data->iov_len); + cASSERT(mc, (xdata.iov_len & 1) == 0); } else { xdata.iov_len += 2 * (sizeof(indx_t) + NODESIZE) + - (dkey.iov_len & 1) + (data->iov_len & 1); - cASSERT(mc, xdata.iov_len <= env->me_psize); + (old_data.iov_len & 1) + (data->iov_len & 1); } + cASSERT(mc, (xdata.iov_len & 1) == 0); fp->mp_upper = (uint16_t)(xdata.iov_len - PAGEHDRSZ); - olddata.iov_len = xdata.iov_len; /* pretend olddata is fp */ + old_data.iov_len = xdata.iov_len; /* pretend olddata is fp */ } else if (node_flags(node) & F_SUBDATA) { /* Data is on sub-DB, just store it */ flags |= F_DUPDATA | F_SUBDATA; - goto put_sub; + goto dupsort_put; } else { /* Data is on sub-page */ - fp = olddata.iov_base; + fp = old_data.iov_base; switch (flags) { default: - if (!(mc->mc_db->md_flags & MDBX_DUPFIXED)) { - offset = node_size(data, nullptr) + sizeof(indx_t); - break; - } - offset = fp->mp_leaf2_ksize; - if (page_room(fp) < offset) { - offset *= 4; /* space for 4 more */ - break; + growth = IS_LEAF2(fp) ? fp->mp_leaf2_ksize + : (node_size(data, nullptr) + sizeof(indx_t)); + if (page_room(fp) >= growth) { + /* На текущей под-странице есть место для добавления элемента. + * Оптимальнее продолжить использовать эту страницу, ибо + * добавление вложенного дерева увеличит WAF на одну страницу. */ + goto continue_subpage; } - /* FALLTHRU: Big enough MDBX_DUPFIXED sub-page */ - __fallthrough; + /* На текущей под-странице нет места для еще одного элемента. + * Можно либо увеличить эту под-страницу, либо вынести куст + * значений во вложенное дерево. + * + * Продолжать использовать текущую под-страницу возможно + * только пока и если размер после добавления элемента будет + * меньше me_leaf_nodemax. 
Соответственно, при превышении + * просто сразу переходим на вложенное дерево. */ + xdata.iov_len = old_data.iov_len + (growth += growth & 1); + if (xdata.iov_len > env->me_subpage_limit) + goto convert_to_subtree; + + /* Можно либо увеличить под-страницу, в том числе с некоторым + * запасом, либо перейти на вложенное поддерево. + * + * Резервирование места на под-странице представляется сомнительным: + * - Резервирование увеличит рыхлость страниц, в том числе + * вероятность разделения основной/гнездовой страницы; + * - Сложно предсказать полезный размер резервирования, + * особенно для не-MDBX_DUPFIXED; + * - Наличие резерва позволяет съекономить только на перемещении + * части элементов основной/гнездовой страницы при последующих + * добавлениях в нее элементов. Причем после первого изменения + * размера под-страницы, её тело будет примыкать + * к неиспользуемому месту на основной/гнездовой странице, + * поэтому последующие последовательные добавления потребуют + * только передвижения в mp_ptrs[]. + * + * Соответственно, более важным/определяющим представляется + * своевременный переход к вложеному дереву, но тут достаточно + * сложный конфликт интересов: + * - При склонности к переходу к вложенным деревьям, суммарно + * в БД будет большее кол-во более рыхлых страниц. Это увеличит + * WAF, а также RAF при последовательных чтениях большой БД. + * Однако, при коротких ключах и большом кол-ве + * дубликатов/мультизначений, плотность ключей в листовых + * страницах основного дерева будет выше. Соответственно, будет + * пропорционально меньше branch-страниц. Поэтому будет выше + * вероятность оседания/не-вымывания страниц основного дерева из + * LRU-кэша, а также попадания в write-back кэш при записи. + * - Наоботот, при склонности к использованию под-страниц, будут + * наблюдаться обратные эффекты. Плюс некоторые накладные расходы + * на лишнее копирование данных под-страниц в сценариях + * нескольких обонвлений дубликатов одного куста в одной + * транзакции. 
+ * + * Суммарно наиболее рациональным представляется такая тактика: + * - Вводим три порога subpage_limit, subpage_room_threshold + * и subpage_reserve_prereq, которые могут быть + * заданы/скорректированы пользователем в ‰ от me_leaf_nodemax; + * - Используем под-страницу пока её размер меньше subpage_limit + * и на основной/гнездовой странице не-менее + * subpage_room_threshold свободного места; + * - Резервируем место только для 1-3 коротких dupfixed-элементов, + * расширяя размер под-страницы на размер кэш-линии ЦПУ, но + * только если на странице не менее subpage_reserve_prereq + * свободного места. + * - По-умолчанию устанавливаем: + * subpage_limit = me_leaf_nodemax (1000‰); + * subpage_room_threshold = 0; + * subpage_reserve_prereq = me_leaf_nodemax (1000‰). + */ + if (IS_LEAF2(fp)) + growth += leaf2_reserve( + env, page_room(mc->mc_pg[mc->mc_top]) + old_data.iov_len, + xdata.iov_len, data->iov_len); + break; + case MDBX_CURRENT | MDBX_NODUPDATA: case MDBX_CURRENT: + continue_subpage: fp->mp_txnid = mc->mc_txn->mt_front; fp->mp_pgno = mp->mp_pgno; mc->mc_xcursor->mx_cursor.mc_pg[0] = fp; flags |= F_DUPDATA; - goto put_sub; + goto dupsort_put; } - xdata.iov_len = olddata.iov_len + offset; + xdata.iov_len = old_data.iov_len + growth; + cASSERT(mc, (xdata.iov_len & 1) == 0); } fp_flags = fp->mp_flags; - if (node_size_len(node_ks(node), xdata.iov_len) > - env->me_leaf_nodemax) { + if (xdata.iov_len > env->me_subpage_limit || + node_size_len(node_ks(node), xdata.iov_len) > + env->me_leaf_nodemax || + (env->me_subpage_room_threshold && + page_room(mc->mc_pg[mc->mc_top]) + + node_size_len(node_ks(node), old_data.iov_len) < + env->me_subpage_room_threshold + + node_size_len(node_ks(node), xdata.iov_len))) { /* Too big for a sub-page, convert to sub-DB */ + convert_to_subtree: fp_flags &= ~P_SUBP; - prep_subDB: nested_dupdb.md_xsize = 0; nested_dupdb.md_flags = flags_db2sub(mc->mc_db->md_flags); if (mc->mc_db->md_flags & MDBX_DUPFIXED) { @@ -21745,8 +21875,9 @@ 
static __hot int cursor_put_nochecklen(MDBX_cursor *mc, const MDBX_val *key, if (unlikely(par.err != MDBX_SUCCESS)) return par.err; mc->mc_db->md_leaf_pages += 1; - cASSERT(mc, env->me_psize > olddata.iov_len); - offset = env->me_psize - (unsigned)olddata.iov_len; + cASSERT(mc, env->me_psize > old_data.iov_len); + growth = env->me_psize - (unsigned)old_data.iov_len; + cASSERT(mc, (growth & 1) == 0); flags |= F_DUPDATA | F_SUBDATA; nested_dupdb.md_root = mp->mp_pgno; nested_dupdb.md_seq = 0; @@ -21758,29 +21889,33 @@ static __hot int cursor_put_nochecklen(MDBX_cursor *mc, const MDBX_val *key, mp->mp_txnid = mc->mc_txn->mt_front; mp->mp_leaf2_ksize = fp->mp_leaf2_ksize; mp->mp_lower = fp->mp_lower; - cASSERT(mc, fp->mp_upper + offset <= UINT16_MAX); - mp->mp_upper = (indx_t)(fp->mp_upper + offset); + cASSERT(mc, fp->mp_upper + growth < UINT16_MAX); + mp->mp_upper = fp->mp_upper + (indx_t)growth; if (unlikely(fp_flags & P_LEAF2)) { memcpy(page_data(mp), page_data(fp), page_numkeys(fp) * fp->mp_leaf2_ksize); + cASSERT(mc, + (((mp->mp_leaf2_ksize & page_numkeys(mp)) ^ mp->mp_upper) & + 1) == 0); } else { + cASSERT(mc, (mp->mp_upper & 1) == 0); memcpy(ptr_disp(mp, mp->mp_upper + PAGEHDRSZ), ptr_disp(fp, fp->mp_upper + PAGEHDRSZ), - olddata.iov_len - fp->mp_upper - PAGEHDRSZ); + old_data.iov_len - fp->mp_upper - PAGEHDRSZ); memcpy(mp->mp_ptrs, fp->mp_ptrs, page_numkeys(fp) * sizeof(mp->mp_ptrs[0])); - for (i = 0; i < page_numkeys(fp); i++) { - cASSERT(mc, mp->mp_ptrs[i] + offset <= UINT16_MAX); - mp->mp_ptrs[i] += (indx_t)offset; + for (size_t i = 0; i < page_numkeys(fp); i++) { + cASSERT(mc, mp->mp_ptrs[i] + growth <= UINT16_MAX); + mp->mp_ptrs[i] += (indx_t)growth; } } } if (!insert_key) node_del(mc, 0); - rdata = &xdata; + ref_data = &xdata; flags |= F_DUPDATA; - goto new_sub; + goto insert_node; } /* MDBX passes F_SUBDATA in 'flags' to write a DB record */ @@ -21788,15 +21923,15 @@ static __hot int cursor_put_nochecklen(MDBX_cursor *mc, const MDBX_val *key, return 
MDBX_INCOMPATIBLE; current: - if (data->iov_len == olddata.iov_len) { + if (data->iov_len == old_data.iov_len) { cASSERT(mc, EVEN(key->iov_len) == EVEN(node_ks(node))); /* same size, just replace it. Note that we could * also reuse this node if the new data is smaller, * but instead we opt to shrink the node in that case. */ if (flags & MDBX_RESERVE) - data->iov_base = olddata.iov_base; + data->iov_base = old_data.iov_base; else if (!(mc->mc_flags & C_SUB)) - memcpy(olddata.iov_base, data->iov_base, data->iov_len); + memcpy(old_data.iov_base, data->iov_base, data->iov_len); else { cASSERT(mc, page_numkeys(mc->mc_pg[mc->mc_top]) == 1); cASSERT(mc, PAGETYPE_COMPAT(mc->mc_pg[mc->mc_top]) == P_LEAF); @@ -21821,14 +21956,15 @@ static __hot int cursor_put_nochecklen(MDBX_cursor *mc, const MDBX_val *key, node_del(mc, 0); } - rdata = data; + ref_data = data; -new_sub:; +insert_node:; const unsigned naf = flags & NODE_ADD_FLAGS; - size_t nsize = IS_LEAF2(mc->mc_pg[mc->mc_top]) ? key->iov_len - : leaf_size(env, key, rdata); + size_t nsize = IS_LEAF2(mc->mc_pg[mc->mc_top]) + ? key->iov_len + : leaf_size(env, key, ref_data); if (page_room(mc->mc_pg[mc->mc_top]) < nsize) { - rc = page_split(mc, key, rdata, P_INVALID, + rc = page_split(mc, key, ref_data, P_INVALID, insert_key ? naf : naf | MDBX_SPLIT_REPLACE); if (rc == MDBX_SUCCESS && AUDIT_ENABLED()) rc = insert_key ? cursor_check(mc) : cursor_check_updating(mc); @@ -21836,25 +21972,25 @@ new_sub:; /* There is room already in this leaf page. 
*/ if (IS_LEAF2(mc->mc_pg[mc->mc_top])) { cASSERT(mc, !(naf & (F_BIGDATA | F_SUBDATA | F_DUPDATA)) && - rdata->iov_len == 0); + ref_data->iov_len == 0); rc = node_add_leaf2(mc, mc->mc_ki[mc->mc_top], key); } else - rc = node_add_leaf(mc, mc->mc_ki[mc->mc_top], key, rdata, naf); + rc = node_add_leaf(mc, mc->mc_ki[mc->mc_top], key, ref_data, naf); if (likely(rc == 0)) { /* Adjust other cursors pointing to mp */ const MDBX_dbi dbi = mc->mc_dbi; - const size_t i = mc->mc_top; - MDBX_page *const mp = mc->mc_pg[i]; + const size_t top = mc->mc_top; + MDBX_page *const mp = mc->mc_pg[top]; for (MDBX_cursor *m2 = mc->mc_txn->mt_cursors[dbi]; m2; m2 = m2->mc_next) { MDBX_cursor *m3 = (mc->mc_flags & C_SUB) ? &m2->mc_xcursor->mx_cursor : m2; - if (m3 == mc || m3->mc_snum < mc->mc_snum || m3->mc_pg[i] != mp) + if (m3 == mc || m3->mc_snum < mc->mc_snum || m3->mc_pg[top] != mp) continue; - if (m3->mc_ki[i] >= mc->mc_ki[i]) - m3->mc_ki[i] += insert_key; + if (m3->mc_ki[top] >= mc->mc_ki[top]) + m3->mc_ki[top] += insert_key; if (XCURSOR_INITED(m3)) - XCURSOR_REFRESH(m3, mp, m3->mc_ki[i]); + XCURSOR_REFRESH(m3, mp, m3->mc_ki[top]); } } } @@ -21865,18 +22001,18 @@ new_sub:; * size limits on dupdata. The actual data fields of the child * DB are all zero size. 
*/ if (flags & F_DUPDATA) { - unsigned xflags; - size_t ecount; - put_sub: - xdata.iov_len = 0; - xdata.iov_base = nullptr; + MDBX_val empty; + dupsort_put: + empty.iov_len = 0; + empty.iov_base = nullptr; MDBX_node *node = page_node(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); #define SHIFT_MDBX_NODUPDATA_TO_MDBX_NOOVERWRITE 1 STATIC_ASSERT( (MDBX_NODUPDATA >> SHIFT_MDBX_NODUPDATA_TO_MDBX_NOOVERWRITE) == MDBX_NOOVERWRITE); - xflags = MDBX_CURRENT | ((flags & MDBX_NODUPDATA) >> - SHIFT_MDBX_NODUPDATA_TO_MDBX_NOOVERWRITE); + unsigned xflags = + MDBX_CURRENT | ((flags & MDBX_NODUPDATA) >> + SHIFT_MDBX_NODUPDATA_TO_MDBX_NOOVERWRITE); if ((flags & MDBX_CURRENT) == 0) { xflags -= MDBX_CURRENT; err = cursor_xinit1(mc, node, mc->mc_pg[mc->mc_top]); @@ -21886,80 +22022,78 @@ new_sub:; if (sub_root) mc->mc_xcursor->mx_cursor.mc_pg[0] = sub_root; /* converted, write the original data first */ - if (dkey.iov_base) { - rc = cursor_put_nochecklen(&mc->mc_xcursor->mx_cursor, &dkey, &xdata, - xflags); + if (old_singledup.iov_base) { + rc = cursor_put_nochecklen(&mc->mc_xcursor->mx_cursor, &old_singledup, + &empty, xflags); if (unlikely(rc)) - goto bad_sub; + goto dupsort_error; } if (!(node_flags(node) & F_SUBDATA) || sub_root) { /* Adjust other cursors pointing to mp */ - MDBX_cursor *m2; - MDBX_xcursor *mx = mc->mc_xcursor; - size_t i = mc->mc_top; - MDBX_page *mp = mc->mc_pg[i]; + MDBX_xcursor *const mx = mc->mc_xcursor; + const size_t top = mc->mc_top; + MDBX_page *const mp = mc->mc_pg[top]; const intptr_t nkeys = page_numkeys(mp); - for (m2 = mc->mc_txn->mt_cursors[mc->mc_dbi]; m2; m2 = m2->mc_next) { + for (MDBX_cursor *m2 = mc->mc_txn->mt_cursors[mc->mc_dbi]; m2; + m2 = m2->mc_next) { if (m2 == mc || m2->mc_snum < mc->mc_snum) continue; if (!(m2->mc_flags & C_INITIALIZED)) continue; - if (m2->mc_pg[i] == mp) { - if (m2->mc_ki[i] == mc->mc_ki[i]) { - err = cursor_xinit2(m2, mx, dkey.iov_base != nullptr); + if (m2->mc_pg[top] == mp) { + if (m2->mc_ki[top] == 
mc->mc_ki[top]) { + err = cursor_xinit2(m2, mx, old_singledup.iov_base != nullptr); if (unlikely(err != MDBX_SUCCESS)) return err; - } else if (!insert_key && m2->mc_ki[i] < nkeys) { - XCURSOR_REFRESH(m2, mp, m2->mc_ki[i]); + } else if (!insert_key && m2->mc_ki[top] < nkeys) { + XCURSOR_REFRESH(m2, mp, m2->mc_ki[top]); } } } } cASSERT(mc, mc->mc_xcursor->mx_db.md_entries < PTRDIFF_MAX); - ecount = (size_t)mc->mc_xcursor->mx_db.md_entries; + const size_t probe = (size_t)mc->mc_xcursor->mx_db.md_entries; #define SHIFT_MDBX_APPENDDUP_TO_MDBX_APPEND 1 STATIC_ASSERT((MDBX_APPENDDUP >> SHIFT_MDBX_APPENDDUP_TO_MDBX_APPEND) == MDBX_APPEND); xflags |= (flags & MDBX_APPENDDUP) >> SHIFT_MDBX_APPENDDUP_TO_MDBX_APPEND; - rc = cursor_put_nochecklen(&mc->mc_xcursor->mx_cursor, data, &xdata, + rc = cursor_put_nochecklen(&mc->mc_xcursor->mx_cursor, data, &empty, xflags); if (flags & F_SUBDATA) { void *db = node_data(node); mc->mc_xcursor->mx_db.md_mod_txnid = mc->mc_txn->mt_txnid; memcpy(db, &mc->mc_xcursor->mx_db, sizeof(MDBX_db)); } - insert_data = (ecount != (size_t)mc->mc_xcursor->mx_db.md_entries); + insert_data = (probe != (size_t)mc->mc_xcursor->mx_db.md_entries); } /* Increment count unless we just replaced an existing item. */ if (insert_data) mc->mc_db->md_entries++; if (insert_key) { - /* Invalidate txn if we created an empty sub-DB */ - if (unlikely(rc)) - goto bad_sub; + if (unlikely(rc != MDBX_SUCCESS)) + goto dupsort_error; /* If we succeeded and the key didn't exist before, * make sure the cursor is marked valid. 
*/ mc->mc_flags |= C_INITIALIZED; } - if (unlikely(flags & MDBX_MULTIPLE)) { - if (likely(rc == MDBX_SUCCESS)) { - continue_multiple: - mcount++; + if (likely(rc == MDBX_SUCCESS)) { + if (unlikely(batch_dupfixed_done)) { + batch_dupfixed_continue: /* let caller know how many succeeded, if any */ - data[1].iov_len = mcount; - if (mcount < dcount) { + if ((*batch_dupfixed_done += 1) < batch_dupfixed_given) { data[0].iov_base = ptr_disp(data[0].iov_base, data[0].iov_len); insert_key = insert_data = false; - dkey.iov_base = nullptr; + old_singledup.iov_base = nullptr; goto more; } } + if (AUDIT_ENABLED()) + rc = cursor_check(mc); } - if (rc == MDBX_SUCCESS && AUDIT_ENABLED()) - rc = cursor_check(mc); return rc; - bad_sub: + + dupsort_error: if (unlikely(rc == MDBX_KEYEXIST)) { /* should not happen, we deleted that item */ ERROR("Unexpected %i error while put to nested dupsort's hive", rc); @@ -22123,6 +22257,7 @@ static __hot int cursor_del(MDBX_cursor *mc, MDBX_put_flags_t flags) { return rc; MDBX_page *mp = mc->mc_pg[mc->mc_top]; + cASSERT(mc, IS_MODIFIABLE(mc->mc_txn, mp)); if (!MDBX_DISABLE_VALIDATION && unlikely(!CHECK_LEAF_TYPE(mc, mp))) { ERROR("unexpected leaf-page #%" PRIaPGNO " type 0x%x seen by cursor", mp->mp_pgno, mp->mp_flags); @@ -22141,7 +22276,7 @@ static __hot int cursor_del(MDBX_cursor *mc, MDBX_put_flags_t flags) { if (!(node_flags(node) & F_SUBDATA)) mc->mc_xcursor->mx_cursor.mc_pg[0] = node_data(node); rc = cursor_del(&mc->mc_xcursor->mx_cursor, 0); - if (unlikely(rc)) + if (unlikely(rc != MDBX_SUCCESS)) return rc; /* If sub-DB still has entries, we're done */ if (mc->mc_xcursor->mx_db.md_entries) { @@ -22150,11 +22285,10 @@ static __hot int cursor_del(MDBX_cursor *mc, MDBX_put_flags_t flags) { mc->mc_xcursor->mx_db.md_mod_txnid = mc->mc_txn->mt_txnid; memcpy(node_data(node), &mc->mc_xcursor->mx_db, sizeof(MDBX_db)); } else { - /* shrink fake page */ - node_shrink(mp, mc->mc_ki[mc->mc_top]); - node = page_node(mp, mc->mc_ki[mc->mc_top]); + /* 
shrink sub-page */ + node = node_shrink(mp, mc->mc_ki[mc->mc_top], node); mc->mc_xcursor->mx_cursor.mc_pg[0] = node_data(node); - /* fix other sub-DB cursors pointed at fake pages on this page */ + /* fix other sub-DB cursors pointed at sub-pages on this page */ for (MDBX_cursor *m2 = mc->mc_txn->mt_cursors[mc->mc_dbi]; m2; m2 = m2->mc_next) { if (m2 == mc || m2->mc_snum < mc->mc_snum) @@ -22381,6 +22515,7 @@ __hot static int __must_check_result node_add_leaf2(MDBX_cursor *mc, const size_t ksize = mc->mc_db->md_xsize; cASSERT(mc, ksize == key->iov_len); const size_t nkeys = page_numkeys(mp); + cASSERT(mc, (((ksize & page_numkeys(mp)) ^ mp->mp_upper) & 1) == 0); /* Just using these for counting */ const intptr_t lower = mp->mp_lower + sizeof(indx_t); @@ -22400,6 +22535,8 @@ __hot static int __must_check_result node_add_leaf2(MDBX_cursor *mc, memmove(ptr_disp(ptr, ksize), ptr, diff * ksize); /* insert new key */ memcpy(ptr, key->iov_base, ksize); + + cASSERT(mc, (((ksize & page_numkeys(mp)) ^ mp->mp_upper) & 1) == 0); return MDBX_SUCCESS; } @@ -22566,6 +22703,7 @@ __hot static void node_del(MDBX_cursor *mc, size_t ksize) { mp->mp_lower -= sizeof(indx_t); cASSERT(mc, (size_t)UINT16_MAX - mp->mp_upper >= ksize - sizeof(indx_t)); mp->mp_upper += (indx_t)(ksize - sizeof(indx_t)); + cASSERT(mc, (((ksize & page_numkeys(mp)) ^ mp->mp_upper) & 1) == 0); return; } @@ -22605,35 +22743,28 @@ __hot static void node_del(MDBX_cursor *mc, size_t ksize) { /* Compact the main page after deleting a node on a subpage. * [in] mp The main page to operate on. * [in] indx The index of the subpage on the main page. 
*/ -static void node_shrink(MDBX_page *mp, size_t indx) { - MDBX_node *node; - MDBX_page *sp, *xp; - size_t nsize, delta, len, ptr; - intptr_t i; - - node = page_node(mp, indx); - sp = (MDBX_page *)node_data(node); - delta = page_room(sp); - assert(delta > 0); +static MDBX_node *node_shrink(MDBX_page *mp, size_t indx, MDBX_node *node) { + assert(node = page_node(mp, indx)); + MDBX_page *sp = (MDBX_page *)node_data(node); + assert(IS_SUBP(sp) && page_numkeys(sp) > 0); + const size_t delta = + EVEN_FLOOR(page_room(sp) /* avoid the node uneven-sized */); + if (unlikely(delta) == 0) + return node; /* Prepare to shift upward, set len = length(subpage part to shift) */ - if (IS_LEAF2(sp)) { - delta &= /* do not make the node uneven-sized */ ~(size_t)1; - if (unlikely(delta) == 0) - return; - nsize = node_ds(node) - delta; - assert(nsize % 1 == 0); - len = nsize; - } else { - xp = ptr_disp(sp, delta); /* destination subpage */ - for (i = page_numkeys(sp); --i >= 0;) { + size_t nsize = node_ds(node) - delta, len = nsize; + assert(nsize % 1 == 0); + if (!IS_LEAF2(sp)) { + len = PAGEHDRSZ; + MDBX_page *xp = ptr_disp(sp, delta); /* destination subpage */ + for (intptr_t i = page_numkeys(sp); --i >= 0;) { assert(sp->mp_ptrs[i] >= delta); xp->mp_ptrs[i] = (indx_t)(sp->mp_ptrs[i] - delta); } - nsize = node_ds(node) - delta; - len = PAGEHDRSZ; } - sp->mp_upper = sp->mp_lower; + assert(sp->mp_upper >= sp->mp_lower + delta); + sp->mp_upper -= (indx_t)delta; sp->mp_pgno = mp->mp_pgno; node_set_ds(node, nsize); @@ -22641,15 +22772,17 @@ static void node_shrink(MDBX_page *mp, size_t indx) { void *const base = ptr_disp(mp, mp->mp_upper + PAGEHDRSZ); memmove(ptr_disp(base, delta), base, ptr_dist(sp, base) + len); - ptr = mp->mp_ptrs[indx]; - for (i = page_numkeys(mp); --i >= 0;) { - if (mp->mp_ptrs[i] <= ptr) { + const size_t pivot = mp->mp_ptrs[indx]; + for (intptr_t i = page_numkeys(mp); --i >= 0;) { + if (mp->mp_ptrs[i] <= pivot) { assert((size_t)UINT16_MAX - mp->mp_ptrs[i] >= 
delta); mp->mp_ptrs[i] += (indx_t)delta; } } assert((size_t)UINT16_MAX - mp->mp_upper >= delta); mp->mp_upper += (indx_t)delta; + + return ptr_disp(node, delta); } /* Initial setup of a sorted-dups cursor. @@ -23521,7 +23654,6 @@ static int page_merge(MDBX_cursor *csrc, MDBX_cursor *cdst) { IS_LEAF(cdst->mc_pg[cdst->mc_db->md_depth - 1])); cASSERT(csrc, csrc->mc_snum < csrc->mc_db->md_depth || IS_LEAF(csrc->mc_pg[csrc->mc_db->md_depth - 1])); - cASSERT(cdst, page_room(pdst) >= page_used(cdst->mc_txn->mt_env, psrc)); const int pagetype = PAGETYPE_WHOLE(psrc); /* Move all nodes from src to dst */ @@ -23532,7 +23664,9 @@ static int page_merge(MDBX_cursor *csrc, MDBX_cursor *cdst) { size_t j = dst_nkeys; if (unlikely(pagetype & P_LEAF2)) { /* Mark dst as dirty. */ - if (unlikely(rc = page_touch(cdst))) + rc = page_touch(cdst); + cASSERT(cdst, rc != MDBX_RESULT_TRUE); + if (unlikely(rc != MDBX_SUCCESS)) return rc; key.iov_len = csrc->mc_db->md_xsize; @@ -23540,6 +23674,7 @@ static int page_merge(MDBX_cursor *csrc, MDBX_cursor *cdst) { size_t i = 0; do { rc = node_add_leaf2(cdst, j++, &key); + cASSERT(cdst, rc != MDBX_RESULT_TRUE); if (unlikely(rc != MDBX_SUCCESS)) return rc; key.iov_base = ptr_disp(key.iov_base, key.iov_len); @@ -23553,7 +23688,8 @@ static int page_merge(MDBX_cursor *csrc, MDBX_cursor *cdst) { cursor_copy(csrc, &mn); /* must find the lowest key below src */ rc = page_search_lowest(&mn); - if (unlikely(rc)) + cASSERT(csrc, rc != MDBX_RESULT_TRUE); + if (unlikely(rc != MDBX_SUCCESS)) return rc; const MDBX_page *mp = mn.mc_pg[mn.mc_top]; @@ -23578,7 +23714,9 @@ static int page_merge(MDBX_cursor *csrc, MDBX_cursor *cdst) { } /* Mark dst as dirty. 
*/ - if (unlikely(rc = page_touch(cdst))) + rc = page_touch(cdst); + cASSERT(cdst, rc != MDBX_RESULT_TRUE); + if (unlikely(rc != MDBX_SUCCESS)) return rc; size_t i = 0; @@ -23592,6 +23730,7 @@ static int page_merge(MDBX_cursor *csrc, MDBX_cursor *cdst) { cASSERT(csrc, node_flags(srcnode) == 0); rc = node_add_branch(cdst, j++, &key, node_pgno(srcnode)); } + cASSERT(cdst, rc != MDBX_RESULT_TRUE); if (unlikely(rc != MDBX_SUCCESS)) return rc; @@ -23618,7 +23757,8 @@ static int page_merge(MDBX_cursor *csrc, MDBX_cursor *cdst) { if (csrc->mc_ki[csrc->mc_top] == 0) { const MDBX_val nullkey = {0, 0}; rc = update_key(csrc, &nullkey); - if (unlikely(rc)) { + cASSERT(csrc, rc != MDBX_RESULT_TRUE); + if (unlikely(rc != MDBX_SUCCESS)) { csrc->mc_top++; return rc; } @@ -23653,7 +23793,8 @@ static int page_merge(MDBX_cursor *csrc, MDBX_cursor *cdst) { } rc = page_retire(csrc, (MDBX_page *)psrc); - if (unlikely(rc)) + cASSERT(csrc, rc != MDBX_RESULT_TRUE); + if (unlikely(rc != MDBX_SUCCESS)) return rc; cASSERT(cdst, cdst->mc_db->md_entries > 0); @@ -23666,7 +23807,7 @@ static int page_merge(MDBX_cursor *csrc, MDBX_cursor *cdst) { const uint16_t save_depth = cdst->mc_db->md_depth; cursor_pop(cdst); rc = rebalance(cdst); - if (unlikely(rc)) + if (unlikely(rc != MDBX_SUCCESS)) return rc; cASSERT(cdst, cdst->mc_db->md_entries > 0); @@ -23854,11 +23995,9 @@ static int rebalance(MDBX_cursor *mc) { mc->mc_snum = 0; mc->mc_top = 0; mc->mc_flags &= ~C_INITIALIZED; - - rc = page_retire(mc, mp); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; - } else if (IS_BRANCH(mp) && nkeys == 1) { + return page_retire(mc, mp); + } + if (IS_BRANCH(mp) && nkeys == 1) { DEBUG("%s", "collapsing root page!"); mc->mc_db->md_root = node_pgno(page_node(mp, 0)); rc = page_get(mc, mc->mc_db->md_root, &mc->mc_pg[0], mp->mp_txnid); @@ -23891,15 +24030,10 @@ static int rebalance(MDBX_cursor *mc) { PAGETYPE_WHOLE(mc->mc_pg[mc->mc_top]) == pagetype); cASSERT(mc, mc->mc_snum < mc->mc_db->md_depth || 
IS_LEAF(mc->mc_pg[mc->mc_db->md_depth - 1])); - - rc = page_retire(mc, mp); - if (likely(rc == MDBX_SUCCESS)) - rc = page_touch(mc); - return rc; - } else { - DEBUG("root page %" PRIaPGNO " doesn't need rebalancing (flags 0x%x)", - mp->mp_pgno, mp->mp_flags); + return page_retire(mc, mp); } + DEBUG("root page %" PRIaPGNO " doesn't need rebalancing (flags 0x%x)", + mp->mp_pgno, mp->mp_flags); return MDBX_SUCCESS; } @@ -23948,6 +24082,7 @@ static int rebalance(MDBX_cursor *mc) { const size_t right_nkeys = right ? page_numkeys(right) : 0; bool involve = false; retry: + cASSERT(mc, mc->mc_snum > 1); if (left_room > room_threshold && left_room >= right_room && (IS_MODIFIABLE(mc->mc_txn, left) || involve)) { /* try merge with left */ @@ -24019,7 +24154,18 @@ static int rebalance(MDBX_cursor *mc) { return MDBX_SUCCESS; } - if (likely(!involve)) { + /* Заглушено в ветке v0.12.x, будет работать в v0.13.1 и далее. + * + * if (mc->mc_txn->mt_env->me_options.prefer_waf_insteadof_balance && + * likely(room_threshold > 0)) { + * room_threshold = 0; + * goto retry; + * } + */ + if (likely(!involve) && + (likely(mc->mc_dbi != FREE_DBI) || mc->mc_txn->tw.loose_pages || + MDBX_PNL_GETSIZE(mc->mc_txn->tw.relist) || (mc->mc_flags & C_GCU) || + (mc->mc_txn->mt_flags & MDBX_TXN_DRAINED_GC) || room_threshold)) { involve = true; goto retry; } @@ -24109,8 +24255,7 @@ __cold static int page_check(const MDBX_cursor *const mc, break; } - if (unlikely(mp->mp_upper < mp->mp_lower || - ((mp->mp_lower | mp->mp_upper) & 1) || + if (unlikely(mp->mp_upper < mp->mp_lower || (mp->mp_lower & 1) || PAGEHDRSZ + mp->mp_upper > env->me_psize)) rc = bad_page(mp, "invalid page lower(%u)/upper(%u) with limit %zu\n", mp->mp_lower, mp->mp_upper, page_space(env)); @@ -24126,11 +24271,6 @@ __cold static int page_check(const MDBX_cursor *const mc, bad_page(mp, "%s-page nkeys (%zu) < %u\n", IS_BRANCH(mp) ? 
"branch" : "leaf", nkeys, 1 + IS_BRANCH(mp)); } - if (!IS_LEAF2(mp) && unlikely(PAGEHDRSZ + mp->mp_upper + - nkeys * sizeof(MDBX_node) + nkeys - 1 > - env->me_psize)) - rc = bad_page(mp, "invalid page upper (%u) for nkeys %zu with limit %zu\n", - mp->mp_upper, nkeys, page_space(env)); const size_t ksize_max = keysize_max(env->me_psize, 0); const size_t leaf2_ksize = mp->mp_leaf2_ksize; @@ -24139,8 +24279,20 @@ __cold static int page_check(const MDBX_cursor *const mc, (mc->mc_db->md_flags & MDBX_DUPFIXED) == 0)) rc = bad_page(mp, "unexpected leaf2-page (db-flags 0x%x)\n", mc->mc_db->md_flags); - if (unlikely(leaf2_ksize < 1 || leaf2_ksize > ksize_max)) - rc = bad_page(mp, "invalid leaf2-key length (%zu)\n", leaf2_ksize); + else if (unlikely(leaf2_ksize != mc->mc_db->md_xsize)) + rc = bad_page(mp, "invalid leaf2_ksize %zu\n", leaf2_ksize); + else if (unlikely(((leaf2_ksize & nkeys) ^ mp->mp_upper) & 1)) + rc = bad_page( + mp, "invalid page upper (%u) for nkeys %zu with leaf2-length %zu\n", + mp->mp_upper, nkeys, leaf2_ksize); + } else { + if (unlikely((mp->mp_upper & 1) || PAGEHDRSZ + mp->mp_upper + + nkeys * sizeof(MDBX_node) + + nkeys - 1 > + env->me_psize)) + rc = + bad_page(mp, "invalid page upper (%u) for nkeys %zu with limit %zu\n", + mp->mp_upper, nkeys, page_space(env)); } MDBX_val here, prev = {0, 0}; @@ -24148,7 +24300,7 @@ __cold static int page_check(const MDBX_cursor *const mc, if (IS_LEAF2(mp)) { const char *const key = page_leaf2key(mp, i, leaf2_ksize); if (unlikely(end_of_page < key + leaf2_ksize)) { - rc = bad_page(mp, "leaf2-key beyond (%zu) page-end\n", + rc = bad_page(mp, "leaf2-item beyond (%zu) page-end\n", key + leaf2_ksize - end_of_page); continue; } @@ -24157,7 +24309,7 @@ __cold static int page_check(const MDBX_cursor *const mc, if (unlikely(leaf2_ksize < mc->mc_dbx->md_klen_min || leaf2_ksize > mc->mc_dbx->md_klen_max)) rc = bad_page( - mp, "leaf2-key size (%zu) <> min/max key-length (%zu/%zu)\n", + mp, "leaf2-item size (%zu) <> min/max 
length (%zu/%zu)\n", leaf2_ksize, mc->mc_dbx->md_klen_min, mc->mc_dbx->md_klen_max); else mc->mc_dbx->md_klen_min = mc->mc_dbx->md_klen_max = leaf2_ksize; @@ -24166,7 +24318,7 @@ __cold static int page_check(const MDBX_cursor *const mc, here.iov_base = (void *)key; here.iov_len = leaf2_ksize; if (prev.iov_base && unlikely(mc->mc_dbx->md_cmp(&prev, &here) >= 0)) - rc = bad_page(mp, "leaf2-key #%zu wrong order (%s >= %s)\n", i, + rc = bad_page(mp, "leaf2-item #%zu wrong order (%s >= %s)\n", i, DKEY(&prev), DVAL(&here)); prev = here; } @@ -24577,6 +24729,8 @@ static int page_split(MDBX_cursor *mc, const MDBX_val *const newkey, DKBUF; MDBX_page *const mp = mc->mc_pg[mc->mc_top]; + cASSERT(mc, (mp->mp_flags & P_ILL_BITS) == 0); + const size_t newindx = mc->mc_ki[mc->mc_top]; size_t nkeys = page_numkeys(mp); if (AUDIT_ENABLED()) { @@ -24668,7 +24822,7 @@ static int page_split(MDBX_cursor *mc, const MDBX_val *const newkey, /* It is reasonable and possible to split the page at the begin */ if (unlikely(newindx < minkeys)) { split_indx = minkeys; - if (newindx == 0 && foliage == 0 && !(naf & MDBX_SPLIT_REPLACE)) { + if (newindx == 0 && !(naf & MDBX_SPLIT_REPLACE)) { split_indx = 0; /* Checking for ability of splitting by the left-side insertion * of a pure page with the new key */ @@ -24688,10 +24842,19 @@ static int page_split(MDBX_cursor *mc, const MDBX_val *const newkey, } else get_key(page_node(mp, 0), &sepkey); cASSERT(mc, mc->mc_dbx->md_cmp(newkey, &sepkey) < 0); - /* Avoiding rare complex cases of split the parent page */ - if (page_room(mn.mc_pg[ptop]) < branch_size(env, &sepkey)) + /* Avoiding rare complex cases of nested split the parent page(s) */ + if (page_room(mc->mc_pg[ptop]) < branch_size(env, &sepkey)) split_indx = minkeys; } + if (foliage) { + TRACE("pure-left: foliage %u, top %i, ptop %zu, split_indx %zi, " + "minkeys %zi, sepkey %s, parent-room %zu, need4split %zu", + foliage, mc->mc_top, ptop, split_indx, minkeys, + DKEY_DEBUG(&sepkey), 
page_room(mc->mc_pg[ptop]), + branch_size(env, &sepkey)); + TRACE("pure-left: newkey %s, newdata %s, newindx %zu", + DKEY_DEBUG(newkey), DVAL_DEBUG(newdata), newindx); + } } } @@ -24704,9 +24867,10 @@ static int page_split(MDBX_cursor *mc, const MDBX_val *const newkey, sepkey = *newkey; } else if (unlikely(pure_left)) { /* newindx == split_indx == 0 */ - TRACE("no-split, but add new pure page at the %s", "left/before"); + TRACE("pure-left: no-split, but add new pure page at the %s", + "left/before"); cASSERT(mc, newindx == 0 && split_indx == 0 && minkeys == 1); - TRACE("old-first-key is %s", DKEY_DEBUG(&sepkey)); + TRACE("pure-left: old-first-key is %s", DKEY_DEBUG(&sepkey)); } else { if (IS_LEAF2(sister)) { /* Move half of the keys to the right sibling */ @@ -24737,6 +24901,7 @@ static int page_split(MDBX_cursor *mc, const MDBX_val *const newkey, mp->mp_lower += sizeof(indx_t); cASSERT(mc, mp->mp_upper >= ksize - sizeof(indx_t)); mp->mp_upper -= (indx_t)(ksize - sizeof(indx_t)); + cASSERT(mc, (((ksize & page_numkeys(mp)) ^ mp->mp_upper) & 1) == 0); } else { memcpy(sister->mp_ptrs, split, distance * ksize); void *const ins = page_leaf2key(sister, distance, ksize); @@ -24749,6 +24914,8 @@ static int page_split(MDBX_cursor *mc, const MDBX_val *const newkey, sister->mp_upper -= (indx_t)(ksize - sizeof(indx_t)); cASSERT(mc, distance <= (int)UINT16_MAX); mc->mc_ki[mc->mc_top] = (indx_t)distance; + cASSERT(mc, + (((ksize & page_numkeys(sister)) ^ sister->mp_upper) & 1) == 0); } if (AUDIT_ENABLED()) { @@ -24917,18 +25084,20 @@ static int page_split(MDBX_cursor *mc, const MDBX_val *const newkey, } } else if (unlikely(pure_left)) { MDBX_page *ptop_page = mc->mc_pg[ptop]; - DEBUG("adding to parent page %u node[%u] left-leaf page #%u key %s", + TRACE("pure-left: adding to parent page %u node[%u] left-leaf page #%u key " + "%s", ptop_page->mp_pgno, mc->mc_ki[ptop], sister->mp_pgno, DKEY(mc->mc_ki[ptop] ? 
newkey : NULL)); - mc->mc_top--; + assert(mc->mc_top == ptop + 1); + mc->mc_top = (uint8_t)ptop; rc = node_add_branch(mc, mc->mc_ki[ptop], mc->mc_ki[ptop] ? newkey : NULL, sister->mp_pgno); cASSERT(mc, mp == mc->mc_pg[ptop + 1] && newindx == mc->mc_ki[ptop + 1] && ptop == mc->mc_top); if (likely(rc == MDBX_SUCCESS) && mc->mc_ki[ptop] == 0) { - DEBUG("update prev-first key on parent %s", DKEY(&sepkey)); MDBX_node *node = page_node(mc->mc_pg[ptop], 1); + TRACE("pure-left: update prev-first key on parent to %s", DKEY(&sepkey)); cASSERT(mc, node_ks(node) == 0 && node_pgno(node) == mp->mp_pgno); cASSERT(mc, mc->mc_top == ptop && mc->mc_ki[ptop] == 0); mc->mc_ki[ptop] = 1; @@ -24936,6 +25105,9 @@ static int page_split(MDBX_cursor *mc, const MDBX_val *const newkey, cASSERT(mc, mc->mc_top == ptop && mc->mc_ki[ptop] == 1); cASSERT(mc, mp == mc->mc_pg[ptop + 1] && newindx == mc->mc_ki[ptop + 1]); mc->mc_ki[ptop] = 0; + } else { + TRACE("pure-left: no-need-update prev-first key on parent %s", + DKEY(&sepkey)); } mc->mc_top++; @@ -24984,7 +25156,7 @@ static int page_split(MDBX_cursor *mc, const MDBX_val *const newkey, &sepkey); if (mc->mc_dbx->md_cmp(newkey, &sepkey) < 0) { mc->mc_top -= (uint8_t)i; - DEBUG("update new-first on parent [%i] page %u key %s", + DEBUG("pure-left: update new-first on parent [%i] page %u key %s", mc->mc_ki[mc->mc_top], mc->mc_pg[mc->mc_top]->mp_pgno, DKEY(newkey)); rc = update_key(mc, newkey); @@ -24995,7 +25167,7 @@ static int page_split(MDBX_cursor *mc, const MDBX_val *const newkey, break; } } - } else if (tmp_ki_copy /* !IS_LEAF2(mp) */) { + } else if (tmp_ki_copy) { /* !IS_LEAF2(mp) */ /* Move nodes */ mc->mc_pg[mc->mc_top] = sister; i = split_indx; @@ -25114,7 +25286,7 @@ static int page_split(MDBX_cursor *mc, const MDBX_val *const newkey, m3->mc_ki[k + 1] = m3->mc_ki[k]; m3->mc_pg[k + 1] = m3->mc_pg[k]; } - m3->mc_ki[0] = m3->mc_ki[0] >= nkeys; + m3->mc_ki[0] = m3->mc_ki[0] >= nkeys + pure_left; m3->mc_pg[0] = mc->mc_pg[0]; m3->mc_snum++; 
m3->mc_top++; @@ -27566,8 +27738,7 @@ __cold static int walk_tree(mdbx_walk_ctx_t *ctx, const pgno_t pgno, (mp ? page_room(mp) : pagesize - header_size) - payload_size; size_t align_bytes = 0; - for (size_t i = 0; err == MDBX_SUCCESS && i < nentries; - align_bytes += ((payload_size + align_bytes) & 1), ++i) { + for (size_t i = 0; err == MDBX_SUCCESS && i < nentries; ++i) { if (type == MDBX_page_dupfixed_leaf) { /* LEAF2 pages have no mp_ptrs[] or node headers */ payload_size += mp->mp_leaf2_ksize; @@ -27575,23 +27746,26 @@ __cold static int walk_tree(mdbx_walk_ctx_t *ctx, const pgno_t pgno, } MDBX_node *node = page_node(mp, i); - payload_size += NODESIZE + node_ks(node); + const size_t node_key_size = node_ks(node); + payload_size += NODESIZE + node_key_size; if (type == MDBX_page_branch) { assert(i > 0 || node_ks(node) == 0); + align_bytes += node_key_size & 1; continue; } + const size_t node_data_size = node_ds(node); assert(type == MDBX_page_leaf); switch (node_flags(node)) { case 0 /* usual node */: - payload_size += node_ds(node); + payload_size += node_data_size; + align_bytes += (node_key_size + node_data_size) & 1; break; case F_BIGDATA /* long data on the large/overflow page */: { - payload_size += sizeof(pgno_t); const pgno_t large_pgno = node_largedata_pgno(node); - const size_t over_payload = node_ds(node); + const size_t over_payload = node_data_size; const size_t over_header = PAGEHDRSZ; npages = 1; @@ -27610,27 +27784,31 @@ __cold static int walk_tree(mdbx_walk_ctx_t *ctx, const pgno_t pgno, over_payload, over_header, over_unused); if (unlikely(rc != MDBX_SUCCESS)) return (rc == MDBX_RESULT_TRUE) ? 
MDBX_SUCCESS : rc; + payload_size += sizeof(pgno_t); + align_bytes += node_key_size & 1; } break; case F_SUBDATA /* sub-db */: { - const size_t namelen = node_ks(node); - payload_size += node_ds(node); - if (unlikely(namelen == 0 || node_ds(node) != sizeof(MDBX_db))) { + const size_t namelen = node_key_size; + if (unlikely(namelen == 0 || node_data_size != sizeof(MDBX_db))) { assert(err == MDBX_CORRUPTED); err = MDBX_CORRUPTED; } + header_size += node_data_size; + align_bytes += (node_key_size + node_data_size) & 1; } break; case F_SUBDATA | F_DUPDATA /* dupsorted sub-tree */: - payload_size += sizeof(MDBX_db); - if (unlikely(node_ds(node) != sizeof(MDBX_db))) { + if (unlikely(node_data_size != sizeof(MDBX_db))) { assert(err == MDBX_CORRUPTED); err = MDBX_CORRUPTED; } + header_size += node_data_size; + align_bytes += (node_key_size + node_data_size) & 1; break; case F_DUPDATA /* short sub-page */: { - if (unlikely(node_ds(node) <= PAGEHDRSZ)) { + if (unlikely(node_data_size <= PAGEHDRSZ || (node_data_size & 1))) { assert(err == MDBX_CORRUPTED); err = MDBX_CORRUPTED; break; @@ -27658,16 +27836,17 @@ __cold static int walk_tree(mdbx_walk_ctx_t *ctx, const pgno_t pgno, err = MDBX_CORRUPTED; } - for (size_t j = 0; err == MDBX_SUCCESS && j < nsubkeys; - subalign_bytes += ((subpayload_size + subalign_bytes) & 1), ++j) { - + for (size_t j = 0; err == MDBX_SUCCESS && j < nsubkeys; ++j) { if (subtype == MDBX_subpage_dupfixed_leaf) { /* LEAF2 pages have no mp_ptrs[] or node headers */ subpayload_size += sp->mp_leaf2_ksize; } else { assert(subtype == MDBX_subpage_leaf); - MDBX_node *subnode = page_node(sp, j); - subpayload_size += NODESIZE + node_ks(subnode) + node_ds(subnode); + const MDBX_node *subnode = page_node(sp, j); + const size_t subnode_size = node_ks(subnode) + node_ds(subnode); + subheader_size += NODESIZE; + subpayload_size += subnode_size; + subalign_bytes += subnode_size & 1; if (unlikely(node_flags(subnode) != 0)) { assert(err == MDBX_CORRUPTED); err = 
MDBX_CORRUPTED; @@ -27676,7 +27855,7 @@ __cold static int walk_tree(mdbx_walk_ctx_t *ctx, const pgno_t pgno, } const int rc = - ctx->mw_visitor(pgno, 0, ctx->mw_user, deep + 1, name, node_ds(node), + ctx->mw_visitor(pgno, 0, ctx->mw_user, deep + 1, name, node_data_size, subtype, err, nsubkeys, subpayload_size, subheader_size, subunused_size + subalign_bytes); if (unlikely(rc != MDBX_SUCCESS)) @@ -27684,7 +27863,7 @@ __cold static int walk_tree(mdbx_walk_ctx_t *ctx, const pgno_t pgno, header_size += subheader_size; unused_size += subunused_size; payload_size += subpayload_size; - align_bytes += subalign_bytes; + align_bytes += subalign_bytes + (node_key_size & 1); } break; default: @@ -32401,7 +32580,7 @@ retry_mapview:; ptr_disp(map->base, size), ((map->current < map->limit) ? map->current : map->limit) - size); } - map->current = size; + map->current = (size < map->limit) ? size : map->limit; } if (limit == map->limit) @@ -32562,6 +32741,7 @@ retry_mapview:; map->base = ptr; } map->limit = limit; + map->current = size; #if MDBX_ENABLE_MADVISE #ifdef MADV_DONTFORK @@ -33379,9 +33559,9 @@ __dll_export 0, 12, 9, - 0, - {"2023-12-11T23:24:05+03:00", "44ee35910be133a64a24525537f125bca0d5e037", "185e43f3a86b6d62482c933a1062a3e95c82b93c", - "v0.12.9-0-g185e43f3"}, + 16, + {"2024-03-06T22:58:31+03:00", "c5e6e3a4f75727b9e0039ad420ae167d3487d006", "fff3fbd866c50ee3c77b244a9b05f497e06a65e8", + "v0.12.9-16-gfff3fbd8"}, sourcery}; __dll_export diff --git a/mdbxdist/ChangeLog.md b/mdbxdist/ChangeLog.md index 6978e6b..0934d9f 100644 --- a/mdbxdist/ChangeLog.md +++ b/mdbxdist/ChangeLog.md @@ -4,6 +4,78 @@ ChangeLog English version [by Google](https://gitflic-ru.translate.goog/project/erthink/libmdbx/blob?file=ChangeLog.md&_x_tr_sl=ru&_x_tr_tl=en) and [by Yandex](https://translated.turbopages.org/proxy_u/ru-en.en/https/gitflic.ru/project/erthink/libmdbx/blob?file=ChangeLog.md). 
+## v0.12.10 (подготовка к выпуску) + +Поддерживающий выпуск с исправлением обнаруженных ошибок и устранением недочетов. + +``` +git diff' stat: 16 commits, 10 files changed, 665 insertions(+), 238 deletions(-) +Signed-off-by: Леонид Юрьев (Leonid Yuriev) +``` + +Благодарности: + + - [Dvir H](https://t.me/Dvirsw) за [сообщение](https://t.me/libmdbx/5368) + об ошибке `MDBX_CORRUPTED` в сценарии работы в режиме `MDBX_DUPFIXED` и нечетной длиной + мульти-значений, с предоставлением точного минимального сценария воспроизведения. + +Значимые исправления и доработки: + + - Устранение унаследованной от LMDB ошибки, приводящей к повреждению БД при использовании `MDBX_DUPFIXED`. + Проблема была выявлена при расширении тестов сценариями с длинными мульти-значениями в режиме `MDBX_DUPFIXED`. + Сообщений о проявлении этой ошибки в эксплуатационных/продуктовых средах не поступало. + + На LEAF2-страницах, используемых для компактного хранения мульти-значений фиксированной длины, + выполнялось резервирование места без учета возможности превышения размера страниц, + с последующим переполнением, повреждением структуры БД и/или повреждением содержимого ОЗУ. + + Вероятность проявления ошибки существенно увеличивалась с увеличением размера/длины + мульти-значений/дубликатов и уменьшением размера страницы БД. Поэтому при использовании `MDBX_INTEGERDUP` + вероятность проявления близка к нулю, а сценарий такого проявления найти не удалось. + + В MDBX ошибка присутствовала с момента отделения проекта от LMDB, + где эта ошибка присутствует более 11 лет, по настоящее время. + + - Исправление ложной ошибки `MDBX_CORRUPTED (-30796)` в сценарии работы + в режиме `MDBX_DUPFIXED` и нечетной длиной мульти-значений. + + - Исправление недочета корректировки сопутствующих курсоров при разделении страницы + по сценарию добавления пустой страницы слева. + + - Доработка `rebalance()` ради уменьшения WAF.
Новый функционал, включая + контролируемую пользователем опцию `enum MDBX_option_t`, будет доступен + в выпусках ветки `0.13.x`, а в этом выпуске доработка сводится к тактике + не-вовлечения чистой страницы при нехватке запаса страниц в ходе обновления GC, + за счет ухудшения баланса дерева страниц. + + - Устранение упущения, приводящего к нелогичной ситуации + `me_dxb_mmap.current > me_dxb_mmap.limit` при "дребезге" размера БД. + В текущем понимании, последствий, кроме срабатывания assert-проверки, нет, а + вероятность проявления близка к нулю. + + - Исправление в функционале обхода дерева, используемого утилитой + `mdbx_chk`, подсчета места, затраченного на выравнивание на вложенной + под-странице, в случае нечетного количества dupfixed-элементов нечетного + размера. + Сообщений о проявлении этой ошибки в эксплуатационных/продуктовых средах не поступало. + + - Исправление assert-проверки внутри `check_txn()` для случая завершенных транзакций в режиме `MDBX_NOTLS`. + Последствий ошибки, кроме срабатывания assert-проверки в отладочных сборках, нет. + +Прочее: + + - Расширение стохастического теста dupfixed-сценариями. + - Корректировка условия в assert-проверке для `MDBX_TXN_DRAINED_GC`. + - Добавление в jitter-сценарий простого теста сброса и перезапуска читающих транзакций. + - Вынужденное продолжение очистки/рефакторинга унаследованных ребусов в `cursor_put_nochecklen()`. + - Фиксация транзакции при ошибках теста для последующего анализа содержимого БД. + - Сопутствующий рефакторинг `node_shrink()` для ясности исходного кода. + - Приведение в соответствие протоколируемых имен тестов опциям командной строки. + - Добавление smoke-теста `extra/dupfixed_addodd`.
+ + +-------------------------------------------------------------------------------- + ## v0.12.9 "Ясень-4" от 2023-12-11 diff --git a/mdbxdist/VERSION.txt b/mdbxdist/VERSION.txt index 84a2b44..d14e996 100644 --- a/mdbxdist/VERSION.txt +++ b/mdbxdist/VERSION.txt @@ -1 +1 @@ -0.12.9.0 +0.12.9.16 diff --git a/mdbxdist/mdbx.c b/mdbxdist/mdbx.c index 2317ba8..4a6ea35 100644 --- a/mdbxdist/mdbx.c +++ b/mdbxdist/mdbx.c @@ -12,7 +12,7 @@ * . */ #define xMDBX_ALLOY 1 -#define MDBX_BUILD_SOURCERY a3bc805b6c34de756a896bd408af909f74b858903ba60515d7c218c0181e020b_v0_12_9_0_g185e43f3 +#define MDBX_BUILD_SOURCERY 0c86daff919dc09ab5e99c8e5bfb1d51e1367babca9c630c97af92a2aa988ea9_v0_12_9_16_gfff3fbd8 #ifdef MDBX_CONFIG_H #include MDBX_CONFIG_H #endif @@ -3628,8 +3628,12 @@ struct MDBX_env { struct MDBX_lockinfo *me_lck; unsigned me_psize; /* DB page size, initialized from me_os_psize */ - unsigned me_leaf_nodemax; /* max size of a leaf-node */ - unsigned me_branch_nodemax; /* max size of a branch-node */ + uint16_t me_leaf_nodemax; /* max size of a leaf-node */ + uint16_t me_branch_nodemax; /* max size of a branch-node */ + uint16_t me_subpage_limit; + uint16_t me_subpage_room_threshold; + uint16_t me_subpage_reserve_prereq; + uint16_t me_subpage_reserve_limit; atomic_pgno_t me_mlocked_pgno; uint8_t me_psize2log; /* log2 of DB page size */ int8_t me_stuck_meta; /* recovery-only: target meta page or less that zero */ @@ -7371,7 +7375,7 @@ static int __must_check_result node_add_leaf2(MDBX_cursor *mc, size_t indx, const MDBX_val *key); static void node_del(MDBX_cursor *mc, size_t ksize); -static void node_shrink(MDBX_page *mp, size_t indx); +static MDBX_node *node_shrink(MDBX_page *mp, size_t indx, MDBX_node *node); static int __must_check_result node_move(MDBX_cursor *csrc, MDBX_cursor *cdst, bool fromleft); static int __must_check_result node_read(MDBX_cursor *mc, const MDBX_node *leaf, @@ -10232,6 +10236,7 @@ __cold static int dxb_resize(MDBX_env *const env, const pgno_t 
used_pgno, #endif /* MDBX_ENABLE_MADVISE */ rc = osal_mresize(mresize_flags, &env->me_dxb_mmap, size_bytes, limit_bytes); + eASSERT(env, env->me_dxb_mmap.limit >= env->me_dxb_mmap.current); #if MDBX_ENABLE_MADVISE if (rc == MDBX_SUCCESS) { @@ -10257,6 +10262,7 @@ __cold static int dxb_resize(MDBX_env *const env, const pgno_t used_pgno, bailout: if (rc == MDBX_SUCCESS) { + eASSERT(env, env->me_dxb_mmap.limit >= env->me_dxb_mmap.current); eASSERT(env, limit_bytes == env->me_dxb_mmap.limit); eASSERT(env, size_bytes <= env->me_dxb_mmap.filesize); if (mode == explicit_resize) @@ -10287,6 +10293,7 @@ __cold static int dxb_resize(MDBX_env *const env, const pgno_t used_pgno, "present %" PRIuPTR " -> %" PRIuPTR ", " "limit %" PRIuPTR " -> %" PRIuPTR ", errcode %d", prev_size, size_bytes, prev_limit, limit_bytes, rc); + eASSERT(env, env->me_dxb_mmap.limit >= env->me_dxb_mmap.current); } if (!env->me_dxb_mmap.base) { env->me_flags |= MDBX_FATAL_ERROR; @@ -11324,7 +11331,7 @@ static pgr_t page_alloc_slowpath(const MDBX_cursor *const mc, const size_t num, //--------------------------------------------------------------------------- if (unlikely(!is_gc_usable(txn, mc, flags))) { - eASSERT(env, txn->mt_flags & MDBX_TXN_DRAINED_GC); + eASSERT(env, (txn->mt_flags & MDBX_TXN_DRAINED_GC) || num > 1); goto no_gc; } @@ -11986,7 +11993,7 @@ __hot static int page_touch(MDBX_cursor *mc) { np->mp_txnid = txn->mt_front; return MDBX_SUCCESS; } - tASSERT(txn, !IS_OVERFLOW(mp)); + tASSERT(txn, !IS_OVERFLOW(mp) && !IS_SUBP(mp)); if (IS_FROZEN(txn, mp)) { /* CoW the page */ @@ -13098,6 +13105,7 @@ static int txn_renew(MDBX_txn *txn, const unsigned flags) { const size_t used_bytes = pgno2bytes(env, txn->mt_next_pgno); const size_t required_bytes = (txn->mt_flags & MDBX_TXN_RDONLY) ? 
used_bytes : size_bytes; + eASSERT(env, env->me_dxb_mmap.limit >= env->me_dxb_mmap.current); if (unlikely(required_bytes > env->me_dxb_mmap.current)) { /* Размер БД (для пишущих транзакций) или используемых данных (для * читающих транзакций) больше предыдущего/текущего размера внутри @@ -13115,6 +13123,7 @@ static int txn_renew(MDBX_txn *txn, const unsigned flags) { txn->mt_geo.upper, implicit_grow); if (unlikely(rc != MDBX_SUCCESS)) goto bailout; + eASSERT(env, env->me_dxb_mmap.limit >= env->me_dxb_mmap.current); } else if (unlikely(size_bytes < env->me_dxb_mmap.current)) { /* Размер БД меньше предыдущего/текущего размера внутри процесса, можно * уменьшить, но всё сложнее: @@ -13140,11 +13149,15 @@ static int txn_renew(MDBX_txn *txn, const unsigned flags) { rc = osal_fastmutex_acquire(&env->me_remap_guard); #endif if (likely(rc == MDBX_SUCCESS)) { + eASSERT(env, env->me_dxb_mmap.limit >= env->me_dxb_mmap.current); rc = osal_filesize(env->me_dxb_mmap.fd, &env->me_dxb_mmap.filesize); if (likely(rc == MDBX_SUCCESS)) { eASSERT(env, env->me_dxb_mmap.filesize >= required_bytes); if (env->me_dxb_mmap.current > env->me_dxb_mmap.filesize) - env->me_dxb_mmap.current = (size_t)env->me_dxb_mmap.filesize; + env->me_dxb_mmap.current = + (env->me_dxb_mmap.limit < env->me_dxb_mmap.filesize) + ? env->me_dxb_mmap.limit + : (size_t)env->me_dxb_mmap.filesize; } #if defined(_WIN32) || defined(_WIN64) osal_srwlock_ReleaseShared(&env->me_remap_guard); @@ -13209,10 +13222,11 @@ static __always_inline int check_txn(const MDBX_txn *txn, int bad_bits) { if (unlikely(txn->mt_flags & bad_bits)) return MDBX_BAD_TXN; - tASSERT(txn, (txn->mt_flags & MDBX_NOTLS) == - ((txn->mt_flags & MDBX_TXN_RDONLY) - ? txn->mt_env->me_flags & MDBX_NOTLS - : 0)); + tASSERT(txn, (txn->mt_flags & MDBX_TXN_FINISHED) || + (txn->mt_flags & MDBX_NOTLS) == + ((txn->mt_flags & MDBX_TXN_RDONLY) + ? 
txn->mt_env->me_flags & MDBX_NOTLS + : 0)); #if MDBX_TXN_CHECKOWNER STATIC_ASSERT(MDBX_NOTLS > MDBX_TXN_FINISHED + MDBX_TXN_RDONLY); if (unlikely(txn->mt_owner != osal_thread_self()) && @@ -14271,7 +14285,9 @@ static int gcu_prepare_backlog(MDBX_txn *txn, gcu_context_t *ctx) { const size_t for_all_before_touch = for_relist + for_tree_before_touch; const size_t for_all_after_touch = for_relist + for_tree_after_touch; - if (likely(for_relist < 2 && gcu_backlog_size(txn) > for_all_before_touch)) + if (likely(for_relist < 2 && gcu_backlog_size(txn) > for_all_before_touch) && + (ctx->cursor.mc_snum == 0 || + IS_MODIFIABLE(txn, ctx->cursor.mc_pg[ctx->cursor.mc_top]))) return MDBX_SUCCESS; TRACE(">> retired-stored %zu, left %zi, backlog %zu, need %zu (4list %zu, " @@ -17115,13 +17131,23 @@ __cold static void setup_pagesize(MDBX_env *env, const size_t pagesize) { leaf_nodemax > (intptr_t)(sizeof(MDBX_db) + NODESIZE + 42) && leaf_nodemax >= branch_nodemax && leaf_nodemax < (int)UINT16_MAX && leaf_nodemax % 2 == 0); - env->me_leaf_nodemax = (unsigned)leaf_nodemax; - env->me_branch_nodemax = (unsigned)branch_nodemax; + env->me_leaf_nodemax = (uint16_t)leaf_nodemax; + env->me_branch_nodemax = (uint16_t)branch_nodemax; env->me_psize2log = (uint8_t)log2n_powerof2(pagesize); eASSERT(env, pgno2bytes(env, 1) == pagesize); eASSERT(env, bytes2pgno(env, pagesize + pagesize) == 2); recalculate_merge_threshold(env); + /* TODO: recalculate me_subpage_xyz values from MDBX_opt_subpage_xyz. 
*/ + env->me_subpage_limit = env->me_leaf_nodemax - NODESIZE; + env->me_subpage_room_threshold = 0; + env->me_subpage_reserve_prereq = env->me_leaf_nodemax; + env->me_subpage_reserve_limit = env->me_subpage_limit / 42; + eASSERT(env, + env->me_subpage_reserve_prereq > + env->me_subpage_room_threshold + env->me_subpage_reserve_limit); + eASSERT(env, env->me_leaf_nodemax >= env->me_subpage_limit + NODESIZE); + const pgno_t max_pgno = bytes2pgno(env, MAX_MAPSIZE); if (!env->me_options.flags.non_auto.dp_limit) { /* auto-setup dp_limit by "The42" ;-) */ @@ -19728,8 +19754,12 @@ __hot static __always_inline int page_get_checker_lite(const uint16_t ILL, if (((ILL & P_OVERFLOW) || !IS_OVERFLOW(page)) && (ILL & (P_BRANCH | P_LEAF | P_LEAF2)) == 0) { - if (unlikely(page->mp_upper < page->mp_lower || - ((page->mp_lower | page->mp_upper) & 1) || + /* Контроль четности page->mp_upper тут либо приводит к ложным ошибкам, + * либо слишком дорог по количеству операций. Заковырка в том, что mp_upper + * может быть нечетным на LEAF2-страницах, при нечетном количестве элементов + * нечетной длины. Поэтому четность page->mp_upper здесь не проверяется, но + * соответствующие полные проверки есть в page_check(). 
*/ + if (unlikely(page->mp_upper < page->mp_lower || (page->mp_lower & 1) || PAGEHDRSZ + page->mp_upper > txn->mt_env->me_psize)) return bad_page(page, "invalid page' lower(%u)/upper(%u) with limit %zu\n", @@ -21309,6 +21339,26 @@ static __hot int cursor_touch(MDBX_cursor *const mc, const MDBX_val *key, return rc; } +static size_t leaf2_reserve(const MDBX_env *const env, size_t host_page_room, + size_t subpage_len, size_t item_len) { + eASSERT(env, (subpage_len & 1) == 0); + eASSERT(env, + env->me_subpage_reserve_prereq > env->me_subpage_room_threshold + + env->me_subpage_reserve_limit && + env->me_leaf_nodemax >= env->me_subpage_limit + NODESIZE); + size_t reserve = 0; + for (size_t n = 0; + n < 5 && reserve + item_len <= env->me_subpage_reserve_limit && + EVEN(subpage_len + item_len) <= env->me_subpage_limit && + host_page_room >= + env->me_subpage_reserve_prereq + EVEN(subpage_len + item_len); + ++n) { + subpage_len += item_len; + reserve += item_len; + } + return reserve + (subpage_len & 1); +} + static __hot int cursor_put_nochecklen(MDBX_cursor *mc, const MDBX_val *key, MDBX_val *data, unsigned flags) { int err; @@ -21376,11 +21426,11 @@ static __hot int cursor_put_nochecklen(MDBX_cursor *mc, const MDBX_val *key, rc = MDBX_NO_ROOT; } else if ((flags & MDBX_CURRENT) == 0) { bool exact = false; - MDBX_val lastkey, olddata; + MDBX_val last_key, old_data; if ((flags & MDBX_APPEND) && mc->mc_db->md_entries > 0) { - rc = cursor_last(mc, &lastkey, &olddata); + rc = cursor_last(mc, &last_key, &old_data); if (likely(rc == MDBX_SUCCESS)) { - const int cmp = mc->mc_dbx->md_cmp(key, &lastkey); + const int cmp = mc->mc_dbx->md_cmp(key, &last_key); if (likely(cmp > 0)) { mc->mc_ki[mc->mc_top]++; /* step forward for appending */ rc = MDBX_NOTFOUND; @@ -21395,7 +21445,7 @@ static __hot int cursor_put_nochecklen(MDBX_cursor *mc, const MDBX_val *key, } else { struct cursor_set_result csr = /* olddata may not be updated in case LEAF2-page of dupfixed-subDB */ - cursor_set(mc, 
(MDBX_val *)key, &olddata, MDBX_SET); + cursor_set(mc, (MDBX_val *)key, &old_data, MDBX_SET); rc = csr.err; exact = csr.exact; } @@ -21403,14 +21453,14 @@ static __hot int cursor_put_nochecklen(MDBX_cursor *mc, const MDBX_val *key, if (exact) { if (unlikely(flags & MDBX_NOOVERWRITE)) { DEBUG("duplicate key [%s]", DKEY_DEBUG(key)); - *data = olddata; + *data = old_data; return MDBX_KEYEXIST; } if (unlikely(mc->mc_flags & C_SUB)) { /* nested subtree of DUPSORT-database with the same key, * nothing to update */ eASSERT(env, data->iov_len == 0 && - (olddata.iov_len == 0 || + (old_data.iov_len == 0 || /* olddata may not be updated in case LEAF2-page of dupfixed-subDB */ (mc->mc_db->md_flags & MDBX_DUPFIXED))); @@ -21426,8 +21476,8 @@ static __hot int cursor_put_nochecklen(MDBX_cursor *mc, const MDBX_val *key, exact = false; } else if (!(flags & (MDBX_RESERVE | MDBX_MULTIPLE))) { /* checking for early exit without dirtying pages */ - if (unlikely(eq_fast(data, &olddata))) { - cASSERT(mc, mc->mc_dbx->md_dcmp(data, &olddata) == 0); + if (unlikely(eq_fast(data, &old_data))) { + cASSERT(mc, mc->mc_dbx->md_dcmp(data, &old_data) == 0); if (mc->mc_xcursor) { if (flags & MDBX_NODUPDATA) return MDBX_KEYEXIST; @@ -21437,7 +21487,7 @@ static __hot int cursor_put_nochecklen(MDBX_cursor *mc, const MDBX_val *key, /* the same data, nothing to update */ return MDBX_SUCCESS; } - cASSERT(mc, mc->mc_dbx->md_dcmp(data, &olddata) != 0); + cASSERT(mc, mc->mc_dbx->md_dcmp(data, &old_data) != 0); } } } else if (unlikely(rc != MDBX_NOTFOUND)) @@ -21445,17 +21495,16 @@ static __hot int cursor_put_nochecklen(MDBX_cursor *mc, const MDBX_val *key, } mc->mc_flags &= ~C_DEL; - MDBX_val xdata, *rdata = data; - size_t mcount = 0, dcount = 0; + MDBX_val xdata, *ref_data = data; + size_t *batch_dupfixed_done = nullptr, batch_dupfixed_given = 0; if (unlikely(flags & MDBX_MULTIPLE)) { - dcount = data[1].iov_len; - data[1].iov_len = 0 /* reset done item counter */; - rdata = &xdata; - xdata.iov_len = 
data->iov_len * dcount; + batch_dupfixed_given = data[1].iov_len; + batch_dupfixed_done = &data[1].iov_len; + *batch_dupfixed_done = 0; } /* Cursor is positioned, check for room in the dirty list */ - err = cursor_touch(mc, key, rdata); + err = cursor_touch(mc, key, ref_data); if (unlikely(err)) return err; @@ -21484,13 +21533,13 @@ static __hot int cursor_put_nochecklen(MDBX_cursor *mc, const MDBX_val *key, mc->mc_xcursor->mx_dbx.md_klen_min = mc->mc_xcursor->mx_dbx.md_klen_max = data->iov_len); + if (mc->mc_flags & C_SUB) + npr.page->mp_flags |= P_LEAF2; } - if ((mc->mc_db->md_flags & (MDBX_DUPSORT | MDBX_DUPFIXED)) == MDBX_DUPFIXED) - npr.page->mp_flags |= P_LEAF2; mc->mc_flags |= C_INITIALIZED; } - MDBX_val dkey, olddata; + MDBX_val old_singledup, old_data; MDBX_db nested_dupdb; MDBX_page *sub_root = nullptr; bool insert_key, insert_data; @@ -21498,19 +21547,19 @@ static __hot int cursor_put_nochecklen(MDBX_cursor *mc, const MDBX_val *key, MDBX_page *fp = env->me_pbuf; fp->mp_txnid = mc->mc_txn->mt_front; insert_key = insert_data = (rc != MDBX_SUCCESS); - dkey.iov_base = nullptr; + old_singledup.iov_base = nullptr; if (insert_key) { /* The key does not exist */ DEBUG("inserting key at index %i", mc->mc_ki[mc->mc_top]); if ((mc->mc_db->md_flags & MDBX_DUPSORT) && node_size(key, data) > env->me_leaf_nodemax) { /* Too big for a node, insert in sub-DB. Set up an empty - * "old sub-page" for prep_subDB to expand to a full page. */ + * "old sub-page" for convert_to_subtree to expand to a full page. */ fp->mp_leaf2_ksize = (mc->mc_db->md_flags & MDBX_DUPFIXED) ? 
(uint16_t)data->iov_len : 0; fp->mp_lower = fp->mp_upper = 0; - olddata.iov_len = PAGEHDRSZ; - goto prep_subDB; + old_data.iov_len = PAGEHDRSZ; + goto convert_to_subtree; } } else { /* there's only a key anyway, so this is a no-op */ @@ -21555,7 +21604,8 @@ static __hot int cursor_put_nochecklen(MDBX_cursor *mc, const MDBX_val *key, if (unlikely(err != MDBX_SUCCESS)) return err; } - MDBX_node *node = page_node(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); + MDBX_node *const node = + page_node(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); /* Large/Overflow page overwrites need special handling */ if (unlikely(node_flags(node) & F_BIGDATA)) { @@ -21629,19 +21679,18 @@ static __hot int cursor_put_nochecklen(MDBX_cursor *mc, const MDBX_val *key, if ((err = page_retire(mc, lp.page)) != MDBX_SUCCESS) return err; } else { - olddata.iov_len = node_ds(node); - olddata.iov_base = node_data(node); - cASSERT(mc, ptr_disp(olddata.iov_base, olddata.iov_len) <= + old_data.iov_len = node_ds(node); + old_data.iov_base = node_data(node); + cASSERT(mc, ptr_disp(old_data.iov_base, old_data.iov_len) <= ptr_disp(mc->mc_pg[mc->mc_top], env->me_psize)); /* DB has dups? */ if (mc->mc_db->md_flags & MDBX_DUPSORT) { /* Prepare (sub-)page/sub-DB to accept the new item, if needed. * fp: old sub-page or a header faking it. - * mp: new (sub-)page. offset: growth in page size. - * xdata: node data with new page or DB. */ - size_t i; - size_t offset = 0; + * mp: new (sub-)page. + * xdata: node data with new sub-page or sub-DB. */ + size_t growth = 0; /* growth in page size.*/ MDBX_page *mp = fp = xdata.iov_base = env->me_pbuf; mp->mp_pgno = mc->mc_pg[mc->mc_top]->mp_pgno; @@ -21649,19 +21698,19 @@ static __hot int cursor_put_nochecklen(MDBX_cursor *mc, const MDBX_val *key, if (!(node_flags(node) & F_DUPDATA)) { /* does data match? 
*/ if (flags & MDBX_APPENDDUP) { - const int cmp = mc->mc_dbx->md_dcmp(data, &olddata); - cASSERT(mc, cmp != 0 || eq_fast(data, &olddata)); + const int cmp = mc->mc_dbx->md_dcmp(data, &old_data); + cASSERT(mc, cmp != 0 || eq_fast(data, &old_data)); if (unlikely(cmp <= 0)) return MDBX_EKEYMISMATCH; - } else if (eq_fast(data, &olddata)) { - cASSERT(mc, mc->mc_dbx->md_dcmp(data, &olddata) == 0); + } else if (eq_fast(data, &old_data)) { + cASSERT(mc, mc->mc_dbx->md_dcmp(data, &old_data) == 0); if (flags & MDBX_NODUPDATA) return MDBX_KEYEXIST; /* data is match exactly byte-to-byte, nothing to update */ rc = MDBX_SUCCESS; - if (likely((flags & MDBX_MULTIPLE) == 0)) - return rc; - goto continue_multiple; + if (unlikely(batch_dupfixed_done)) + goto batch_dupfixed_continue; + return rc; } /* Just overwrite the current item */ @@ -21671,62 +21720,143 @@ static __hot int cursor_put_nochecklen(MDBX_cursor *mc, const MDBX_val *key, } /* Back up original data item */ - memcpy(dkey.iov_base = fp + 1, olddata.iov_base, - dkey.iov_len = olddata.iov_len); + memcpy(old_singledup.iov_base = fp + 1, old_data.iov_base, + old_singledup.iov_len = old_data.iov_len); /* Make sub-page header for the dup items, with dummy body */ fp->mp_flags = P_LEAF | P_SUBP; fp->mp_lower = 0; - xdata.iov_len = PAGEHDRSZ + dkey.iov_len + data->iov_len; + xdata.iov_len = PAGEHDRSZ + old_data.iov_len + data->iov_len; if (mc->mc_db->md_flags & MDBX_DUPFIXED) { fp->mp_flags |= P_LEAF2; fp->mp_leaf2_ksize = (uint16_t)data->iov_len; - xdata.iov_len += 2 * data->iov_len; /* leave space for 2 more */ - cASSERT(mc, xdata.iov_len <= env->me_psize); + /* Будем создавать LEAF2-страницу, как минимум с двумя элементами. + * При коротких значениях и наличии свободного места можно сделать + * некоторое резервирование места, чтобы при последующих добавлениях + * не сразу расширять созданную под-страницу. 
+ * Резервирование в целом сомнительно (см ниже), но может сработать + * в плюс (а если в минус то несущественный) при коротких ключах. */ + xdata.iov_len += leaf2_reserve( + env, page_room(mc->mc_pg[mc->mc_top]) + old_data.iov_len, + xdata.iov_len, data->iov_len); + cASSERT(mc, (xdata.iov_len & 1) == 0); } else { xdata.iov_len += 2 * (sizeof(indx_t) + NODESIZE) + - (dkey.iov_len & 1) + (data->iov_len & 1); - cASSERT(mc, xdata.iov_len <= env->me_psize); + (old_data.iov_len & 1) + (data->iov_len & 1); } + cASSERT(mc, (xdata.iov_len & 1) == 0); fp->mp_upper = (uint16_t)(xdata.iov_len - PAGEHDRSZ); - olddata.iov_len = xdata.iov_len; /* pretend olddata is fp */ + old_data.iov_len = xdata.iov_len; /* pretend olddata is fp */ } else if (node_flags(node) & F_SUBDATA) { /* Data is on sub-DB, just store it */ flags |= F_DUPDATA | F_SUBDATA; - goto put_sub; + goto dupsort_put; } else { /* Data is on sub-page */ - fp = olddata.iov_base; + fp = old_data.iov_base; switch (flags) { default: - if (!(mc->mc_db->md_flags & MDBX_DUPFIXED)) { - offset = node_size(data, nullptr) + sizeof(indx_t); - break; - } - offset = fp->mp_leaf2_ksize; - if (page_room(fp) < offset) { - offset *= 4; /* space for 4 more */ - break; + growth = IS_LEAF2(fp) ? fp->mp_leaf2_ksize + : (node_size(data, nullptr) + sizeof(indx_t)); + if (page_room(fp) >= growth) { + /* На текущей под-странице есть место для добавления элемента. + * Оптимальнее продолжить использовать эту страницу, ибо + * добавление вложенного дерева увеличит WAF на одну страницу. */ + goto continue_subpage; } - /* FALLTHRU: Big enough MDBX_DUPFIXED sub-page */ - __fallthrough; + /* На текущей под-странице нет места для еще одного элемента. + * Можно либо увеличить эту под-страницу, либо вынести куст + * значений во вложенное дерево. + * + * Продолжать использовать текущую под-страницу возможно + * только пока и если размер после добавления элемента будет + * меньше me_leaf_nodemax. 
Соответственно, при превышении + * просто сразу переходим на вложенное дерево. */ + xdata.iov_len = old_data.iov_len + (growth += growth & 1); + if (xdata.iov_len > env->me_subpage_limit) + goto convert_to_subtree; + + /* Можно либо увеличить под-страницу, в том числе с некоторым + * запасом, либо перейти на вложенное поддерево. + * + * Резервирование места на под-странице представляется сомнительным: + * - Резервирование увеличит рыхлость страниц, в том числе + * вероятность разделения основной/гнездовой страницы; + * - Сложно предсказать полезный размер резервирования, + * особенно для не-MDBX_DUPFIXED; + * - Наличие резерва позволяет сэкономить только на перемещении + * части элементов основной/гнездовой страницы при последующих + * добавлениях в нее элементов. Причем после первого изменения + * размера под-страницы, её тело будет примыкать + * к неиспользуемому месту на основной/гнездовой странице, + * поэтому последующие последовательные добавления потребуют + * только передвижения в mp_ptrs[]. + * + * Соответственно, более важным/определяющим представляется + * своевременный переход к вложенному дереву, но тут достаточно + * сложный конфликт интересов: + * - При склонности к переходу к вложенным деревьям, суммарно + * в БД будет большее кол-во более рыхлых страниц. Это увеличит + * WAF, а также RAF при последовательных чтениях большой БД. + * Однако, при коротких ключах и большом кол-ве + * дубликатов/мультизначений, плотность ключей в листовых + * страницах основного дерева будет выше. Соответственно, будет + * пропорционально меньше branch-страниц. Поэтому будет выше + * вероятность оседания/не-вымывания страниц основного дерева из + * LRU-кэша, а также попадания в write-back кэш при записи. + * - Наоборот, при склонности к использованию под-страниц, будут + * наблюдаться обратные эффекты. Плюс некоторые накладные расходы + * на лишнее копирование данных под-страниц в сценариях + * нескольких обновлений дубликатов одного куста в одной + * транзакции.
+ * + * Суммарно наиболее рациональным представляется такая тактика: + * - Вводим три порога subpage_limit, subpage_room_threshold + * и subpage_reserve_prereq, которые могут быть + * заданы/скорректированы пользователем в ‰ от me_leaf_nodemax; + * - Используем под-страницу пока её размер меньше subpage_limit + * и на основной/гнездовой странице не-менее + * subpage_room_threshold свободного места; + * - Резервируем место только для 1-3 коротких dupfixed-элементов, + * расширяя размер под-страницы на размер кэш-линии ЦПУ, но + * только если на странице не менее subpage_reserve_prereq + * свободного места. + * - По-умолчанию устанавливаем: + * subpage_limit = me_leaf_nodemax (1000‰); + * subpage_room_threshold = 0; + * subpage_reserve_prereq = me_leaf_nodemax (1000‰). + */ + if (IS_LEAF2(fp)) + growth += leaf2_reserve( + env, page_room(mc->mc_pg[mc->mc_top]) + old_data.iov_len, + xdata.iov_len, data->iov_len); + break; + case MDBX_CURRENT | MDBX_NODUPDATA: case MDBX_CURRENT: + continue_subpage: fp->mp_txnid = mc->mc_txn->mt_front; fp->mp_pgno = mp->mp_pgno; mc->mc_xcursor->mx_cursor.mc_pg[0] = fp; flags |= F_DUPDATA; - goto put_sub; + goto dupsort_put; } - xdata.iov_len = olddata.iov_len + offset; + xdata.iov_len = old_data.iov_len + growth; + cASSERT(mc, (xdata.iov_len & 1) == 0); } fp_flags = fp->mp_flags; - if (node_size_len(node_ks(node), xdata.iov_len) > - env->me_leaf_nodemax) { + if (xdata.iov_len > env->me_subpage_limit || + node_size_len(node_ks(node), xdata.iov_len) > + env->me_leaf_nodemax || + (env->me_subpage_room_threshold && + page_room(mc->mc_pg[mc->mc_top]) + + node_size_len(node_ks(node), old_data.iov_len) < + env->me_subpage_room_threshold + + node_size_len(node_ks(node), xdata.iov_len))) { /* Too big for a sub-page, convert to sub-DB */ + convert_to_subtree: fp_flags &= ~P_SUBP; - prep_subDB: nested_dupdb.md_xsize = 0; nested_dupdb.md_flags = flags_db2sub(mc->mc_db->md_flags); if (mc->mc_db->md_flags & MDBX_DUPFIXED) { @@ -21745,8 +21875,9 @@ 
static __hot int cursor_put_nochecklen(MDBX_cursor *mc, const MDBX_val *key, if (unlikely(par.err != MDBX_SUCCESS)) return par.err; mc->mc_db->md_leaf_pages += 1; - cASSERT(mc, env->me_psize > olddata.iov_len); - offset = env->me_psize - (unsigned)olddata.iov_len; + cASSERT(mc, env->me_psize > old_data.iov_len); + growth = env->me_psize - (unsigned)old_data.iov_len; + cASSERT(mc, (growth & 1) == 0); flags |= F_DUPDATA | F_SUBDATA; nested_dupdb.md_root = mp->mp_pgno; nested_dupdb.md_seq = 0; @@ -21758,29 +21889,33 @@ static __hot int cursor_put_nochecklen(MDBX_cursor *mc, const MDBX_val *key, mp->mp_txnid = mc->mc_txn->mt_front; mp->mp_leaf2_ksize = fp->mp_leaf2_ksize; mp->mp_lower = fp->mp_lower; - cASSERT(mc, fp->mp_upper + offset <= UINT16_MAX); - mp->mp_upper = (indx_t)(fp->mp_upper + offset); + cASSERT(mc, fp->mp_upper + growth < UINT16_MAX); + mp->mp_upper = fp->mp_upper + (indx_t)growth; if (unlikely(fp_flags & P_LEAF2)) { memcpy(page_data(mp), page_data(fp), page_numkeys(fp) * fp->mp_leaf2_ksize); + cASSERT(mc, + (((mp->mp_leaf2_ksize & page_numkeys(mp)) ^ mp->mp_upper) & + 1) == 0); } else { + cASSERT(mc, (mp->mp_upper & 1) == 0); memcpy(ptr_disp(mp, mp->mp_upper + PAGEHDRSZ), ptr_disp(fp, fp->mp_upper + PAGEHDRSZ), - olddata.iov_len - fp->mp_upper - PAGEHDRSZ); + old_data.iov_len - fp->mp_upper - PAGEHDRSZ); memcpy(mp->mp_ptrs, fp->mp_ptrs, page_numkeys(fp) * sizeof(mp->mp_ptrs[0])); - for (i = 0; i < page_numkeys(fp); i++) { - cASSERT(mc, mp->mp_ptrs[i] + offset <= UINT16_MAX); - mp->mp_ptrs[i] += (indx_t)offset; + for (size_t i = 0; i < page_numkeys(fp); i++) { + cASSERT(mc, mp->mp_ptrs[i] + growth <= UINT16_MAX); + mp->mp_ptrs[i] += (indx_t)growth; } } } if (!insert_key) node_del(mc, 0); - rdata = &xdata; + ref_data = &xdata; flags |= F_DUPDATA; - goto new_sub; + goto insert_node; } /* MDBX passes F_SUBDATA in 'flags' to write a DB record */ @@ -21788,15 +21923,15 @@ static __hot int cursor_put_nochecklen(MDBX_cursor *mc, const MDBX_val *key, return 
MDBX_INCOMPATIBLE; current: - if (data->iov_len == olddata.iov_len) { + if (data->iov_len == old_data.iov_len) { cASSERT(mc, EVEN(key->iov_len) == EVEN(node_ks(node))); /* same size, just replace it. Note that we could * also reuse this node if the new data is smaller, * but instead we opt to shrink the node in that case. */ if (flags & MDBX_RESERVE) - data->iov_base = olddata.iov_base; + data->iov_base = old_data.iov_base; else if (!(mc->mc_flags & C_SUB)) - memcpy(olddata.iov_base, data->iov_base, data->iov_len); + memcpy(old_data.iov_base, data->iov_base, data->iov_len); else { cASSERT(mc, page_numkeys(mc->mc_pg[mc->mc_top]) == 1); cASSERT(mc, PAGETYPE_COMPAT(mc->mc_pg[mc->mc_top]) == P_LEAF); @@ -21821,14 +21956,15 @@ static __hot int cursor_put_nochecklen(MDBX_cursor *mc, const MDBX_val *key, node_del(mc, 0); } - rdata = data; + ref_data = data; -new_sub:; +insert_node:; const unsigned naf = flags & NODE_ADD_FLAGS; - size_t nsize = IS_LEAF2(mc->mc_pg[mc->mc_top]) ? key->iov_len - : leaf_size(env, key, rdata); + size_t nsize = IS_LEAF2(mc->mc_pg[mc->mc_top]) + ? key->iov_len + : leaf_size(env, key, ref_data); if (page_room(mc->mc_pg[mc->mc_top]) < nsize) { - rc = page_split(mc, key, rdata, P_INVALID, + rc = page_split(mc, key, ref_data, P_INVALID, insert_key ? naf : naf | MDBX_SPLIT_REPLACE); if (rc == MDBX_SUCCESS && AUDIT_ENABLED()) rc = insert_key ? cursor_check(mc) : cursor_check_updating(mc); @@ -21836,25 +21972,25 @@ new_sub:; /* There is room already in this leaf page. 
*/ if (IS_LEAF2(mc->mc_pg[mc->mc_top])) { cASSERT(mc, !(naf & (F_BIGDATA | F_SUBDATA | F_DUPDATA)) && - rdata->iov_len == 0); + ref_data->iov_len == 0); rc = node_add_leaf2(mc, mc->mc_ki[mc->mc_top], key); } else - rc = node_add_leaf(mc, mc->mc_ki[mc->mc_top], key, rdata, naf); + rc = node_add_leaf(mc, mc->mc_ki[mc->mc_top], key, ref_data, naf); if (likely(rc == 0)) { /* Adjust other cursors pointing to mp */ const MDBX_dbi dbi = mc->mc_dbi; - const size_t i = mc->mc_top; - MDBX_page *const mp = mc->mc_pg[i]; + const size_t top = mc->mc_top; + MDBX_page *const mp = mc->mc_pg[top]; for (MDBX_cursor *m2 = mc->mc_txn->mt_cursors[dbi]; m2; m2 = m2->mc_next) { MDBX_cursor *m3 = (mc->mc_flags & C_SUB) ? &m2->mc_xcursor->mx_cursor : m2; - if (m3 == mc || m3->mc_snum < mc->mc_snum || m3->mc_pg[i] != mp) + if (m3 == mc || m3->mc_snum < mc->mc_snum || m3->mc_pg[top] != mp) continue; - if (m3->mc_ki[i] >= mc->mc_ki[i]) - m3->mc_ki[i] += insert_key; + if (m3->mc_ki[top] >= mc->mc_ki[top]) + m3->mc_ki[top] += insert_key; if (XCURSOR_INITED(m3)) - XCURSOR_REFRESH(m3, mp, m3->mc_ki[i]); + XCURSOR_REFRESH(m3, mp, m3->mc_ki[top]); } } } @@ -21865,18 +22001,18 @@ new_sub:; * size limits on dupdata. The actual data fields of the child * DB are all zero size. 
*/ if (flags & F_DUPDATA) { - unsigned xflags; - size_t ecount; - put_sub: - xdata.iov_len = 0; - xdata.iov_base = nullptr; + MDBX_val empty; + dupsort_put: + empty.iov_len = 0; + empty.iov_base = nullptr; MDBX_node *node = page_node(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); #define SHIFT_MDBX_NODUPDATA_TO_MDBX_NOOVERWRITE 1 STATIC_ASSERT( (MDBX_NODUPDATA >> SHIFT_MDBX_NODUPDATA_TO_MDBX_NOOVERWRITE) == MDBX_NOOVERWRITE); - xflags = MDBX_CURRENT | ((flags & MDBX_NODUPDATA) >> - SHIFT_MDBX_NODUPDATA_TO_MDBX_NOOVERWRITE); + unsigned xflags = + MDBX_CURRENT | ((flags & MDBX_NODUPDATA) >> + SHIFT_MDBX_NODUPDATA_TO_MDBX_NOOVERWRITE); if ((flags & MDBX_CURRENT) == 0) { xflags -= MDBX_CURRENT; err = cursor_xinit1(mc, node, mc->mc_pg[mc->mc_top]); @@ -21886,80 +22022,78 @@ new_sub:; if (sub_root) mc->mc_xcursor->mx_cursor.mc_pg[0] = sub_root; /* converted, write the original data first */ - if (dkey.iov_base) { - rc = cursor_put_nochecklen(&mc->mc_xcursor->mx_cursor, &dkey, &xdata, - xflags); + if (old_singledup.iov_base) { + rc = cursor_put_nochecklen(&mc->mc_xcursor->mx_cursor, &old_singledup, + &empty, xflags); if (unlikely(rc)) - goto bad_sub; + goto dupsort_error; } if (!(node_flags(node) & F_SUBDATA) || sub_root) { /* Adjust other cursors pointing to mp */ - MDBX_cursor *m2; - MDBX_xcursor *mx = mc->mc_xcursor; - size_t i = mc->mc_top; - MDBX_page *mp = mc->mc_pg[i]; + MDBX_xcursor *const mx = mc->mc_xcursor; + const size_t top = mc->mc_top; + MDBX_page *const mp = mc->mc_pg[top]; const intptr_t nkeys = page_numkeys(mp); - for (m2 = mc->mc_txn->mt_cursors[mc->mc_dbi]; m2; m2 = m2->mc_next) { + for (MDBX_cursor *m2 = mc->mc_txn->mt_cursors[mc->mc_dbi]; m2; + m2 = m2->mc_next) { if (m2 == mc || m2->mc_snum < mc->mc_snum) continue; if (!(m2->mc_flags & C_INITIALIZED)) continue; - if (m2->mc_pg[i] == mp) { - if (m2->mc_ki[i] == mc->mc_ki[i]) { - err = cursor_xinit2(m2, mx, dkey.iov_base != nullptr); + if (m2->mc_pg[top] == mp) { + if (m2->mc_ki[top] == 
mc->mc_ki[top]) { + err = cursor_xinit2(m2, mx, old_singledup.iov_base != nullptr); if (unlikely(err != MDBX_SUCCESS)) return err; - } else if (!insert_key && m2->mc_ki[i] < nkeys) { - XCURSOR_REFRESH(m2, mp, m2->mc_ki[i]); + } else if (!insert_key && m2->mc_ki[top] < nkeys) { + XCURSOR_REFRESH(m2, mp, m2->mc_ki[top]); } } } } cASSERT(mc, mc->mc_xcursor->mx_db.md_entries < PTRDIFF_MAX); - ecount = (size_t)mc->mc_xcursor->mx_db.md_entries; + const size_t probe = (size_t)mc->mc_xcursor->mx_db.md_entries; #define SHIFT_MDBX_APPENDDUP_TO_MDBX_APPEND 1 STATIC_ASSERT((MDBX_APPENDDUP >> SHIFT_MDBX_APPENDDUP_TO_MDBX_APPEND) == MDBX_APPEND); xflags |= (flags & MDBX_APPENDDUP) >> SHIFT_MDBX_APPENDDUP_TO_MDBX_APPEND; - rc = cursor_put_nochecklen(&mc->mc_xcursor->mx_cursor, data, &xdata, + rc = cursor_put_nochecklen(&mc->mc_xcursor->mx_cursor, data, &empty, xflags); if (flags & F_SUBDATA) { void *db = node_data(node); mc->mc_xcursor->mx_db.md_mod_txnid = mc->mc_txn->mt_txnid; memcpy(db, &mc->mc_xcursor->mx_db, sizeof(MDBX_db)); } - insert_data = (ecount != (size_t)mc->mc_xcursor->mx_db.md_entries); + insert_data = (probe != (size_t)mc->mc_xcursor->mx_db.md_entries); } /* Increment count unless we just replaced an existing item. */ if (insert_data) mc->mc_db->md_entries++; if (insert_key) { - /* Invalidate txn if we created an empty sub-DB */ - if (unlikely(rc)) - goto bad_sub; + if (unlikely(rc != MDBX_SUCCESS)) + goto dupsort_error; /* If we succeeded and the key didn't exist before, * make sure the cursor is marked valid. 
*/ mc->mc_flags |= C_INITIALIZED; } - if (unlikely(flags & MDBX_MULTIPLE)) { - if (likely(rc == MDBX_SUCCESS)) { - continue_multiple: - mcount++; + if (likely(rc == MDBX_SUCCESS)) { + if (unlikely(batch_dupfixed_done)) { + batch_dupfixed_continue: /* let caller know how many succeeded, if any */ - data[1].iov_len = mcount; - if (mcount < dcount) { + if ((*batch_dupfixed_done += 1) < batch_dupfixed_given) { data[0].iov_base = ptr_disp(data[0].iov_base, data[0].iov_len); insert_key = insert_data = false; - dkey.iov_base = nullptr; + old_singledup.iov_base = nullptr; goto more; } } + if (AUDIT_ENABLED()) + rc = cursor_check(mc); } - if (rc == MDBX_SUCCESS && AUDIT_ENABLED()) - rc = cursor_check(mc); return rc; - bad_sub: + + dupsort_error: if (unlikely(rc == MDBX_KEYEXIST)) { /* should not happen, we deleted that item */ ERROR("Unexpected %i error while put to nested dupsort's hive", rc); @@ -22123,6 +22257,7 @@ static __hot int cursor_del(MDBX_cursor *mc, MDBX_put_flags_t flags) { return rc; MDBX_page *mp = mc->mc_pg[mc->mc_top]; + cASSERT(mc, IS_MODIFIABLE(mc->mc_txn, mp)); if (!MDBX_DISABLE_VALIDATION && unlikely(!CHECK_LEAF_TYPE(mc, mp))) { ERROR("unexpected leaf-page #%" PRIaPGNO " type 0x%x seen by cursor", mp->mp_pgno, mp->mp_flags); @@ -22141,7 +22276,7 @@ static __hot int cursor_del(MDBX_cursor *mc, MDBX_put_flags_t flags) { if (!(node_flags(node) & F_SUBDATA)) mc->mc_xcursor->mx_cursor.mc_pg[0] = node_data(node); rc = cursor_del(&mc->mc_xcursor->mx_cursor, 0); - if (unlikely(rc)) + if (unlikely(rc != MDBX_SUCCESS)) return rc; /* If sub-DB still has entries, we're done */ if (mc->mc_xcursor->mx_db.md_entries) { @@ -22150,11 +22285,10 @@ static __hot int cursor_del(MDBX_cursor *mc, MDBX_put_flags_t flags) { mc->mc_xcursor->mx_db.md_mod_txnid = mc->mc_txn->mt_txnid; memcpy(node_data(node), &mc->mc_xcursor->mx_db, sizeof(MDBX_db)); } else { - /* shrink fake page */ - node_shrink(mp, mc->mc_ki[mc->mc_top]); - node = page_node(mp, mc->mc_ki[mc->mc_top]); + /* 
shrink sub-page */ + node = node_shrink(mp, mc->mc_ki[mc->mc_top], node); mc->mc_xcursor->mx_cursor.mc_pg[0] = node_data(node); - /* fix other sub-DB cursors pointed at fake pages on this page */ + /* fix other sub-DB cursors pointed at sub-pages on this page */ for (MDBX_cursor *m2 = mc->mc_txn->mt_cursors[mc->mc_dbi]; m2; m2 = m2->mc_next) { if (m2 == mc || m2->mc_snum < mc->mc_snum) @@ -22381,6 +22515,7 @@ __hot static int __must_check_result node_add_leaf2(MDBX_cursor *mc, const size_t ksize = mc->mc_db->md_xsize; cASSERT(mc, ksize == key->iov_len); const size_t nkeys = page_numkeys(mp); + cASSERT(mc, (((ksize & page_numkeys(mp)) ^ mp->mp_upper) & 1) == 0); /* Just using these for counting */ const intptr_t lower = mp->mp_lower + sizeof(indx_t); @@ -22400,6 +22535,8 @@ __hot static int __must_check_result node_add_leaf2(MDBX_cursor *mc, memmove(ptr_disp(ptr, ksize), ptr, diff * ksize); /* insert new key */ memcpy(ptr, key->iov_base, ksize); + + cASSERT(mc, (((ksize & page_numkeys(mp)) ^ mp->mp_upper) & 1) == 0); return MDBX_SUCCESS; } @@ -22566,6 +22703,7 @@ __hot static void node_del(MDBX_cursor *mc, size_t ksize) { mp->mp_lower -= sizeof(indx_t); cASSERT(mc, (size_t)UINT16_MAX - mp->mp_upper >= ksize - sizeof(indx_t)); mp->mp_upper += (indx_t)(ksize - sizeof(indx_t)); + cASSERT(mc, (((ksize & page_numkeys(mp)) ^ mp->mp_upper) & 1) == 0); return; } @@ -22605,35 +22743,28 @@ __hot static void node_del(MDBX_cursor *mc, size_t ksize) { /* Compact the main page after deleting a node on a subpage. * [in] mp The main page to operate on. * [in] indx The index of the subpage on the main page. 
*/ -static void node_shrink(MDBX_page *mp, size_t indx) { - MDBX_node *node; - MDBX_page *sp, *xp; - size_t nsize, delta, len, ptr; - intptr_t i; - - node = page_node(mp, indx); - sp = (MDBX_page *)node_data(node); - delta = page_room(sp); - assert(delta > 0); +static MDBX_node *node_shrink(MDBX_page *mp, size_t indx, MDBX_node *node) { + assert(node = page_node(mp, indx)); + MDBX_page *sp = (MDBX_page *)node_data(node); + assert(IS_SUBP(sp) && page_numkeys(sp) > 0); + const size_t delta = + EVEN_FLOOR(page_room(sp) /* avoid the node uneven-sized */); + if (unlikely(delta) == 0) + return node; /* Prepare to shift upward, set len = length(subpage part to shift) */ - if (IS_LEAF2(sp)) { - delta &= /* do not make the node uneven-sized */ ~(size_t)1; - if (unlikely(delta) == 0) - return; - nsize = node_ds(node) - delta; - assert(nsize % 1 == 0); - len = nsize; - } else { - xp = ptr_disp(sp, delta); /* destination subpage */ - for (i = page_numkeys(sp); --i >= 0;) { + size_t nsize = node_ds(node) - delta, len = nsize; + assert(nsize % 1 == 0); + if (!IS_LEAF2(sp)) { + len = PAGEHDRSZ; + MDBX_page *xp = ptr_disp(sp, delta); /* destination subpage */ + for (intptr_t i = page_numkeys(sp); --i >= 0;) { assert(sp->mp_ptrs[i] >= delta); xp->mp_ptrs[i] = (indx_t)(sp->mp_ptrs[i] - delta); } - nsize = node_ds(node) - delta; - len = PAGEHDRSZ; } - sp->mp_upper = sp->mp_lower; + assert(sp->mp_upper >= sp->mp_lower + delta); + sp->mp_upper -= (indx_t)delta; sp->mp_pgno = mp->mp_pgno; node_set_ds(node, nsize); @@ -22641,15 +22772,17 @@ static void node_shrink(MDBX_page *mp, size_t indx) { void *const base = ptr_disp(mp, mp->mp_upper + PAGEHDRSZ); memmove(ptr_disp(base, delta), base, ptr_dist(sp, base) + len); - ptr = mp->mp_ptrs[indx]; - for (i = page_numkeys(mp); --i >= 0;) { - if (mp->mp_ptrs[i] <= ptr) { + const size_t pivot = mp->mp_ptrs[indx]; + for (intptr_t i = page_numkeys(mp); --i >= 0;) { + if (mp->mp_ptrs[i] <= pivot) { assert((size_t)UINT16_MAX - mp->mp_ptrs[i] >= 
delta); mp->mp_ptrs[i] += (indx_t)delta; } } assert((size_t)UINT16_MAX - mp->mp_upper >= delta); mp->mp_upper += (indx_t)delta; + + return ptr_disp(node, delta); } /* Initial setup of a sorted-dups cursor. @@ -23521,7 +23654,6 @@ static int page_merge(MDBX_cursor *csrc, MDBX_cursor *cdst) { IS_LEAF(cdst->mc_pg[cdst->mc_db->md_depth - 1])); cASSERT(csrc, csrc->mc_snum < csrc->mc_db->md_depth || IS_LEAF(csrc->mc_pg[csrc->mc_db->md_depth - 1])); - cASSERT(cdst, page_room(pdst) >= page_used(cdst->mc_txn->mt_env, psrc)); const int pagetype = PAGETYPE_WHOLE(psrc); /* Move all nodes from src to dst */ @@ -23532,7 +23664,9 @@ static int page_merge(MDBX_cursor *csrc, MDBX_cursor *cdst) { size_t j = dst_nkeys; if (unlikely(pagetype & P_LEAF2)) { /* Mark dst as dirty. */ - if (unlikely(rc = page_touch(cdst))) + rc = page_touch(cdst); + cASSERT(cdst, rc != MDBX_RESULT_TRUE); + if (unlikely(rc != MDBX_SUCCESS)) return rc; key.iov_len = csrc->mc_db->md_xsize; @@ -23540,6 +23674,7 @@ static int page_merge(MDBX_cursor *csrc, MDBX_cursor *cdst) { size_t i = 0; do { rc = node_add_leaf2(cdst, j++, &key); + cASSERT(cdst, rc != MDBX_RESULT_TRUE); if (unlikely(rc != MDBX_SUCCESS)) return rc; key.iov_base = ptr_disp(key.iov_base, key.iov_len); @@ -23553,7 +23688,8 @@ static int page_merge(MDBX_cursor *csrc, MDBX_cursor *cdst) { cursor_copy(csrc, &mn); /* must find the lowest key below src */ rc = page_search_lowest(&mn); - if (unlikely(rc)) + cASSERT(csrc, rc != MDBX_RESULT_TRUE); + if (unlikely(rc != MDBX_SUCCESS)) return rc; const MDBX_page *mp = mn.mc_pg[mn.mc_top]; @@ -23578,7 +23714,9 @@ static int page_merge(MDBX_cursor *csrc, MDBX_cursor *cdst) { } /* Mark dst as dirty. 
*/ - if (unlikely(rc = page_touch(cdst))) + rc = page_touch(cdst); + cASSERT(cdst, rc != MDBX_RESULT_TRUE); + if (unlikely(rc != MDBX_SUCCESS)) return rc; size_t i = 0; @@ -23592,6 +23730,7 @@ static int page_merge(MDBX_cursor *csrc, MDBX_cursor *cdst) { cASSERT(csrc, node_flags(srcnode) == 0); rc = node_add_branch(cdst, j++, &key, node_pgno(srcnode)); } + cASSERT(cdst, rc != MDBX_RESULT_TRUE); if (unlikely(rc != MDBX_SUCCESS)) return rc; @@ -23618,7 +23757,8 @@ static int page_merge(MDBX_cursor *csrc, MDBX_cursor *cdst) { if (csrc->mc_ki[csrc->mc_top] == 0) { const MDBX_val nullkey = {0, 0}; rc = update_key(csrc, &nullkey); - if (unlikely(rc)) { + cASSERT(csrc, rc != MDBX_RESULT_TRUE); + if (unlikely(rc != MDBX_SUCCESS)) { csrc->mc_top++; return rc; } @@ -23653,7 +23793,8 @@ static int page_merge(MDBX_cursor *csrc, MDBX_cursor *cdst) { } rc = page_retire(csrc, (MDBX_page *)psrc); - if (unlikely(rc)) + cASSERT(csrc, rc != MDBX_RESULT_TRUE); + if (unlikely(rc != MDBX_SUCCESS)) return rc; cASSERT(cdst, cdst->mc_db->md_entries > 0); @@ -23666,7 +23807,7 @@ static int page_merge(MDBX_cursor *csrc, MDBX_cursor *cdst) { const uint16_t save_depth = cdst->mc_db->md_depth; cursor_pop(cdst); rc = rebalance(cdst); - if (unlikely(rc)) + if (unlikely(rc != MDBX_SUCCESS)) return rc; cASSERT(cdst, cdst->mc_db->md_entries > 0); @@ -23854,11 +23995,9 @@ static int rebalance(MDBX_cursor *mc) { mc->mc_snum = 0; mc->mc_top = 0; mc->mc_flags &= ~C_INITIALIZED; - - rc = page_retire(mc, mp); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; - } else if (IS_BRANCH(mp) && nkeys == 1) { + return page_retire(mc, mp); + } + if (IS_BRANCH(mp) && nkeys == 1) { DEBUG("%s", "collapsing root page!"); mc->mc_db->md_root = node_pgno(page_node(mp, 0)); rc = page_get(mc, mc->mc_db->md_root, &mc->mc_pg[0], mp->mp_txnid); @@ -23891,15 +24030,10 @@ static int rebalance(MDBX_cursor *mc) { PAGETYPE_WHOLE(mc->mc_pg[mc->mc_top]) == pagetype); cASSERT(mc, mc->mc_snum < mc->mc_db->md_depth || 
IS_LEAF(mc->mc_pg[mc->mc_db->md_depth - 1])); - - rc = page_retire(mc, mp); - if (likely(rc == MDBX_SUCCESS)) - rc = page_touch(mc); - return rc; - } else { - DEBUG("root page %" PRIaPGNO " doesn't need rebalancing (flags 0x%x)", - mp->mp_pgno, mp->mp_flags); + return page_retire(mc, mp); } + DEBUG("root page %" PRIaPGNO " doesn't need rebalancing (flags 0x%x)", + mp->mp_pgno, mp->mp_flags); return MDBX_SUCCESS; } @@ -23948,6 +24082,7 @@ static int rebalance(MDBX_cursor *mc) { const size_t right_nkeys = right ? page_numkeys(right) : 0; bool involve = false; retry: + cASSERT(mc, mc->mc_snum > 1); if (left_room > room_threshold && left_room >= right_room && (IS_MODIFIABLE(mc->mc_txn, left) || involve)) { /* try merge with left */ @@ -24019,7 +24154,18 @@ static int rebalance(MDBX_cursor *mc) { return MDBX_SUCCESS; } - if (likely(!involve)) { + /* Заглушено в ветке v0.12.x, будет работать в v0.13.1 и далее. + * + * if (mc->mc_txn->mt_env->me_options.prefer_waf_insteadof_balance && + * likely(room_threshold > 0)) { + * room_threshold = 0; + * goto retry; + * } + */ + if (likely(!involve) && + (likely(mc->mc_dbi != FREE_DBI) || mc->mc_txn->tw.loose_pages || + MDBX_PNL_GETSIZE(mc->mc_txn->tw.relist) || (mc->mc_flags & C_GCU) || + (mc->mc_txn->mt_flags & MDBX_TXN_DRAINED_GC) || room_threshold)) { involve = true; goto retry; } @@ -24109,8 +24255,7 @@ __cold static int page_check(const MDBX_cursor *const mc, break; } - if (unlikely(mp->mp_upper < mp->mp_lower || - ((mp->mp_lower | mp->mp_upper) & 1) || + if (unlikely(mp->mp_upper < mp->mp_lower || (mp->mp_lower & 1) || PAGEHDRSZ + mp->mp_upper > env->me_psize)) rc = bad_page(mp, "invalid page lower(%u)/upper(%u) with limit %zu\n", mp->mp_lower, mp->mp_upper, page_space(env)); @@ -24126,11 +24271,6 @@ __cold static int page_check(const MDBX_cursor *const mc, bad_page(mp, "%s-page nkeys (%zu) < %u\n", IS_BRANCH(mp) ? 
"branch" : "leaf", nkeys, 1 + IS_BRANCH(mp)); } - if (!IS_LEAF2(mp) && unlikely(PAGEHDRSZ + mp->mp_upper + - nkeys * sizeof(MDBX_node) + nkeys - 1 > - env->me_psize)) - rc = bad_page(mp, "invalid page upper (%u) for nkeys %zu with limit %zu\n", - mp->mp_upper, nkeys, page_space(env)); const size_t ksize_max = keysize_max(env->me_psize, 0); const size_t leaf2_ksize = mp->mp_leaf2_ksize; @@ -24139,8 +24279,20 @@ __cold static int page_check(const MDBX_cursor *const mc, (mc->mc_db->md_flags & MDBX_DUPFIXED) == 0)) rc = bad_page(mp, "unexpected leaf2-page (db-flags 0x%x)\n", mc->mc_db->md_flags); - if (unlikely(leaf2_ksize < 1 || leaf2_ksize > ksize_max)) - rc = bad_page(mp, "invalid leaf2-key length (%zu)\n", leaf2_ksize); + else if (unlikely(leaf2_ksize != mc->mc_db->md_xsize)) + rc = bad_page(mp, "invalid leaf2_ksize %zu\n", leaf2_ksize); + else if (unlikely(((leaf2_ksize & nkeys) ^ mp->mp_upper) & 1)) + rc = bad_page( + mp, "invalid page upper (%u) for nkeys %zu with leaf2-length %zu\n", + mp->mp_upper, nkeys, leaf2_ksize); + } else { + if (unlikely((mp->mp_upper & 1) || PAGEHDRSZ + mp->mp_upper + + nkeys * sizeof(MDBX_node) + + nkeys - 1 > + env->me_psize)) + rc = + bad_page(mp, "invalid page upper (%u) for nkeys %zu with limit %zu\n", + mp->mp_upper, nkeys, page_space(env)); } MDBX_val here, prev = {0, 0}; @@ -24148,7 +24300,7 @@ __cold static int page_check(const MDBX_cursor *const mc, if (IS_LEAF2(mp)) { const char *const key = page_leaf2key(mp, i, leaf2_ksize); if (unlikely(end_of_page < key + leaf2_ksize)) { - rc = bad_page(mp, "leaf2-key beyond (%zu) page-end\n", + rc = bad_page(mp, "leaf2-item beyond (%zu) page-end\n", key + leaf2_ksize - end_of_page); continue; } @@ -24157,7 +24309,7 @@ __cold static int page_check(const MDBX_cursor *const mc, if (unlikely(leaf2_ksize < mc->mc_dbx->md_klen_min || leaf2_ksize > mc->mc_dbx->md_klen_max)) rc = bad_page( - mp, "leaf2-key size (%zu) <> min/max key-length (%zu/%zu)\n", + mp, "leaf2-item size (%zu) <> min/max 
length (%zu/%zu)\n", leaf2_ksize, mc->mc_dbx->md_klen_min, mc->mc_dbx->md_klen_max); else mc->mc_dbx->md_klen_min = mc->mc_dbx->md_klen_max = leaf2_ksize; @@ -24166,7 +24318,7 @@ __cold static int page_check(const MDBX_cursor *const mc, here.iov_base = (void *)key; here.iov_len = leaf2_ksize; if (prev.iov_base && unlikely(mc->mc_dbx->md_cmp(&prev, &here) >= 0)) - rc = bad_page(mp, "leaf2-key #%zu wrong order (%s >= %s)\n", i, + rc = bad_page(mp, "leaf2-item #%zu wrong order (%s >= %s)\n", i, DKEY(&prev), DVAL(&here)); prev = here; } @@ -24577,6 +24729,8 @@ static int page_split(MDBX_cursor *mc, const MDBX_val *const newkey, DKBUF; MDBX_page *const mp = mc->mc_pg[mc->mc_top]; + cASSERT(mc, (mp->mp_flags & P_ILL_BITS) == 0); + const size_t newindx = mc->mc_ki[mc->mc_top]; size_t nkeys = page_numkeys(mp); if (AUDIT_ENABLED()) { @@ -24668,7 +24822,7 @@ static int page_split(MDBX_cursor *mc, const MDBX_val *const newkey, /* It is reasonable and possible to split the page at the begin */ if (unlikely(newindx < minkeys)) { split_indx = minkeys; - if (newindx == 0 && foliage == 0 && !(naf & MDBX_SPLIT_REPLACE)) { + if (newindx == 0 && !(naf & MDBX_SPLIT_REPLACE)) { split_indx = 0; /* Checking for ability of splitting by the left-side insertion * of a pure page with the new key */ @@ -24688,10 +24842,19 @@ static int page_split(MDBX_cursor *mc, const MDBX_val *const newkey, } else get_key(page_node(mp, 0), &sepkey); cASSERT(mc, mc->mc_dbx->md_cmp(newkey, &sepkey) < 0); - /* Avoiding rare complex cases of split the parent page */ - if (page_room(mn.mc_pg[ptop]) < branch_size(env, &sepkey)) + /* Avoiding rare complex cases of nested split the parent page(s) */ + if (page_room(mc->mc_pg[ptop]) < branch_size(env, &sepkey)) split_indx = minkeys; } + if (foliage) { + TRACE("pure-left: foliage %u, top %i, ptop %zu, split_indx %zi, " + "minkeys %zi, sepkey %s, parent-room %zu, need4split %zu", + foliage, mc->mc_top, ptop, split_indx, minkeys, + DKEY_DEBUG(&sepkey), 
page_room(mc->mc_pg[ptop]), + branch_size(env, &sepkey)); + TRACE("pure-left: newkey %s, newdata %s, newindx %zu", + DKEY_DEBUG(newkey), DVAL_DEBUG(newdata), newindx); + } } } @@ -24704,9 +24867,10 @@ static int page_split(MDBX_cursor *mc, const MDBX_val *const newkey, sepkey = *newkey; } else if (unlikely(pure_left)) { /* newindx == split_indx == 0 */ - TRACE("no-split, but add new pure page at the %s", "left/before"); + TRACE("pure-left: no-split, but add new pure page at the %s", + "left/before"); cASSERT(mc, newindx == 0 && split_indx == 0 && minkeys == 1); - TRACE("old-first-key is %s", DKEY_DEBUG(&sepkey)); + TRACE("pure-left: old-first-key is %s", DKEY_DEBUG(&sepkey)); } else { if (IS_LEAF2(sister)) { /* Move half of the keys to the right sibling */ @@ -24737,6 +24901,7 @@ static int page_split(MDBX_cursor *mc, const MDBX_val *const newkey, mp->mp_lower += sizeof(indx_t); cASSERT(mc, mp->mp_upper >= ksize - sizeof(indx_t)); mp->mp_upper -= (indx_t)(ksize - sizeof(indx_t)); + cASSERT(mc, (((ksize & page_numkeys(mp)) ^ mp->mp_upper) & 1) == 0); } else { memcpy(sister->mp_ptrs, split, distance * ksize); void *const ins = page_leaf2key(sister, distance, ksize); @@ -24749,6 +24914,8 @@ static int page_split(MDBX_cursor *mc, const MDBX_val *const newkey, sister->mp_upper -= (indx_t)(ksize - sizeof(indx_t)); cASSERT(mc, distance <= (int)UINT16_MAX); mc->mc_ki[mc->mc_top] = (indx_t)distance; + cASSERT(mc, + (((ksize & page_numkeys(sister)) ^ sister->mp_upper) & 1) == 0); } if (AUDIT_ENABLED()) { @@ -24917,18 +25084,20 @@ static int page_split(MDBX_cursor *mc, const MDBX_val *const newkey, } } else if (unlikely(pure_left)) { MDBX_page *ptop_page = mc->mc_pg[ptop]; - DEBUG("adding to parent page %u node[%u] left-leaf page #%u key %s", + TRACE("pure-left: adding to parent page %u node[%u] left-leaf page #%u key " + "%s", ptop_page->mp_pgno, mc->mc_ki[ptop], sister->mp_pgno, DKEY(mc->mc_ki[ptop] ? 
newkey : NULL)); - mc->mc_top--; + assert(mc->mc_top == ptop + 1); + mc->mc_top = (uint8_t)ptop; rc = node_add_branch(mc, mc->mc_ki[ptop], mc->mc_ki[ptop] ? newkey : NULL, sister->mp_pgno); cASSERT(mc, mp == mc->mc_pg[ptop + 1] && newindx == mc->mc_ki[ptop + 1] && ptop == mc->mc_top); if (likely(rc == MDBX_SUCCESS) && mc->mc_ki[ptop] == 0) { - DEBUG("update prev-first key on parent %s", DKEY(&sepkey)); MDBX_node *node = page_node(mc->mc_pg[ptop], 1); + TRACE("pure-left: update prev-first key on parent to %s", DKEY(&sepkey)); cASSERT(mc, node_ks(node) == 0 && node_pgno(node) == mp->mp_pgno); cASSERT(mc, mc->mc_top == ptop && mc->mc_ki[ptop] == 0); mc->mc_ki[ptop] = 1; @@ -24936,6 +25105,9 @@ static int page_split(MDBX_cursor *mc, const MDBX_val *const newkey, cASSERT(mc, mc->mc_top == ptop && mc->mc_ki[ptop] == 1); cASSERT(mc, mp == mc->mc_pg[ptop + 1] && newindx == mc->mc_ki[ptop + 1]); mc->mc_ki[ptop] = 0; + } else { + TRACE("pure-left: no-need-update prev-first key on parent %s", + DKEY(&sepkey)); } mc->mc_top++; @@ -24984,7 +25156,7 @@ static int page_split(MDBX_cursor *mc, const MDBX_val *const newkey, &sepkey); if (mc->mc_dbx->md_cmp(newkey, &sepkey) < 0) { mc->mc_top -= (uint8_t)i; - DEBUG("update new-first on parent [%i] page %u key %s", + DEBUG("pure-left: update new-first on parent [%i] page %u key %s", mc->mc_ki[mc->mc_top], mc->mc_pg[mc->mc_top]->mp_pgno, DKEY(newkey)); rc = update_key(mc, newkey); @@ -24995,7 +25167,7 @@ static int page_split(MDBX_cursor *mc, const MDBX_val *const newkey, break; } } - } else if (tmp_ki_copy /* !IS_LEAF2(mp) */) { + } else if (tmp_ki_copy) { /* !IS_LEAF2(mp) */ /* Move nodes */ mc->mc_pg[mc->mc_top] = sister; i = split_indx; @@ -25114,7 +25286,7 @@ static int page_split(MDBX_cursor *mc, const MDBX_val *const newkey, m3->mc_ki[k + 1] = m3->mc_ki[k]; m3->mc_pg[k + 1] = m3->mc_pg[k]; } - m3->mc_ki[0] = m3->mc_ki[0] >= nkeys; + m3->mc_ki[0] = m3->mc_ki[0] >= nkeys + pure_left; m3->mc_pg[0] = mc->mc_pg[0]; m3->mc_snum++; 
m3->mc_top++; @@ -27566,8 +27738,7 @@ __cold static int walk_tree(mdbx_walk_ctx_t *ctx, const pgno_t pgno, (mp ? page_room(mp) : pagesize - header_size) - payload_size; size_t align_bytes = 0; - for (size_t i = 0; err == MDBX_SUCCESS && i < nentries; - align_bytes += ((payload_size + align_bytes) & 1), ++i) { + for (size_t i = 0; err == MDBX_SUCCESS && i < nentries; ++i) { if (type == MDBX_page_dupfixed_leaf) { /* LEAF2 pages have no mp_ptrs[] or node headers */ payload_size += mp->mp_leaf2_ksize; @@ -27575,23 +27746,26 @@ __cold static int walk_tree(mdbx_walk_ctx_t *ctx, const pgno_t pgno, } MDBX_node *node = page_node(mp, i); - payload_size += NODESIZE + node_ks(node); + const size_t node_key_size = node_ks(node); + payload_size += NODESIZE + node_key_size; if (type == MDBX_page_branch) { assert(i > 0 || node_ks(node) == 0); + align_bytes += node_key_size & 1; continue; } + const size_t node_data_size = node_ds(node); assert(type == MDBX_page_leaf); switch (node_flags(node)) { case 0 /* usual node */: - payload_size += node_ds(node); + payload_size += node_data_size; + align_bytes += (node_key_size + node_data_size) & 1; break; case F_BIGDATA /* long data on the large/overflow page */: { - payload_size += sizeof(pgno_t); const pgno_t large_pgno = node_largedata_pgno(node); - const size_t over_payload = node_ds(node); + const size_t over_payload = node_data_size; const size_t over_header = PAGEHDRSZ; npages = 1; @@ -27610,27 +27784,31 @@ __cold static int walk_tree(mdbx_walk_ctx_t *ctx, const pgno_t pgno, over_payload, over_header, over_unused); if (unlikely(rc != MDBX_SUCCESS)) return (rc == MDBX_RESULT_TRUE) ? 
MDBX_SUCCESS : rc; + payload_size += sizeof(pgno_t); + align_bytes += node_key_size & 1; } break; case F_SUBDATA /* sub-db */: { - const size_t namelen = node_ks(node); - payload_size += node_ds(node); - if (unlikely(namelen == 0 || node_ds(node) != sizeof(MDBX_db))) { + const size_t namelen = node_key_size; + if (unlikely(namelen == 0 || node_data_size != sizeof(MDBX_db))) { assert(err == MDBX_CORRUPTED); err = MDBX_CORRUPTED; } + header_size += node_data_size; + align_bytes += (node_key_size + node_data_size) & 1; } break; case F_SUBDATA | F_DUPDATA /* dupsorted sub-tree */: - payload_size += sizeof(MDBX_db); - if (unlikely(node_ds(node) != sizeof(MDBX_db))) { + if (unlikely(node_data_size != sizeof(MDBX_db))) { assert(err == MDBX_CORRUPTED); err = MDBX_CORRUPTED; } + header_size += node_data_size; + align_bytes += (node_key_size + node_data_size) & 1; break; case F_DUPDATA /* short sub-page */: { - if (unlikely(node_ds(node) <= PAGEHDRSZ)) { + if (unlikely(node_data_size <= PAGEHDRSZ || (node_data_size & 1))) { assert(err == MDBX_CORRUPTED); err = MDBX_CORRUPTED; break; @@ -27658,16 +27836,17 @@ __cold static int walk_tree(mdbx_walk_ctx_t *ctx, const pgno_t pgno, err = MDBX_CORRUPTED; } - for (size_t j = 0; err == MDBX_SUCCESS && j < nsubkeys; - subalign_bytes += ((subpayload_size + subalign_bytes) & 1), ++j) { - + for (size_t j = 0; err == MDBX_SUCCESS && j < nsubkeys; ++j) { if (subtype == MDBX_subpage_dupfixed_leaf) { /* LEAF2 pages have no mp_ptrs[] or node headers */ subpayload_size += sp->mp_leaf2_ksize; } else { assert(subtype == MDBX_subpage_leaf); - MDBX_node *subnode = page_node(sp, j); - subpayload_size += NODESIZE + node_ks(subnode) + node_ds(subnode); + const MDBX_node *subnode = page_node(sp, j); + const size_t subnode_size = node_ks(subnode) + node_ds(subnode); + subheader_size += NODESIZE; + subpayload_size += subnode_size; + subalign_bytes += subnode_size & 1; if (unlikely(node_flags(subnode) != 0)) { assert(err == MDBX_CORRUPTED); err = 
MDBX_CORRUPTED; @@ -27676,7 +27855,7 @@ __cold static int walk_tree(mdbx_walk_ctx_t *ctx, const pgno_t pgno, } const int rc = - ctx->mw_visitor(pgno, 0, ctx->mw_user, deep + 1, name, node_ds(node), + ctx->mw_visitor(pgno, 0, ctx->mw_user, deep + 1, name, node_data_size, subtype, err, nsubkeys, subpayload_size, subheader_size, subunused_size + subalign_bytes); if (unlikely(rc != MDBX_SUCCESS)) @@ -27684,7 +27863,7 @@ __cold static int walk_tree(mdbx_walk_ctx_t *ctx, const pgno_t pgno, header_size += subheader_size; unused_size += subunused_size; payload_size += subpayload_size; - align_bytes += subalign_bytes; + align_bytes += subalign_bytes + (node_key_size & 1); } break; default: @@ -32401,7 +32580,7 @@ retry_mapview:; ptr_disp(map->base, size), ((map->current < map->limit) ? map->current : map->limit) - size); } - map->current = size; + map->current = (size < map->limit) ? size : map->limit; } if (limit == map->limit) @@ -32562,6 +32741,7 @@ retry_mapview:; map->base = ptr; } map->limit = limit; + map->current = size; #if MDBX_ENABLE_MADVISE #ifdef MADV_DONTFORK @@ -33379,9 +33559,9 @@ __dll_export 0, 12, 9, - 0, - {"2023-12-11T23:24:05+03:00", "44ee35910be133a64a24525537f125bca0d5e037", "185e43f3a86b6d62482c933a1062a3e95c82b93c", - "v0.12.9-0-g185e43f3"}, + 16, + {"2024-03-06T22:58:31+03:00", "c5e6e3a4f75727b9e0039ad420ae167d3487d006", "fff3fbd866c50ee3c77b244a9b05f497e06a65e8", + "v0.12.9-16-gfff3fbd8"}, sourcery}; __dll_export diff --git a/mdbxdist/mdbx.c++ b/mdbxdist/mdbx.c++ index 0b6e7be..cd38971 100644 --- a/mdbxdist/mdbx.c++ +++ b/mdbxdist/mdbx.c++ @@ -12,7 +12,7 @@ * . 
*/ #define xMDBX_ALLOY 1 -#define MDBX_BUILD_SOURCERY a3bc805b6c34de756a896bd408af909f74b858903ba60515d7c218c0181e020b_v0_12_9_0_g185e43f3 +#define MDBX_BUILD_SOURCERY 0c86daff919dc09ab5e99c8e5bfb1d51e1367babca9c630c97af92a2aa988ea9_v0_12_9_16_gfff3fbd8 #ifdef MDBX_CONFIG_H #include MDBX_CONFIG_H #endif @@ -3628,8 +3628,12 @@ struct MDBX_env { struct MDBX_lockinfo *me_lck; unsigned me_psize; /* DB page size, initialized from me_os_psize */ - unsigned me_leaf_nodemax; /* max size of a leaf-node */ - unsigned me_branch_nodemax; /* max size of a branch-node */ + uint16_t me_leaf_nodemax; /* max size of a leaf-node */ + uint16_t me_branch_nodemax; /* max size of a branch-node */ + uint16_t me_subpage_limit; + uint16_t me_subpage_room_threshold; + uint16_t me_subpage_reserve_prereq; + uint16_t me_subpage_reserve_limit; atomic_pgno_t me_mlocked_pgno; uint8_t me_psize2log; /* log2 of DB page size */ int8_t me_stuck_meta; /* recovery-only: target meta page or less that zero */ diff --git a/mdbxdist/mdbx_chk.c b/mdbxdist/mdbx_chk.c index 57ba828..4c48027 100644 --- a/mdbxdist/mdbx_chk.c +++ b/mdbxdist/mdbx_chk.c @@ -34,7 +34,7 @@ * top-level directory of the distribution or, alternatively, at * . 
*/ -#define MDBX_BUILD_SOURCERY a3bc805b6c34de756a896bd408af909f74b858903ba60515d7c218c0181e020b_v0_12_9_0_g185e43f3 +#define MDBX_BUILD_SOURCERY 0c86daff919dc09ab5e99c8e5bfb1d51e1367babca9c630c97af92a2aa988ea9_v0_12_9_16_gfff3fbd8 #ifdef MDBX_CONFIG_H #include MDBX_CONFIG_H #endif @@ -3650,8 +3650,12 @@ struct MDBX_env { struct MDBX_lockinfo *me_lck; unsigned me_psize; /* DB page size, initialized from me_os_psize */ - unsigned me_leaf_nodemax; /* max size of a leaf-node */ - unsigned me_branch_nodemax; /* max size of a branch-node */ + uint16_t me_leaf_nodemax; /* max size of a leaf-node */ + uint16_t me_branch_nodemax; /* max size of a branch-node */ + uint16_t me_subpage_limit; + uint16_t me_subpage_room_threshold; + uint16_t me_subpage_reserve_prereq; + uint16_t me_subpage_reserve_limit; atomic_pgno_t me_mlocked_pgno; uint8_t me_psize2log; /* log2 of DB page size */ int8_t me_stuck_meta; /* recovery-only: target meta page or less that zero */ diff --git a/mdbxdist/mdbx_copy.c b/mdbxdist/mdbx_copy.c index 9aa0119..8af7231 100644 --- a/mdbxdist/mdbx_copy.c +++ b/mdbxdist/mdbx_copy.c @@ -34,7 +34,7 @@ * top-level directory of the distribution or, alternatively, at * . 
*/ -#define MDBX_BUILD_SOURCERY a3bc805b6c34de756a896bd408af909f74b858903ba60515d7c218c0181e020b_v0_12_9_0_g185e43f3 +#define MDBX_BUILD_SOURCERY 0c86daff919dc09ab5e99c8e5bfb1d51e1367babca9c630c97af92a2aa988ea9_v0_12_9_16_gfff3fbd8 #ifdef MDBX_CONFIG_H #include MDBX_CONFIG_H #endif @@ -3650,8 +3650,12 @@ struct MDBX_env { struct MDBX_lockinfo *me_lck; unsigned me_psize; /* DB page size, initialized from me_os_psize */ - unsigned me_leaf_nodemax; /* max size of a leaf-node */ - unsigned me_branch_nodemax; /* max size of a branch-node */ + uint16_t me_leaf_nodemax; /* max size of a leaf-node */ + uint16_t me_branch_nodemax; /* max size of a branch-node */ + uint16_t me_subpage_limit; + uint16_t me_subpage_room_threshold; + uint16_t me_subpage_reserve_prereq; + uint16_t me_subpage_reserve_limit; atomic_pgno_t me_mlocked_pgno; uint8_t me_psize2log; /* log2 of DB page size */ int8_t me_stuck_meta; /* recovery-only: target meta page or less that zero */ diff --git a/mdbxdist/mdbx_drop.c b/mdbxdist/mdbx_drop.c index 3340ec8..96cf201 100644 --- a/mdbxdist/mdbx_drop.c +++ b/mdbxdist/mdbx_drop.c @@ -36,7 +36,7 @@ * top-level directory of the distribution or, alternatively, at * . 
*/ -#define MDBX_BUILD_SOURCERY a3bc805b6c34de756a896bd408af909f74b858903ba60515d7c218c0181e020b_v0_12_9_0_g185e43f3 +#define MDBX_BUILD_SOURCERY 0c86daff919dc09ab5e99c8e5bfb1d51e1367babca9c630c97af92a2aa988ea9_v0_12_9_16_gfff3fbd8 #ifdef MDBX_CONFIG_H #include MDBX_CONFIG_H #endif @@ -3652,8 +3652,12 @@ struct MDBX_env { struct MDBX_lockinfo *me_lck; unsigned me_psize; /* DB page size, initialized from me_os_psize */ - unsigned me_leaf_nodemax; /* max size of a leaf-node */ - unsigned me_branch_nodemax; /* max size of a branch-node */ + uint16_t me_leaf_nodemax; /* max size of a leaf-node */ + uint16_t me_branch_nodemax; /* max size of a branch-node */ + uint16_t me_subpage_limit; + uint16_t me_subpage_room_threshold; + uint16_t me_subpage_reserve_prereq; + uint16_t me_subpage_reserve_limit; atomic_pgno_t me_mlocked_pgno; uint8_t me_psize2log; /* log2 of DB page size */ int8_t me_stuck_meta; /* recovery-only: target meta page or less that zero */ diff --git a/mdbxdist/mdbx_dump.c b/mdbxdist/mdbx_dump.c index bb4427b..588f588 100644 --- a/mdbxdist/mdbx_dump.c +++ b/mdbxdist/mdbx_dump.c @@ -34,7 +34,7 @@ * top-level directory of the distribution or, alternatively, at * . 
*/ -#define MDBX_BUILD_SOURCERY a3bc805b6c34de756a896bd408af909f74b858903ba60515d7c218c0181e020b_v0_12_9_0_g185e43f3 +#define MDBX_BUILD_SOURCERY 0c86daff919dc09ab5e99c8e5bfb1d51e1367babca9c630c97af92a2aa988ea9_v0_12_9_16_gfff3fbd8 #ifdef MDBX_CONFIG_H #include MDBX_CONFIG_H #endif @@ -3650,8 +3650,12 @@ struct MDBX_env { struct MDBX_lockinfo *me_lck; unsigned me_psize; /* DB page size, initialized from me_os_psize */ - unsigned me_leaf_nodemax; /* max size of a leaf-node */ - unsigned me_branch_nodemax; /* max size of a branch-node */ + uint16_t me_leaf_nodemax; /* max size of a leaf-node */ + uint16_t me_branch_nodemax; /* max size of a branch-node */ + uint16_t me_subpage_limit; + uint16_t me_subpage_room_threshold; + uint16_t me_subpage_reserve_prereq; + uint16_t me_subpage_reserve_limit; atomic_pgno_t me_mlocked_pgno; uint8_t me_psize2log; /* log2 of DB page size */ int8_t me_stuck_meta; /* recovery-only: target meta page or less that zero */ diff --git a/mdbxdist/mdbx_load.c b/mdbxdist/mdbx_load.c index 97ddc8c..b7eb75d 100644 --- a/mdbxdist/mdbx_load.c +++ b/mdbxdist/mdbx_load.c @@ -34,7 +34,7 @@ * top-level directory of the distribution or, alternatively, at * . 
*/ -#define MDBX_BUILD_SOURCERY a3bc805b6c34de756a896bd408af909f74b858903ba60515d7c218c0181e020b_v0_12_9_0_g185e43f3 +#define MDBX_BUILD_SOURCERY 0c86daff919dc09ab5e99c8e5bfb1d51e1367babca9c630c97af92a2aa988ea9_v0_12_9_16_gfff3fbd8 #ifdef MDBX_CONFIG_H #include MDBX_CONFIG_H #endif @@ -3650,8 +3650,12 @@ struct MDBX_env { struct MDBX_lockinfo *me_lck; unsigned me_psize; /* DB page size, initialized from me_os_psize */ - unsigned me_leaf_nodemax; /* max size of a leaf-node */ - unsigned me_branch_nodemax; /* max size of a branch-node */ + uint16_t me_leaf_nodemax; /* max size of a leaf-node */ + uint16_t me_branch_nodemax; /* max size of a branch-node */ + uint16_t me_subpage_limit; + uint16_t me_subpage_room_threshold; + uint16_t me_subpage_reserve_prereq; + uint16_t me_subpage_reserve_limit; atomic_pgno_t me_mlocked_pgno; uint8_t me_psize2log; /* log2 of DB page size */ int8_t me_stuck_meta; /* recovery-only: target meta page or less that zero */ diff --git a/mdbxdist/mdbx_stat.c b/mdbxdist/mdbx_stat.c index 894cec0..f844322 100644 --- a/mdbxdist/mdbx_stat.c +++ b/mdbxdist/mdbx_stat.c @@ -34,7 +34,7 @@ * top-level directory of the distribution or, alternatively, at * . 
*/ -#define MDBX_BUILD_SOURCERY a3bc805b6c34de756a896bd408af909f74b858903ba60515d7c218c0181e020b_v0_12_9_0_g185e43f3 +#define MDBX_BUILD_SOURCERY 0c86daff919dc09ab5e99c8e5bfb1d51e1367babca9c630c97af92a2aa988ea9_v0_12_9_16_gfff3fbd8 #ifdef MDBX_CONFIG_H #include MDBX_CONFIG_H #endif @@ -3650,8 +3650,12 @@ struct MDBX_env { struct MDBX_lockinfo *me_lck; unsigned me_psize; /* DB page size, initialized from me_os_psize */ - unsigned me_leaf_nodemax; /* max size of a leaf-node */ - unsigned me_branch_nodemax; /* max size of a branch-node */ + uint16_t me_leaf_nodemax; /* max size of a leaf-node */ + uint16_t me_branch_nodemax; /* max size of a branch-node */ + uint16_t me_subpage_limit; + uint16_t me_subpage_room_threshold; + uint16_t me_subpage_reserve_prereq; + uint16_t me_subpage_reserve_limit; atomic_pgno_t me_mlocked_pgno; uint8_t me_psize2log; /* log2 of DB page size */ int8_t me_stuck_meta; /* recovery-only: target meta page or less that zero */